[dpdk-dev] [PATCH 0/3] generic channel for multi-process communication

Discussion:

[dpdk-dev] [PATCH 0/3] generic channel for multi-process communication

Jianfeng Tan

2017-11-30 18:44:07 UTC

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and how-to;
Patch 2: add a synchronous way for those messages which need a response immediately.
Patch 3: Rework vfio to use this generic communication channel.

Jianfeng Tan (3):
eal: add channel for multi-process communication
eal: add synchronous multi-process communication
vfio: use the generic multi-process channel

lib/librte_eal/common/eal_common_proc.c | 546 +++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 18 +
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 71 ++++
lib/librte_eal/linuxapp/eal/eal.c | 23 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 139 ++-----
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 416 +++----------------
lib/librte_eal/rte_eal_version.map | 22 +
9 files changed, 785 insertions(+), 475 deletions(-)

--
2.7.4

Jianfeng Tan

2017-11-30 18:44:08 UTC

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.

It'll be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to respond immediately.

This patch proposes to create a communication channel, as a unix
socket connection, for the above requirements. Primary will listen on
the unix socket; secondary will connect to this socket to talk.

Three new APIs are added:

1. rte_eal_mp_action_register is used to register an action,
indexed by a string, if the calling component wants to
respond to the messages from the corresponding component in
its primary process or secondary processes.
2. rte_eal_mp_action_unregister is used to unregister the action
if the calling component does not want to respond to the messages.
3. rte_eal_mp_sendmsg is used to send a message.

Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/common/eal_common_proc.c | 497 ++++++++++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 18 ++
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 68 +++++
lib/librte_eal/linuxapp/eal/eal.c | 9 +
lib/librte_eal/rte_eal_version.map | 22 ++
6 files changed, 624 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 60526ca..5d0a095 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -33,8 +33,21 @@
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <limits.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include <rte_log.h>
#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_common.h>

+#include "eal_private.h"
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"

@@ -59,3 +72,487 @@ rte_eal_primary_proc_alive(const char *config_file_path)

return !!ret;
}
+
+struct action_entry {
+ TAILQ_ENTRY(action_entry) next; /**< Next attached action entry */
+
+#define MAX_ACTION_NAME_LEN 64
+ char action_name[MAX_ACTION_NAME_LEN];
+ rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+ TAILQ_HEAD_INITIALIZER(action_entry_list);
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+ int len = strlen(name);
+ struct action_entry *entry;
+
+ TAILQ_FOREACH(entry, &action_entry_list, next) {
+ if (strncmp(entry->action_name, name, len) == 0)
+ break;
+ }
+
+ return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+ struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+ if (entry == NULL)
+ return -ENOMEM;
+
+ if (find_action_entry_by_name(action_name) != NULL)
+ return -EEXIST;
+
+ strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+ entry->action = action;
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+ struct action_entry *entry = find_action_entry_by_name(name);
+
+ TAILQ_REMOVE(&action_entry_list, entry, next);
+ free(entry);
+}
+
+/* The maximum amount of fd for one recvmsg/sendmsg */
+#define SCM_MAX_FD 253
+#define MAX_SECONDARY_PROCS 8
+#define MAX_MESSAGE_LENGTH 1024
+
+struct mp_fds {
+ int efd;
+
+ union {
+ /* fds for primary process */
+ struct {
+ int listen;
+ /* fds used to send msg to secondary process(es) */
+ int secondaries[MAX_SECONDARY_PROCS];
+ };
+
+ /* fds for secondary process */
+ struct {
+ /* fds used to send msg to the primary process */
+ int primary;
+ };
+ };
+};
+
+static struct mp_fds mp_fds;
+
+struct msg_hdr {
+ char action_name[MAX_ACTION_NAME_LEN];
+ int fds_num;
+ int len_params;
+ char params[0];
+} __rte_packed;
+
+static int
+add_sec_proc(int fd)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (mp_fds.secondaries[i] == -1)
+ break;
+
+ if (i >= MAX_SECONDARY_PROCS)
+ return -1;
+
+ mp_fds.secondaries[i] = fd;
+
+ return i;
+}
+
+static void
+del_sec_proc(int fd)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (mp_fds.secondaries[i] == fd) {
+ mp_fds.secondaries[i] = -1;
+ break;
+ }
+ }
+}
+
+static int
+read_msg(int sockfd, char *buf, int buflen, int *fds, int fds_num)
+{
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fds_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ struct msg_hdr *hdr = (struct msg_hdr *)buf;
+ int ret, total;
+
+ /* read msg_hdr */
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = hdr;
+ iov.iov_len = sizeof(*hdr);
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ ret = recvmsg(sockfd, &msgh, 0);
+ if (ret != sizeof(struct msg_hdr)) {
+ RTE_LOG(ERR, EAL, "recvmsg failed\n");
+ return ret;
+ }
+
+ if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+ RTE_LOG(ERR, EAL, "truncted msg\n");
+ return -1;
+ }
+ total = ret;
+
+ /* read auxiliary FDs if any */
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(fds, CMSG_DATA(cmsg), fdsize);
+ break;
+ }
+ }
+
+ /* read params */
+ if (hdr->len_params) {
+ if (hdr->len_params > buflen - (int)sizeof(*hdr))
+ rte_exit(EXIT_FAILURE, "params too long\n");
+
+ ret = read(sockfd, &hdr->params, hdr->len_params);
+ if (ret != hdr->len_params)
+ rte_exit(EXIT_FAILURE, "failed to recv params\n");
+
+ total += ret;
+ }
+
+ RTE_LOG(INFO, EAL, "read msg: %s, %d\n", hdr->action_name,
+ (int)sizeof(*hdr) + hdr->len_params);
+ return total;
+}
+
+static int
+process_msg(int fd)
+{
+ int len;
+ int params_len;
+ char buf[MAX_MESSAGE_LENGTH];
+ int fds[SCM_MAX_FD];
+ struct msg_hdr *hdr;
+ struct action_entry *entry;
+
+ len = read_msg(fd, buf, MAX_MESSAGE_LENGTH, fds, SCM_MAX_FD);
+ if (len <= 0) {
+ RTE_LOG(ERR, EAL, "failed to read message: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ hdr = (struct msg_hdr *) buf;
+
+ entry = find_action_entry_by_name(hdr->action_name);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
+ hdr->action_name);
+ return -1;
+ }
+
+ params_len = len - sizeof(struct msg_hdr);
+
+ return entry->action(hdr->params, params_len, fds, hdr->fds_num);
+}
+
+static int
+add_secondary(void)
+{
+ int fd;
+ struct epoll_event ev;
+
+ while (1) {
+ fd = accept(mp_fds.listen, NULL, NULL);
+ if (fd < 0 && errno == EAGAIN)
+ break;
+ else if (fd < 0) {
+ RTE_LOG(ERR, EAL, "primary failed to accept: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ ev.data.fd = fd;
+ if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+ RTE_LOG(ERR, EAL, "failed to add secondary: %s\n",
+ strerror(errno));
+ break;
+ }
+ if (add_sec_proc(fd) < 0) {
+ RTE_LOG(ERR, EAL, "too many secondary processes\n");
+ close(fd);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void *
+mp_handler(void *arg __rte_unused)
+{
+ int fd;
+ int i, n;
+ struct epoll_event ev;
+ struct epoll_event *events;
+ int is_primary = rte_eal_process_type() == RTE_PROC_PRIMARY;
+
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ ev.data.fd = (is_primary) ? mp_fds.listen : mp_fds.primary;
+ if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+ RTE_LOG(ERR, EAL, "failed to epoll_ctl: %s\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ events = calloc(20, sizeof ev);
+
+ while (1) {
+ n = epoll_wait(mp_fds.efd, events, 20, -1);
+ for (i = 0; i < n; i++) {
+ if (is_primary && events[i].data.fd == mp_fds.listen) {
+ if (events[i].events != EPOLLIN) {
+ RTE_LOG(ERR, EAL, "what happens?\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (add_secondary() < 0)
+ break;
+
+ continue;
+ }
+
+ fd = events[i].data.fd;
+
+ if ((events[i].events & EPOLLIN)) {
+ if (process_msg(fd) < 0) {
+ RTE_LOG(ERR, EAL,
+ "failed to process msg\n");
+ if (!is_primary)
+ exit(EXIT_FAILURE);
+ }
+ continue;
+ }
+
+ /* EPOLLERR, EPOLLHUP, etc */
+ if (is_primary) {
+ RTE_LOG(ERR, EAL, "secondary exit: %d\n", fd);
+ epoll_ctl(mp_fds.efd, EPOLL_CTL_DEL, fd, NULL);
+ del_sec_proc(fd);
+ close(fd);
+ } else {
+ RTE_LOG(ERR, EAL, "primary exits, so do I\n");
+ /* Exit secondary when primary exits? */
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+ int i, fd, ret;
+ const char *path;
+ struct sockaddr_un un;
+ pthread_t tid;
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+ mp_fds.efd = epoll_create1(0);
+ if (mp_fds.efd < 0) {
+ RTE_LOG(ERR, EAL, "epoll_create1 failed\n");
+ return -1;
+ }
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Failed to create unix socket\n");
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ path = eal_mp_unix_path();
+ strncpy(un.sun_path, path, sizeof(un.sun_path));
+ un.sun_path[sizeof(un.sun_path) - 1] = '\0';
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ mp_fds.secondaries[i] = -1;
+
+ if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
+ RTE_LOG(ERR, EAL, "cannot set nonblocking mode\n");
+ close(fd);
+ return -1;
+ }
+
+ /* The file still exists since last run */
+ unlink(path);
+
+ ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to bind to %s: %s\n",
+ path, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ RTE_LOG(INFO, EAL, "primary bind to %s\n", path);
+
+ ret = listen(fd, 1024);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to listen: %s\n",
+ strerror(errno));
+ close(fd);
+ return -1;
+ }
+ mp_fds.listen = fd;
+ } else {
+ ret = connect(fd, (struct sockaddr *)&un, sizeof(un));
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to connect primary\n");
+ return -1;
+ }
+ mp_fds.primary = fd;
+ }
+
+ ret = pthread_create(&tid, NULL, mp_handler, NULL);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to create thead: %s\n",
+ strerror(errno));
+ close(fd);
+ close(mp_fds.efd);
+ return -1;
+ }
+
+ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+ "rte_mp_handle");
+ ret = rte_thread_setname(tid, thread_name);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to set thead name\n");
+ close(fd);
+ close(mp_fds.efd);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+send_msg(int fd, struct msghdr *p_msgh)
+{
+ int ret;
+
+ do {
+ ret = sendmsg(fd, p_msgh, 0);
+ } while (ret < 0 && errno == EINTR);
+
+ if (ret < 0)
+ RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
+
+ return ret;
+}
+
+int
+rte_eal_mp_sendmsg(const char *action_name,
+ const void *params,
+ int len_params,
+ int fds[],
+ int fds_num)
+{
+ int i;
+ int ret = 0;
+ struct msghdr msgh;
+ struct iovec iov;
+ size_t fd_size = fds_num * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+ struct cmsghdr *cmsg;
+ struct msg_hdr *msg;
+ int len_msg;
+
+ if (fds_num > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL,
+ "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ return -E2BIG;
+ }
+
+ len_msg = sizeof(struct msg_hdr) + len_params;
+ if (len_msg > MAX_MESSAGE_LENGTH) {
+ RTE_LOG(ERR, EAL, "Message is too long\n");
+ return -ENOMEM;
+ }
+
+ RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);
+
+ msg = malloc(len_msg);
+ if (!msg) {
+ RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+ return -ENOMEM;
+ }
+ memset(msg, 0, len_msg);
+ strcpy(msg->action_name, action_name);
+ msg->fds_num = fds_num;
+ msg->len_params = len_params;
+ memcpy(msg->params, params, len_params);
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = (uint8_t *)msg;
+ iov.iov_len = len_msg;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fd_size);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (mp_fds.secondaries[i] == -1)
+ continue;
+
+ ret = send_msg(mp_fds.secondaries[i], &msgh);
+ if (ret < 0)
+ break;
+ }
+ } else {
+ ret = send_msg(mp_fds.primary, &msgh);
+ }
+
+ free(msg);
+
+ return ret;
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index 8acbd99..3d9514f 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -67,6 +67,24 @@ eal_runtime_config_path(void)
return buffer;
}

+/** Path of primary/secondary communication unix socket file. */
+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)
+{
+ static char buffer[PATH_MAX]; /* static so auto-zeroed */
+ const char *directory = default_config_dir;
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
+ directory, internal_config.hugefile_prefix);
+
+ return buffer;
+
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 462226f..60944f2 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -224,4 +224,14 @@ int rte_eal_hugepage_attach(void);
*/
struct rte_bus *rte_bus_find_by_device_name(const char *str);

+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ * 0 on success;
+ * (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 8e4e71c..8776bcf 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -215,6 +215,74 @@ int rte_eal_init(int argc, char **argv);
int rte_eal_primary_proc_alive(const char *config_file_path);

/**
+ * Action function typedef used by other components.
+ *
+ * As we create unix socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const void *params, int len,
+ int fds[], int fds_num);
+/**
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param action_name
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ * The action argument is the function pointer to the action function.
+ *
+ * @return
+ * - 0 on success.
+ * - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
+/**
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param action_name
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * Send a message to the primary process or the secondary processes.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by action_name of the process on the other side.
+ *
+ * @param action_name
+ * The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ * The params argument contains the customized message.
+ *
+ * @param len_params
+ * The len_params argument is the length of the customized message.
+ *
+ * @param fds
+ * The fds argument is an array of fds sent with sendmsg.
+ *
+ * @param fds_num
+ * The fds_num argument is number of fds to be sent with sendmsg.
+ *
+ * @return
+ * - (>=0) on success.
+ * - (<0) on failure.
+ */
+int
+rte_eal_mp_sendmsg(const char *action_name, const void *params,
+ int len_params, int fds[], int fds_num);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..a84eab4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -896,6 +896,15 @@ rte_eal_init(int argc, char **argv)

eal_check_mem_on_local_socket();

+ if (rte_eal_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ rte_errno = EFAULT;
+ return -1;
+ }
+
+ if (eal_plugins_init() < 0)
+ rte_eal_init_alert("Cannot init plugins\n");
+
eal_thread_init_master(rte_config.master_lcore);

ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f4f46c1..6762397 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -235,4 +235,26 @@ EXPERIMENTAL {
rte_service_set_stats_enable;
rte_service_start_with_defaults;

+} DPDK_17.08;
+
+DPDK_17.11 {
+ global:
+
+ rte_bus_get_iommu_class;
+ rte_eal_iova_mode;
+ rte_eal_mbuf_default_mempool_ops;
+ rte_lcore_has_role;
+ rte_memcpy_ptr;
+ rte_pci_get_iommu_class;
+ rte_pci_match;
+
+} DPDK_17.08;
+
+DPDK_18.02 {
+ global:
+
+ rte_eal_mp_action_register;
+ rte_eal_mp_action_unregister;
+ rte_eal_mp_sendmsg;
+
} DPDK_17.11;

--
2.7.4

Burakov, Anatoly

2017-12-11 11:04:33 UTC

Post by Jianfeng Tan
Previouly, there are three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.
It'll be good to have a generic communication channel for multi-process
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to response immediately.
This patch proposes to create a communication channel, as an unix
socket connection, for above requirements. Primary will listen on
the unix socket; secondary will connect this socket to talk.
1. rte_eal_mp_action_register is used to register an action,
indexed by a string; if the calling component wants to
response the messages from the corresponding component in
its primary process or secondary processes.
2. rte_eal_mp_action_unregister is used to unregister the action
if the calling component does not want to response the messages.
3. rte_eal_mp_sendmsg is used to send a message.
---

<...snip...>

Post by Jianfeng Tan
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+ struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+ if (entry == NULL)
+ return -ENOMEM;
+
+ if (find_action_entry_by_name(action_name) != NULL)
+ return -EEXIST;

This should probably do a free(entry).

Post by Jianfeng Tan
+
+ strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+ entry->action = action;
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ return 0;
+}
+

<...snip...>

Post by Jianfeng Tan
+
+static int
+add_secondary(void)
+{
+ int fd;
+ struct epoll_event ev;
+
+ while (1) {
+ fd = accept(mp_fds.listen, NULL, NULL);
+ if (fd < 0 && errno == EAGAIN)
+ break;
+ else if (fd < 0) {
+ RTE_LOG(ERR, EAL, "primary failed to accept: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ ev.data.fd = fd;
+ if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+ RTE_LOG(ERR, EAL, "failed to add secondary: %s\n",
+ strerror(errno));
+ break;
+ }
+ if (add_sec_proc(fd) < 0) {
+ RTE_LOG(ERR, EAL, "too many secondary processes\n");
+ close(fd);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void *
+mp_handler(void *arg __rte_unused)
+{
+ int fd;
+ int i, n;
+ struct epoll_event ev;
+ struct epoll_event *events;
+ int is_primary = rte_eal_process_type() == RTE_PROC_PRIMARY;
+
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ ev.data.fd = (is_primary) ? mp_fds.listen : mp_fds.primary;
+ if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+ RTE_LOG(ERR, EAL, "failed to epoll_ctl: %s\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);

rte_exit?

Post by Jianfeng Tan
+ }
+
+ events = calloc(20, sizeof ev);
+
+ while (1) {
+ n = epoll_wait(mp_fds.efd, events, 20, -1);
+ for (i = 0; i < n; i++) {
+ if (is_primary && events[i].data.fd == mp_fds.listen) {
+ if (events[i].events != EPOLLIN) {
+ RTE_LOG(ERR, EAL, "what happens?\n");

More descriptive error message would be nice :)

Post by Jianfeng Tan
+ exit(EXIT_FAILURE);

rte_exit?

Post by Jianfeng Tan
+ }
+
+ if (add_secondary() < 0)
+ break;

Doing epoll_ctl in multiple different places hurts readability IMO.
Might be a good idea to refactor add_secondary and mp_handler in a way
that keeps all epoll handling in one place.

Post by Jianfeng Tan
+
+ continue;
+ }
+
+ fd = events[i].data.fd;
+
+ if ((events[i].events & EPOLLIN)) {
+ if (process_msg(fd) < 0) {
+ RTE_LOG(ERR, EAL,
+ "failed to process msg\n");
+ if (!is_primary)
+ exit(EXIT_FAILURE);

rte_exit()?

Post by Jianfeng Tan
+ }
+ continue;
+ }
+
+ /* EPOLLERR, EPOLLHUP, etc */
+ if (is_primary) {
+ RTE_LOG(ERR, EAL, "secondary exit: %d\n", fd);
+ epoll_ctl(mp_fds.efd, EPOLL_CTL_DEL, fd, NULL);
+ del_sec_proc(fd);
+ close(fd);
+ } else {
+ RTE_LOG(ERR, EAL, "primary exits, so do I\n");
+ /* Exit secondary when primary exits? */
+ exit(EXIT_FAILURE);

This is changing previous behavior. I don't think exiting secondary when
primary exits is something we want to do, so I would just print an
error, but not exit the process.

Post by Jianfeng Tan
+ }
+ }
+ }
+
+ return NULL;
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+ int i, fd, ret;
+ const char *path;
+ struct sockaddr_un un;
+ pthread_t tid;
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+ mp_fds.efd = epoll_create1(0);
+ if (mp_fds.efd < 0) {
+ RTE_LOG(ERR, EAL, "epoll_create1 failed\n");
+ return -1;
+ }
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Failed to create unix socket\n");
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ path = eal_mp_unix_path();
+ strncpy(un.sun_path, path, sizeof(un.sun_path));
+ un.sun_path[sizeof(un.sun_path) - 1] = '\0';
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ mp_fds.secondaries[i] = -1;
+
+ if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
+ RTE_LOG(ERR, EAL, "cannot set nonblocking mode\n");
+ close(fd);
+ return -1;
+ }
+
+ /* The file still exists since last run */
+ unlink(path);
+
+ ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to bind to %s: %s\n",
+ path, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ RTE_LOG(INFO, EAL, "primary bind to %s\n", path);
+
+ ret = listen(fd, 1024);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to listen: %s\n",
+ strerror(errno));
+ close(fd);
+ return -1;
+ }
+ mp_fds.listen = fd;
+ } else {
+ ret = connect(fd, (struct sockaddr *)&un, sizeof(un));
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to connect primary\n");
+ return -1;

Do we want to prevent secondary from launching if it can't connect to
primary? Some use cases might rely on previous behavior. Maybe instead
add some checks in handling functions to ensure that we have a valid
connection to the primary before doing anything?

Post by Jianfeng Tan
+ }
+ mp_fds.primary = fd;
+ }
+
+ ret = pthread_create(&tid, NULL, mp_handler, NULL);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to create thead: %s\n",
+ strerror(errno));
+ close(fd);
+ close(mp_fds.efd);
+ return -1;
+ }

<...snip...>

Post by Jianfeng Tan
+ if (fds_num > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL,
+ "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ return -E2BIG;
+ }
+
+ len_msg = sizeof(struct msg_hdr) + len_params;
+ if (len_msg > MAX_MESSAGE_LENGTH) {
+ RTE_LOG(ERR, EAL, "Message is too long\n");
+ return -ENOMEM;

Nitpicking, but is this really -ENOMEM? Shouldn't this be -EINVAL or
-E2BIG? Also, this is external API - maybe return -1 and set rte_errno?

Post by Jianfeng Tan
+ }
+
+ RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);

Do we want this as INFO, not DEBUG?

Post by Jianfeng Tan
+
+ msg = malloc(len_msg);
+ if (!msg) {
+ RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+ return -ENOMEM;
+ }

<...snip...>

Post by Jianfeng Tan
/**
+ * Action function typedef used by other components.
+ *
+ * As we create unix socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const void *params, int len,
+ int fds[], int fds_num);

Nitpicking, but probably needs newlines before comments, here and after
next function definition.

Post by Jianfeng Tan
+/**
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ * The action argument is the function pointer to the action function.
+ *
+ * - 0 on success.
+ * - (<0) on failure.
+ */

<...snip...>

Post by Jianfeng Tan
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..a84eab4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -896,6 +896,15 @@ rte_eal_init(int argc, char **argv)
eal_check_mem_on_local_socket();
+ if (rte_eal_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ rte_errno = EFAULT;
+ return -1;
+ }

As noted above, maybe only fail if it's primary process?

Post by Jianfeng Tan
+
+ if (eal_plugins_init() < 0)
+ rte_eal_init_alert("Cannot init plugins\n");

This is probably a leftover of some other patch?

Post by Jianfeng Tan
+
eal_thread_init_master(rte_config.master_lcore);
ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f4f46c1..6762397 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -235,4 +235,26 @@ EXPERIMENTAL {
rte_service_set_stats_enable;
rte_service_start_with_defaults;
+} DPDK_17.08;
+
+DPDK_17.11 {
+
+ rte_bus_get_iommu_class;
+ rte_eal_iova_mode;
+ rte_eal_mbuf_default_mempool_ops;
+ rte_lcore_has_role;
+ rte_memcpy_ptr;
+ rte_pci_get_iommu_class;
+ rte_pci_match;
+
+} DPDK_17.08;
+

Same here, this looks like leftovers of rebase.

Post by Jianfeng Tan
+DPDK_18.02 {
+
+ rte_eal_mp_action_register;
+ rte_eal_mp_action_unregister;
+ rte_eal_mp_sendmsg;
+
} DPDK_17.11;

--
Thanks,
Anatoly

Ananyev, Konstantin

2017-12-11 16:43:08 UTC

Hi Jianfeng,

-----Original Message-----
From: Tan, Jianfeng
Sent: Thursday, November 30, 2017 6:44 PM
Subject: [PATCH 1/3] eal: add channel for multi-process communication
Previouly, there are three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.
It'll be good to have a generic communication channel for multi-process
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to response immediately.
This patch proposes to create a communication channel, as an unix
socket connection, for above requirements. Primary will listen on
the unix socket; secondary will connect this socket to talk.

Kind of a generic question - why do you need a connection-oriented socket here?
Why wouldn't a connection-less socket be enough?
In that case you don't need to do listen/accept, and you don't need the epoll() loop either.
Instead, with a connection-less socket you can just use a blocking recvmsg()
inside mp_handler().

1. rte_eal_mp_action_register is used to register an action,
indexed by a string; if the calling component wants to
response the messages from the corresponding component in
its primary process or secondary processes.
2. rte_eal_mp_action_unregister is used to unregister the action
if the calling component does not want to response the messages.

I think you need some sort of synchronization between action_register/unregister()
and action_process() - mutex_lock or so.
Another thing - as I understand it, you use a string as the message/action identifier?
I think you need to limit its max length.
Konstantin

3. rte_eal_mp_sendmsg is used to send a message.
---
lib/librte_eal/common/eal_common_proc.c | 497 ++++++++++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 18 ++
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 68 +++++
lib/librte_eal/linuxapp/eal/eal.c | 9 +
lib/librte_eal/rte_eal_version.map | 22 ++
6 files changed, 624 insertions(+)
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 60526ca..5d0a095 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -33,8 +33,21 @@
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <limits.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include <rte_log.h>
#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include "eal_private.h"
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
@@ -59,3 +72,487 @@ rte_eal_primary_proc_alive(const char *config_file_path)
return !!ret;
}
+
+struct action_entry {
+ TAILQ_ENTRY(action_entry) next; /**< Next attached action entry */
+
+#define MAX_ACTION_NAME_LEN 64
+ char action_name[MAX_ACTION_NAME_LEN];
+ rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+ TAILQ_HEAD_INITIALIZER(action_entry_list);
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+ int len = strlen(name);
+ struct action_entry *entry;
+
+ TAILQ_FOREACH(entry, &action_entry_list, next) {
+ if (strncmp(entry->action_name, name, len) == 0)
+ break;
+ }
+
+ return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+ struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+ if (entry == NULL)
+ return -ENOMEM;
+
+ if (find_action_entry_by_name(action_name) != NULL)
+ return -EEXIST;
+
+ strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+ entry->action = action;
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+ struct action_entry *entry = find_action_entry_by_name(name);
+
+ TAILQ_REMOVE(&action_entry_list, entry, next);
+ free(entry);
+}
+
+/* The maximum amount of fd for one recvmsg/sendmsg */
+#define SCM_MAX_FD 253
+#define MAX_SECONDARY_PROCS 8
+#define MAX_MESSAGE_LENGTH 1024
+
+struct mp_fds {
+ int efd;
+
+ union {
+ /* fds for primary process */
+ struct {
+ int listen;
+ /* fds used to send msg to secondary process(es) */
+ int secondaries[MAX_SECONDARY_PROCS];
+ };
+
+ /* fds for secondary process */
+ struct {
+ /* fds used to send msg to the primary process */
+ int primary;
+ };
+ };
+};
+
+static struct mp_fds mp_fds;
+
+struct msg_hdr {
+ char action_name[MAX_ACTION_NAME_LEN];
+ int fds_num;
+ int len_params;
+ char params[0];
+} __rte_packed;
+
+static int
+add_sec_proc(int fd)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (mp_fds.secondaries[i] == -1)
+ break;
+
+ if (i >= MAX_SECONDARY_PROCS)
+ return -1;
+
+ mp_fds.secondaries[i] = fd;
+
+ return i;
+}
+
+static void
+del_sec_proc(int fd)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (mp_fds.secondaries[i] == fd) {
+ mp_fds.secondaries[i] = -1;
+ break;
+ }
+ }
+}
+
+static int
+read_msg(int sockfd, char *buf, int buflen, int *fds, int fds_num)
+{
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fds_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+ struct msg_hdr *hdr = (struct msg_hdr *)buf;
+ int ret, total;
+
+ /* read msg_hdr */
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = hdr;
+ iov.iov_len = sizeof(*hdr);
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ ret = recvmsg(sockfd, &msgh, 0);
+ if (ret != sizeof(struct msg_hdr)) {
+ RTE_LOG(ERR, EAL, "recvmsg failed\n");
+ return ret;
+ }
+
+ if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+ RTE_LOG(ERR, EAL, "truncted msg\n");
+ return -1;
+ }
+ total = ret;
+
+ /* read auxiliary FDs if any */
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(fds, CMSG_DATA(cmsg), fdsize);
+ break;
+ }
+ }
+
+ /* read params */
+ if (hdr->len_params) {
+ if (hdr->len_params > buflen - (int)sizeof(*hdr))
+ rte_exit(EXIT_FAILURE, "params too long\n");
+
+ ret = read(sockfd, &hdr->params, hdr->len_params);
+ if (ret != hdr->len_params)
+ rte_exit(EXIT_FAILURE, "failed to recv params\n");
+
+ total += ret;
+ }
+
+ RTE_LOG(INFO, EAL, "read msg: %s, %d\n", hdr->action_name,
+ (int)sizeof(*hdr) + hdr->len_params);
+ return total;
+}
+
+static int
+process_msg(int fd)
+{
+ int len;
+ int params_len;
+ char buf[MAX_MESSAGE_LENGTH];
+ int fds[SCM_MAX_FD];
+ struct msg_hdr *hdr;
+ struct action_entry *entry;
+
+ len = read_msg(fd, buf, MAX_MESSAGE_LENGTH, fds, SCM_MAX_FD);
+ if (len <= 0) {
+ RTE_LOG(ERR, EAL, "failed to read message: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ hdr = (struct msg_hdr *) buf;
+
+ entry = find_action_entry_by_name(hdr->action_name);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
+ hdr->action_name);
+ return -1;
+ }
+
+ params_len = len - sizeof(struct msg_hdr);
+
+ return entry->action(hdr->params, params_len, fds, hdr->fds_num);
+}
+
+static int
+add_secondary(void)
+{
+ int fd;
+ struct epoll_event ev;
+
+ while (1) {
+ fd = accept(mp_fds.listen, NULL, NULL);
+ if (fd < 0 && errno == EAGAIN)
+ break;
+ else if (fd < 0) {
+ RTE_LOG(ERR, EAL, "primary failed to accept: %s\n",
+ strerror(errno));
+ return -1;
+ }
+
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ ev.data.fd = fd;
+ if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+ RTE_LOG(ERR, EAL, "failed to add secondary: %s\n",
+ strerror(errno));
+ break;
+ }
+ if (add_sec_proc(fd) < 0) {
+ RTE_LOG(ERR, EAL, "too many secondary processes\n");
+ close(fd);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void *
+mp_handler(void *arg __rte_unused)
+{
+ int fd;
+ int i, n;
+ struct epoll_event ev;
+ struct epoll_event *events;
+ int is_primary = rte_eal_process_type() == RTE_PROC_PRIMARY;
+
+ ev.events = EPOLLIN | EPOLLRDHUP;
+ ev.data.fd = (is_primary) ? mp_fds.listen : mp_fds.primary;
+ if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+ RTE_LOG(ERR, EAL, "failed to epoll_ctl: %s\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ events = calloc(20, sizeof ev);
+
+ while (1) {
+ n = epoll_wait(mp_fds.efd, events, 20, -1);
+ for (i = 0; i < n; i++) {
+ if (is_primary && events[i].data.fd == mp_fds.listen) {
+ if (events[i].events != EPOLLIN) {
+ RTE_LOG(ERR, EAL, "what happens?\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (add_secondary() < 0)
+ break;
+
+ continue;
+ }
+
+ fd = events[i].data.fd;
+
+ if ((events[i].events & EPOLLIN)) {
+ if (process_msg(fd) < 0) {
+ RTE_LOG(ERR, EAL,
+ "failed to process msg\n");
+ if (!is_primary)
+ exit(EXIT_FAILURE);
+ }
+ continue;
+ }
+
+ /* EPOLLERR, EPOLLHUP, etc */
+ if (is_primary) {
+ RTE_LOG(ERR, EAL, "secondary exit: %d\n", fd);
+ epoll_ctl(mp_fds.efd, EPOLL_CTL_DEL, fd, NULL);
+ del_sec_proc(fd);
+ close(fd);
+ } else {
+ RTE_LOG(ERR, EAL, "primary exits, so do I\n");
+ /* Exit secondary when primary exits? */
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+ int i, fd, ret;
+ const char *path;
+ struct sockaddr_un un;
+ pthread_t tid;
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+ mp_fds.efd = epoll_create1(0);
+ if (mp_fds.efd < 0) {
+ RTE_LOG(ERR, EAL, "epoll_create1 failed\n");
+ return -1;
+ }
+
+ fd = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Failed to create unix socket\n");
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ path = eal_mp_unix_path();
+ strncpy(un.sun_path, path, sizeof(un.sun_path));
+ un.sun_path[sizeof(un.sun_path) - 1] = '\0';
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ mp_fds.secondaries[i] = -1;
+
+ if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
+ RTE_LOG(ERR, EAL, "cannot set nonblocking mode\n");
+ close(fd);
+ return -1;
+ }
+
+ /* The file still exists since last run */
+ unlink(path);
+
+ ret = bind(fd, (struct sockaddr *)&un, sizeof(un));
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to bind to %s: %s\n",
+ path, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ RTE_LOG(INFO, EAL, "primary bind to %s\n", path);
+
+ ret = listen(fd, 1024);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to listen: %s\n",
+ strerror(errno));
+ close(fd);
+ return -1;
+ }
+ mp_fds.listen = fd;
+ } else {
+ ret = connect(fd, (struct sockaddr *)&un, sizeof(un));
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to connect primary\n");
+ return -1;
+ }
+ mp_fds.primary = fd;
+ }
+
+ ret = pthread_create(&tid, NULL, mp_handler, NULL);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to create thead: %s\n",
+ strerror(errno));
+ close(fd);
+ close(mp_fds.efd);
+ return -1;
+ }
+
+ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
+ "rte_mp_handle");
+ ret = rte_thread_setname(tid, thread_name);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to set thead name\n");
+ close(fd);
+ close(mp_fds.efd);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+send_msg(int fd, struct msghdr *p_msgh)
+{
+ int ret;
+
+ do {
+ ret = sendmsg(fd, p_msgh, 0);
+ } while (ret < 0 && errno == EINTR);
+
+ if (ret < 0)
+ RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
+
+ return ret;
+}
+
+int
+rte_eal_mp_sendmsg(const char *action_name,
+ const void *params,
+ int len_params,
+ int fds[],
+ int fds_num)
+{
+ int i;
+ int ret = 0;
+ struct msghdr msgh;
+ struct iovec iov;
+ size_t fd_size = fds_num * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+ struct cmsghdr *cmsg;
+ struct msg_hdr *msg;
+ int len_msg;
+
+ if (fds_num > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL,
+ "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ return -E2BIG;
+ }
+
+ len_msg = sizeof(struct msg_hdr) + len_params;
+ if (len_msg > MAX_MESSAGE_LENGTH) {
+ RTE_LOG(ERR, EAL, "Message is too long\n");
+ return -ENOMEM;
+ }
+
+ RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);
+
+ msg = malloc(len_msg);
+ if (!msg) {
+ RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+ return -ENOMEM;
+ }
+ memset(msg, 0, len_msg);
+ strcpy(msg->action_name, action_name);
+ msg->fds_num = fds_num;
+ msg->len_params = len_params;
+ memcpy(msg->params, params, len_params);
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = (uint8_t *)msg;
+ iov.iov_len = len_msg;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fd_size);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (mp_fds.secondaries[i] == -1)
+ continue;
+
+ ret = send_msg(mp_fds.secondaries[i], &msgh);
+ if (ret < 0)
+ break;
+ }
+ } else {
+ ret = send_msg(mp_fds.primary, &msgh);
+ }
+
+ free(msg);
+
+ return ret;
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index 8acbd99..3d9514f 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -67,6 +67,24 @@ eal_runtime_config_path(void)
return buffer;
}
+/** Path of primary/secondary communication unix socket file. */
+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)
+{
+ static char buffer[PATH_MAX]; /* static so auto-zeroed */
+ const char *directory = default_config_dir;
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
+ directory, internal_config.hugefile_prefix);
+
+ return buffer;
+
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index 462226f..60944f2 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -224,4 +224,14 @@ int rte_eal_hugepage_attach(void);
*/
struct rte_bus *rte_bus_find_by_device_name(const char *str);
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * 0 on success;
+ * (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 8e4e71c..8776bcf 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -215,6 +215,74 @@ int rte_eal_init(int argc, char **argv);
int rte_eal_primary_proc_alive(const char *config_file_path);
/**
+ * Action function typedef used by other components.
+ *
+ * As we create unix socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const void *params, int len,
+ int fds[], int fds_num);
+/**
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ * The action argument is the function pointer to the action function.
+ *
+ * - 0 on success.
+ * - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
+/**
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * Send a message to the primary process or the secondary processes.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by action_name of the process on the other side.
+ *
+ * The action_name argument is used to identify which action will be used.
+ *
+ * The params argument contains the customized message.
+ *
+ * The len_params argument is the length of the customized message.
+ *
+ * The fds argument is an array of fds sent with sendmsg.
+ *
+ * The fds_num argument is number of fds to be sent with sendmsg.
+ *
+ * - (>=0) on success.
+ * - (<0) on failure.
+ */
+int
+rte_eal_mp_sendmsg(const char *action_name, const void *params,
+ int len_params, int fds[], int fds_num);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..a84eab4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -896,6 +896,15 @@ rte_eal_init(int argc, char **argv)
eal_check_mem_on_local_socket();
+ if (rte_eal_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ rte_errno = EFAULT;
+ return -1;
+ }
+
+ if (eal_plugins_init() < 0)
+ rte_eal_init_alert("Cannot init plugins\n");
+
eal_thread_init_master(rte_config.master_lcore);
ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f4f46c1..6762397 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -235,4 +235,26 @@ EXPERIMENTAL {
rte_service_set_stats_enable;
rte_service_start_with_defaults;
+} DPDK_17.08;
+
+DPDK_17.11 {
+
+ rte_bus_get_iommu_class;
+ rte_eal_iova_mode;
+ rte_eal_mbuf_default_mempool_ops;
+ rte_lcore_has_role;
+ rte_memcpy_ptr;
+ rte_pci_get_iommu_class;
+ rte_pci_match;
+
+} DPDK_17.08;
+
+DPDK_18.02 {
+
+ rte_eal_mp_action_register;
+ rte_eal_mp_action_unregister;
+ rte_eal_mp_sendmsg;
+
} DPDK_17.11;
--
2.7.4

Jianfeng Tan

2017-11-30 18:44:09 UTC

We need a synchronous way for multi-process communication, that
is to say, we need an immediate response after we send a message
to the other side.

We will stop the mp_handler thread, and after sending the message,
the sending thread will wait there for the response and process
it.

Suggested-by: Anatoly Burakov <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/common/eal_common_proc.c | 53 +++++++++++++++++++++++++++++++--
lib/librte_eal/common/include/rte_eal.h | 5 +++-
2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 5d0a095..65ebaf2 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -30,6 +30,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

+#define _GNU_SOURCE
+
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
@@ -41,6 +43,8 @@
#include <sys/un.h>
#include <errno.h>
#include <pthread.h>
+#include <sys/eventfd.h>
+#include <signal.h>

#include <rte_log.h>
#include <rte_eal.h>
@@ -134,6 +138,7 @@ rte_eal_mp_action_unregister(const char *name)

struct mp_fds {
int efd;
+ int evfd; /* eventfd used for pausing mp_handler thread */

union {
/* fds for primary process */
@@ -331,6 +336,13 @@ mp_handler(void *arg __rte_unused)
exit(EXIT_FAILURE);
}

+ ev.data.fd = mp_fds.evfd;
+ if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+ RTE_LOG(ERR, EAL, "epoll_ctl failed: %s\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
events = calloc(20, sizeof ev);

while (1) {
@@ -348,6 +360,14 @@ mp_handler(void *arg __rte_unused)
continue;
}

+ if (events[i].data.fd == mp_fds.evfd) {
+ RTE_LOG(INFO, EAL, "mp_handler thread will pause\n");
+ pause();
+ RTE_LOG(INFO, EAL, "mp_handler thread stops pausing\n");
+
+ continue;
+ }
+
fd = events[i].data.fd;

if ((events[i].events & EPOLLIN)) {
@@ -377,13 +397,14 @@ mp_handler(void *arg __rte_unused)
return NULL;
}

+static pthread_t tid;
+
int
rte_eal_mp_channel_init(void)
{
int i, fd, ret;
const char *path;
struct sockaddr_un un;
- pthread_t tid;
char thread_name[RTE_MAX_THREAD_NAME_LEN];

mp_fds.efd = epoll_create1(0);
@@ -462,6 +483,8 @@ rte_eal_mp_channel_init(void)
return -1;
}

+ mp_fds.evfd = eventfd(0, 0);
+
return 0;
}

@@ -485,7 +508,8 @@ rte_eal_mp_sendmsg(const char *action_name,
const void *params,
int len_params,
int fds[],
- int fds_num)
+ int fds_num,
+ int need_ack)
{
int i;
int ret = 0;
@@ -511,6 +535,11 @@ rte_eal_mp_sendmsg(const char *action_name,

RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);

+ if (need_ack) {
+ // stop mp_handler thread.
+ eventfd_write(mp_fds.evfd, (eventfd_t)1);
+ }
+
msg = malloc(len_msg);
if (!msg) {
RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
@@ -547,12 +576,32 @@ rte_eal_mp_sendmsg(const char *action_name,
ret = send_msg(mp_fds.secondaries[i], &msgh);
if (ret < 0)
break;
+
+ if (need_ack) {
+ /* We will hang there until the other side
+ * responses and what if other side is sending
+ * msg at the same time?
+ */
+ process_msg(mp_fds.secondaries[i]);
+ }
}
} else {
ret = send_msg(mp_fds.primary, &msgh);
+
+ if (ret > 0 && need_ack) {
+ // We will hang there until the other side responses
+ ret = process_msg(mp_fds.primary);
+ }
}

free(msg);

+ if (need_ack) {
+ // start mp_handler thread.
+ union sigval value;
+
+ pthread_sigqueue(tid, 0, value);
+ }
+
return ret;
}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 8776bcf..9875cae 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -274,13 +274,16 @@ void rte_eal_mp_action_unregister(const char *name);
* @param fds_num
* The fds_num argument is number of fds to be sent with sendmsg.
*
+ * @param need_ack
+ * The fds_num argument is number of fds to be sent with sendmsg.
+ *
* @return
* - (>=0) on success.
* - (<0) on failure.
*/
int
rte_eal_mp_sendmsg(const char *action_name, const void *params,
- int len_params, int fds[], int fds_num);
+ int len_params, int fds[], int fds_num, int need_ack);

/**
* Usage function typedef used by the application usage function.

--
2.7.4

Burakov, Anatoly

2017-12-11 11:39:22 UTC

Post by Jianfeng Tan
We need a synchronous way for multi-process communication, that
is to say, we need an immediate response after we send a message
to the other side.
We will stop the mp_handler thread, and after sending the message,
the sending thread will wait there for the response and process
it.
---
lib/librte_eal/common/eal_common_proc.c | 53 +++++++++++++++++++++++++++++++--
lib/librte_eal/common/include/rte_eal.h | 5 +++-
2 files changed, 55 insertions(+), 3 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 5d0a095..65ebaf2 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -30,6 +30,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#define _GNU_SOURCE
+

shouldn't this be in Makefile flags?

Post by Jianfeng Tan
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
@@ -41,6 +43,8 @@
#include <sys/un.h>
#include <errno.h>
#include <pthread.h>
+#include <sys/eventfd.h>
+#include <signal.h>
#include <rte_log.h>
#include <rte_eal.h>
@@ -134,6 +138,7 @@ rte_eal_mp_action_unregister(const char *name)
struct mp_fds {
int efd;
+ int evfd; /* eventfd used for pausing mp_handler thread */
union {
/* fds for primary process */
@@ -331,6 +336,13 @@ mp_handler(void *arg __rte_unused)
exit(EXIT_FAILURE);
}
+ ev.data.fd = mp_fds.evfd;
+ if (epoll_ctl(mp_fds.efd, EPOLL_CTL_ADD, ev.data.fd, &ev) < 0) {
+ RTE_LOG(ERR, EAL, "epoll_ctl failed: %s\n",
+ strerror(errno));
+ exit(EXIT_FAILURE);

here and in other places - rte_exit?

Post by Jianfeng Tan
+ }
+
events = calloc(20, sizeof ev);
while (1) {
@@ -348,6 +360,14 @@ mp_handler(void *arg __rte_unused)
continue;
}
+ if (events[i].data.fd == mp_fds.evfd) {
+ RTE_LOG(INFO, EAL, "mp_handler thread will pause\n");
+ pause();
+ RTE_LOG(INFO, EAL, "mp_handler thread stops pausing\n");
+
+ continue;
+ }
+
fd = events[i].data.fd;
if ((events[i].events & EPOLLIN)) {
@@ -377,13 +397,14 @@ mp_handler(void *arg __rte_unused)
return NULL;
}
+static pthread_t tid;
+
int
rte_eal_mp_channel_init(void)
{
int i, fd, ret;
const char *path;
struct sockaddr_un un;
- pthread_t tid;
char thread_name[RTE_MAX_THREAD_NAME_LEN];
mp_fds.efd = epoll_create1(0);
@@ -462,6 +483,8 @@ rte_eal_mp_channel_init(void)
return -1;
}
+ mp_fds.evfd = eventfd(0, 0);
+
return 0;
}
@@ -485,7 +508,8 @@ rte_eal_mp_sendmsg(const char *action_name,
const void *params,
int len_params,
int fds[],
- int fds_num)
+ int fds_num,
+ int need_ack)

I think "need_ack" is a misnomer because what we really want is not
an "ack" but a response.

More importantly, I think for clarity's sake, this should be a separate
function - something like rte_eal_mp_sendreq() or maybe a better name
(reqdata? communicate?). Also, I don't think reusing the send parameters is
a good idea - a user is expecting a response, so the user allocates data
for the response separately from the request, and passes it explicitly.

Post by Jianfeng Tan
{
int i;
int ret = 0;
@@ -511,6 +535,11 @@ rte_eal_mp_sendmsg(const char *action_name,
RTE_LOG(INFO, EAL, "send msg: %s, %d\n", action_name, len_msg);
+ if (need_ack) {
+ // stop mp_handler thread.

Do we accept C++-style comments?

Post by Jianfeng Tan
+ eventfd_write(mp_fds.evfd, (eventfd_t)1);
+ }
+
msg = malloc(len_msg);
if (!msg) {
RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
@@ -547,12 +576,32 @@ rte_eal_mp_sendmsg(const char *action_name,
ret = send_msg(mp_fds.secondaries[i], &msgh);
if (ret < 0)
break;
+
+ if (need_ack) {
+ /* We will hang there until the other side
+ * responses and what if other side is sending
+ * msg at the same time?
+ */
+ process_msg(mp_fds.secondaries[i]);
+ }
}
} else {
ret = send_msg(mp_fds.primary, &msgh);
+
+ if (ret > 0 && need_ack) {
+ // We will hang there until the other side responses
+ ret = process_msg(mp_fds.primary);
+ }
}
free(msg);
+ if (need_ack) {
+ // start mp_handler thread.
+ union sigval value;

it's not used, but still, maybe zero-initialize it?

Post by Jianfeng Tan
+
+ pthread_sigqueue(tid, 0, value);
+ }
+
return ret;
}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 8776bcf..9875cae 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -274,13 +274,16 @@ void rte_eal_mp_action_unregister(const char *name);
* The fds_num argument is number of fds to be sent with sendmsg.
*
+ * The fds_num argument is number of fds to be sent with sendmsg.
+ *
* - (>=0) on success.
* - (<0) on failure.
*/
int
rte_eal_mp_sendmsg(const char *action_name, const void *params,
- int len_params, int fds[], int fds_num);
+ int len_params, int fds[], int fds_num, int need_ack);
/**
* Usage function typedef used by the application usage function.

--
Thanks,
Anatoly

Ananyev, Konstantin

2017-12-11 16:49:18 UTC

Post by Burakov, Anatoly

Post by Jianfeng Tan
@@ -485,7 +508,8 @@ rte_eal_mp_sendmsg(const char *action_name,
const void *params,
int len_params,
int fds[],
- int fds_num)
+ int fds_num,
+ int need_ack)

I think "need_ack" is a misnomer because what we really want is not
"ack" but a response.
More importantly, i think for clarity's sake, this should be a separate
function - something like rte_eal_mp_sendreq() or maybe a better name
(reqdata? communicate?).

+1 for a separate function.
Also I don't think it should disturb/block mp_handler() - there could be messages
for other actions (from other endpoints).
I think only rte_eal_mp_sendreq() should be blocked till ack/response is received.
And probably it needs max timeout to block f

Jianfeng Tan

2017-11-30 18:44:10 UTC

Previously, vfio had its own channel for the secondary process to
get the container fd and group fd from the primary process.

This patch changes vfio to use the generic mp channel.

Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal.c | 14 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 139 +++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 416 ++++---------------------
4 files changed, 109 insertions(+), 475 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index a84eab4..93824bf 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -713,18 +713,8 @@ static int rte_eal_vfio_setup(void)
return -1;
vfio_enabled = rte_vfio_is_enabled("vfio");

- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }
+ if (vfio_enabled && vfio_mp_sync_setup() < 0)
+ return -1;

return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 58f0123..dbea350 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -68,9 +68,11 @@ int
vfio_get_group_fd(int iommu_group_no)
{
int i;
+ int ret;
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct vfio_mp_param p;

/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -129,51 +131,21 @@ vfio_get_group_fd(int iommu_group_no)
vfio_cfg.vfio_active_groups++;
return vfio_group_fd;
}
- /* if we're in a secondary process, request group fd from the primary
- * process via our socket
- */
- else {
- int socket_fd, ret;
+ /* For secondary process, request group fd from the primary */

- socket_fd = vfio_mp_sync_connect_to_primary();
+ p.req = SOCKET_REQ_GROUP;
+ p.group_no = iommu_group_no;

- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- close(socket_fd);
- return 0;
- case SOCKET_OK:
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_no = iommu_group_no;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- default:
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
- }
+ ret = rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " cannot request group fd!\n");
+ cur_grp->group_no = -1;
+ } else {
+ cur_grp->group_no = iommu_group_no;
+ vfio_cfg.vfio_active_groups++;
}
- return -1;
+
+ return ret;
}

@@ -229,11 +201,12 @@ int
clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ struct vfio_mp_param p;
+
+ i = get_vfio_group_idx(vfio_group_fd);

if (internal_config.process_type == RTE_PROC_PRIMARY) {

- i = get_vfio_group_idx(vfio_group_fd);
if (i < 0)
return -1;
vfio_cfg.vfio_groups[i].group_no = -1;
@@ -243,44 +216,20 @@ clear_group(int vfio_group_fd)
return 0;
}

- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
+ p.req = SOCKET_CLR_GROUP;
+ p.group_no = vfio_cfg.vfio_groups[i].group_no;

- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
+ if (rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1) < 0) {
+ RTE_LOG(ERR, EAL, "request primary to clear group fd, failed!\n");
return -1;
}

- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
- }
+ vfio_cfg.vfio_groups[i].group_no = -1;
+ vfio_cfg.vfio_groups[i].fd = -1;
+ vfio_cfg.vfio_groups[i].devices = 0;
+ vfio_cfg.vfio_active_groups--;

- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- case SOCKET_OK:
- close(socket_fd);
- return 0;
- case SOCKET_ERR:
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- default:
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
- return -1;
+ return 0;
}

int
@@ -590,6 +539,7 @@ int
vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct vfio_mp_param p;

/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -620,34 +570,17 @@ vfio_get_container_fd(void)
}

return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
- }
- close(socket_fd);
- return vfio_container_fd;
}

- return -1;
+ /* For secondary process, request container fd from primary process */
+
+ p.req = SOCKET_REQ_CONTAINER;
+
+ ret = rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1);
+ if (ret < 0)
+ RTE_LOG(ERR, EAL, " cannot request container fd!\n");
+
+ return ret;
}

int
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index ba7892b..7907c22 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -117,15 +117,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS 64

/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -190,6 +181,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF

+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_no;
+};
+
#endif /* VFIO_PRESENT */

#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index b53ed7e..dfba58f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,7 +1,7 @@
/*-
* BSD LICENSE
*
- * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ * Copyright(c) 2010-2017 Intel Corporation. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -31,31 +31,11 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

-#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
-
#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>

-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"

/**
* @file
@@ -66,360 +46,94 @@

#ifdef VFIO_PRESENT

-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
+static int
+vfio_mp_primary(const void *params, int len,
+ int fd[] __rte_unused, int fds_num __rte_unused)
{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
+ int fds[1];
+ const struct vfio_mp_param *p = params;
+ struct vfio_mp_param r;

- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (len != sizeof(*p)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
-
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- case SOCKET_REQ_CONTAINER:
- fd = vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- case SOCKET_REQ_GROUP:
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = vfio_get_group_fd(vfio_data);
-
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (p->req) {
+ case SOCKET_REQ_GROUP:
+ r.req = SOCKET_REQ_GROUP;
+ r.group_no = p->group_no;
+ fds[0] = vfio_get_group_fd(p->group_no);
+ if (fds[0] < 0) {
+ r.result = SOCKET_ERR;
+ rte_eal_mp_sendmsg("vfio", &r, sizeof(r), NULL, 0, 0);
+ } else if (fds[0] == 0) {
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r.result = SOCKET_NO_FD;
+ rte_eal_mp_sendmsg("vfio", &r, sizeof(r), NULL, 0, 0);
+ } else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- case SOCKET_CLR_GROUP:
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- default:
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r.result = SOCKET_OK;
+ rte_eal_mp_sendmsg("vfio", &r, sizeof(r), fds, 1, 0);
}
- close(conn_sock);
+ break;
+ case SOCKET_REQ_CONTAINER:
+ r.req = SOCKET_REQ_CONTAINER;
+ fds[0] = vfio_get_container_fd();
+ rte_eal_mp_sendmsg("vfio", &r, sizeof(r), fds, 1, 0);
+ break;
+ default:
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
+ return -1;
}
+
+ return 0;
}

static int
-vfio_mp_sync_socket_setup(void)
+vfio_mp_secondary(const void *params, int len, int fds[],
+ int fds_num __rte_unused)
{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
- return -1;
- }
-
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
+ const struct vfio_mp_param *p = params;

- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
+ if (len != sizeof(*p)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
+ switch (p->req) {
+ case SOCKET_REQ_GROUP:
+ switch (p->result) {
+ case SOCKET_NO_FD:
+ return 0;
+ case SOCKET_OK:
+ if (fds_num == 1 && fds[0] > 0)
+ return fds[0];
+ /* fall-through on error */
+ default:
+ RTE_LOG(ERR, EAL, " cannot get group fd!\n");
+ return -1;
+ }
+ case SOCKET_REQ_CONTAINER:
+ if (fds_num == 1 && fds[0] > 0)
+ return fds[0];
return -1;
+ default:
+ RTE_LOG(ERR, EAL, "Invalid req!\n");
}
-
- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
-
- return 0;
+ return -1;
}

-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ rte_eal_mp_t action;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ action = vfio_mp_primary;
+ else
+ action = vfio_mp_secondary;

- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
-
- return 0;
+ return rte_eal_mp_action_register("vfio", action);
}

#endif

--
2.7.4

Burakov, Anatoly

2017-12-11 12:01:08 UTC

Post by Jianfeng Tan
Previously, vfio has its own channel for the secondary process to
get container fd and group fd from the primary process.
This patch changes to use the generic mp channel.
---
lib/librte_eal/linuxapp/eal/eal.c | 14 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 139 +++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 416 ++++---------------------
4 files changed, 109 insertions(+), 475 deletions(-)
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index a84eab4..93824bf 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c

<...snip...>

Post by Jianfeng Tan
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
- }
+ ret = rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " cannot request group fd!\n");
+ cur_grp->group_no = -1;
+ } else {
+ cur_grp->group_no = iommu_group_no;
+ vfio_cfg.vfio_active_groups++;
}

Either I'm missing something here, or I don't see where we actually
store the group fd (e.g. the "cur_grp->fd = vfio_group_fd" part from the
previous code).

Also, this is why i mentioned "receive parameters" in comments to
previous patch - looking at this code, it is quite unclear that the
return from rte_eal_mp_sendmsg is either error or, well, "something",
defined as "whatever mp_action returns". It would be much clearer if we
were explicitly getting some data in response.

Post by Jianfeng Tan
- return -1;
+
+ return ret;
}

<...snip...>

Post by Jianfeng Tan
+ /* For secondary process, request container fd from primary process */
+
+ p.req = SOCKET_REQ_CONTAINER;
+
+ ret = rte_eal_mp_sendmsg("vfio", &p, sizeof(p), NULL, 0, 1);
+ if (ret < 0)
+ RTE_LOG(ERR, EAL, " cannot request container fd!\n");

Again here, looks counter-intuitive to get container fd in return - it
would've been much clearer to have a separate response parameter.

Post by Jianfeng Tan
+
+ return ret;
}

<...snip...>

Post by Jianfeng Tan
static int
-vfio_mp_sync_socket_setup(void)
+vfio_mp_secondary(const void *params, int len, int fds[],
+ int fds_num __rte_unused)

fds_num isn't unused here.

Post by Jianfeng Tan
{

--
Thanks,
Anatoly

Burakov, Anatoly

2017-12-11 09:59:46 UTC

Post by Jianfeng Tan
This patchset adds a generic channel for multi-process (primary/secondary)
communication.
Patch 1: addess the purpose and howto;
Patch 2: add a syncrhonous way for those messages which need a response immediately.
Patch 3: Rework vfio to use this generic communication channel.

Hi Jianfeng,

Just a general comment: I am assuming this has the limitation of
"everything happens through primary process's involvement". This will
work for VFIO, as secondary always needs to ask the primary before doing
anything, but it doesn't address other issues that could have been
addressed with IPC.

For example, if a primary process would've hotplugged a device, it can't
notify all secondary processes about this; rather, it has to wait until
secondary processes ask for this info. Neither can it do anything if
secondary requests a primary to do something, and notify other secondary
processes about it (i.e. if secondary wants to hotplug a device, but
there are other secondaries also running). It would be great to have a
standard way of doing things like this in future revisions of our IPC.

--
Thanks,
Anatoly

Tan, Jianfeng

2017-12-12 07:34:03 UTC

Hi Anatoly,

-----Original Message-----
From: Burakov, Anatoly
Sent: Monday, December 11, 2017 6:00 PM
Subject: Re: [PATCH 0/3] generic channel for multi-process communication

Post by Jianfeng Tan
This patchset adds a generic channel for multi-process (primary/secondary)
communication.
Patch 1: addess the purpose and howto;
Patch 2: add a syncrhonous way for those messages which need a response

immediately.

Post by Jianfeng Tan
Patch 3: Rework vfio to use this generic communication channel.

Hi Jianfeng,
Just a general comment: I am assuming this has the limitation of
"everything happens through primary process's involvement". This will
work for VFIO, as secondary always needs to ask the primary before doing
anything, but it doesn't address other issues that could have been
addressed with IPC.
For example, if a primary process would've hotplugged a device, it can't
notify all secondary processes about this; rather, it has to wait until
secondary processes ask for this info.

No need to wait for the secondary to pull such info.

It can work like this:
(1) Register a hotplug callback for each process at initialization;
(2) Whenever a process wants to hotplug a device, it will broadcast the info, by broadcast, I mean:
- if plugin happens at the primary, the primary will tell all of the secondary processes;
- if plugin happens at one secondary, it will first tell the primary, and the primary will broadcast it to all the secondary processes.

Neither can it do anything if
secondary requests a primary to do something, and notify other secondary
processes about it (i.e. if secondary wants to hotplug a device, but
there are other secondaries also running). It would be great to have a
standard way of doing things like this in future revisions of our IPC.

Please review the above; if you are OK with it, I'll include this in the next version.

Thanks,
Jianfeng

Burakov, Anatoly

2017-12-12 16:18:15 UTC

Post by Tan, Jianfeng
Hi Anatoly,

-----Original Message-----
From: Burakov, Anatoly
Sent: Monday, December 11, 2017 6:00 PM
Subject: Re: [PATCH 0/3] generic channel for multi-process communication

Post by Jianfeng Tan
This patchset adds a generic channel for multi-process (primary/secondary)
communication.
Patch 1: addess the purpose and howto;
Patch 2: add a syncrhonous way for those messages which need a response

immediately.

Post by Jianfeng Tan
Patch 3: Rework vfio to use this generic communication channel.

Hi Jianfeng,
Just a general comment: I am assuming this has the limitation of
"everything happens through primary process's involvement". This will
work for VFIO, as secondary always needs to ask the primary before doing
anything, but it doesn't address other issues that could have been
addressed with IPC.
For example, if a primary process would've hotplugged a device, it can't
notify all secondary processes about this; rather, it has to wait until
secondary processes ask for this info.

No need to wait the secondary to pull such info.
(1) Register a hotplug callback for each process at initialization;
- if plugin happens at the primary, the primary will tell all of the secondary processes;
- if plugin happens at one secondary, it will firstly tell the primary, and the primary will broadcast it to all the secondary process.

Neither can it do anything if
secondary requests a primary to do something, and notify other secondary
processes about it (i.e. if secondary wants to hotplug a device, but
there are other secondaries also running). It would be great to have a
standard way of doing things like this in future revisions of our IPC.

Please review above thing; If you are OK with that, I'll include this in the next version.
Thanks,
Jianfeng

--
Thanks,
Anatoly

Yes, that would work, my bad. However i don't think we necessarily need
it right now. This can go in a separate patch. I was rather looking at
other, future potential use cases, hotplug was just an example.

--
Thanks,
Anatoly

Jianfeng Tan

2018-01-11 04:07:30 UTC

v1->v2: (Address comments from Anatoly and Konstantin)
- Use datagram unix socket to supersede stream unix socket + epoll.
- Change the secondary add/del mechanism as now we use connection-less channel.
- Add mp_mutex_action to sync action register/unregister/reference.
- Limit max length of action name to 64B.
- New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
- Formalize the errno handle.
- Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and how-to;
Patch 2: secondary process add/del;
Patch 3: add a synchronous way for requests that need an immediate response.
Patch 4: Rework vfio to use this generic communication channel.

Jianfeng Tan (4):
eal: add channel for multi-process communication
eal: add and del secondary processes in the primary
eal: add synchronous multi-process communication
vfio: use the generic multi-process channel

lib/librte_eal/common/eal_common_proc.c | 594 +++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 17 +
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 138 ++++++
lib/librte_eal/linuxapp/eal/eal.c | 22 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 133 ++----
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 409 +++--------------
lib/librte_eal/rte_eal_version.map | 11 +
9 files changed, 872 insertions(+), 477 deletions(-)

--
2.7.4

Jianfeng Tan

2018-01-11 04:07:31 UTC

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

1. rte_eal_mp_action_register() is used to register an action,
indexed by a string, when a component at receiver side would like
to respond to the messages from the peer process.
2. rte_eal_mp_action_unregister() is used to unregister the action
if the calling component does not want to respond to the messages.
3. rte_eal_mp_sendmsg() is used to send a message, and returns
immediately. If there are 1:n primary:secondary processes, the
primary process will send n messages.

Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/common/eal_common_proc.c | 388 ++++++++++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 17 ++
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 69 ++++++
lib/librte_eal/linuxapp/eal/eal.c | 8 +
lib/librte_eal/rte_eal_version.map | 9 +
6 files changed, 501 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..d700e9e 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -5,11 +5,55 @@
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <limits.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include <rte_log.h>
#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_common.h>

+#include "eal_private.h"
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"

+#define MAX_SECONDARY_PROCS 8
+#define MAX_ACTION_NAME_LEN 64
+#define MAX_UNIX_PATH_LEN 104
+#define MAX_MSG_LENGTH 1024
+#define SCM_MAX_FD 253 /* The max amount of fds */
+
+static int mp_fd = -1;
+static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+ TAILQ_ENTRY(action_entry) next; /**< Next attached action entry */
+
+#define MAX_ACTION_NAME_LEN 64
+ char action_name[MAX_ACTION_NAME_LEN];
+ rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+ TAILQ_HEAD_INITIALIZER(action_entry_list);
+
+struct mp_msghdr {
+ char action_name[MAX_ACTION_NAME_LEN];
+ int fds_num;
+ int len_params;
+ char params[0];
+} __rte_packed;
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)

return !!ret;
}
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+ int len = strlen(name);
+ struct action_entry *entry;
+
+ TAILQ_FOREACH(entry, &action_entry_list, next) {
+ if (strncmp(entry->action_name, name, len) == 0)
+ break;
+ }
+
+ return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+ struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+ if (entry == NULL) {
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+
+ if (strlen(action_name) > MAX_ACTION_NAME_LEN) {
+ rte_errno = -E2BIG;
+ return -1;
+ }
+
+ pthread_mutex_lock(&mp_mutex_action);
+ if (find_action_entry_by_name(action_name) != NULL) {
+ free(entry);
+ rte_errno = -EEXIST;
+ return -1;
+ }
+ strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+ entry->action = action;
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+ struct action_entry *entry;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(name);
+ TAILQ_REMOVE(&action_entry_list, entry, next);
+ free(entry);
+ pthread_mutex_unlock(&mp_mutex_action);
+}
+
+static int
+read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+{
+ int ret;
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fds_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ ret = recvmsg(fd, &msgh, 0);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+ return -1;
+ }
+
+ if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+ RTE_LOG(ERR, EAL, "truncted msg\n");
+ return -1;
+ }
+
+ /* read auxiliary FDs if any */
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(fds, CMSG_DATA(cmsg), fdsize);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int
+process_msg(struct mp_msghdr *hdr, int len, int fds[])
+{
+ int ret;
+ int params_len;
+ struct action_entry *entry;
+
+ RTE_LOG(DEBUG, EAL, "msg: %s\n", hdr->action_name);
+
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(hdr->action_name);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
+ hdr->action_name);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return -1;
+ }
+
+ params_len = len - sizeof(struct mp_msghdr);
+ ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return ret;
+
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+ int len;
+ int fds[SCM_MAX_FD];
+ char buf[MAX_MSG_LENGTH];
+
+ while (1) {
+ len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
+ if (len > 0)
+ process_msg((struct mp_msghdr *)buf, len, fds);
+ }
+
+ return NULL;
+}
+
+static inline const char *
+get_unix_path(int is_server)
+{
+ static char unix_path[MAX_UNIX_PATH_LEN];
+ const char *prefix = eal_mp_unix_path();
+ const char *suffix = (is_server) ? "" : "_c";
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s", prefix, suffix);
+ else
+ snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s_%d",
+ prefix, suffix, getpid());
+ return unix_path;
+}
+
+static int
+open_unix_fd(int is_server)
+{
+ int fd;
+ struct sockaddr_un un;
+
+ fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ snprintf(un.sun_path, MAX_UNIX_PATH_LEN, "%s",
+ get_unix_path(is_server));
+ unlink(un.sun_path); /* May still exist since last run */
+ if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+ RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+ un.sun_path, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ RTE_LOG(INFO, EAL, "bind to %s\n", un.sun_path);
+ return fd;
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+ pthread_t tid;
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+ mp_fd = open_unix_fd(1);
+ if (mp_fd < 0)
+ return -1;
+
+ if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
+ strerror(errno));
+ goto error;
+ }
+
+ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+ if (rte_thread_setname(tid, thread_name) < 0) {
+ RTE_LOG(ERR, EAL, "failed to set thead name\n");
+ goto error;
+ }
+
+ return 0;
+error:
+ close(mp_fd);
+ mp_fd = -1;
+ return -1;
+}
+
+static inline struct mp_msghdr *
+format_msg(const char *act_name, const void *p, int len_params, int fds_num)
+{
+ int len_msg;
+ struct mp_msghdr *msg;
+
+ len_msg = sizeof(struct mp_msghdr) + len_params;
+ if (len_msg > MAX_MSG_LENGTH) {
+ RTE_LOG(ERR, EAL, "Message is too long\n");
+ rte_errno = -EINVAL;
+ return NULL;
+ }
+
+ msg = malloc(len_msg);
+ if (!msg) {
+ RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+ rte_errno = -ENOMEM;
+ return NULL;
+ }
+ memset(msg, 0, len_msg);
+ strcpy(msg->action_name, act_name);
+ msg->fds_num = fds_num;
+ msg->len_params = len_params;
+ memcpy(msg->params, p, len_params);
+ return msg;
+}
+
+static int
+send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
+{
+ int ret;
+ struct msghdr msgh;
+ struct iovec iov;
+ size_t fd_size = msg->fds_num * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+ struct cmsghdr *cmsg;
+ struct sockaddr_un dst;
+
+ memset(&dst, 0, sizeof(dst));
+ dst.sun_family = AF_UNIX;
+ snprintf(dst.sun_path, MAX_UNIX_PATH_LEN, "%s", dst_path);
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = (uint8_t *)msg;
+ iov.iov_len = sizeof(struct mp_msghdr) + msg->len_params;
+
+ msgh.msg_name = &dst;
+ msgh.msg_namelen = sizeof(dst);
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fd_size);
+
+ do {
+ ret = sendmsg(fd, &msgh, 0);
+ } while (ret < 0 && errno == EINTR);
+
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
+ dst_path);
+ else if (!rte_eal_primary_proc_alive(NULL))
+ RTE_LOG(ERR, EAL, "primary process exited\n");
+
+ return 0;
+ }
+
+ return 1;
+}
+
+static int
+mp_send(const char *action_name,
+ const void *params,
+ int len_params,
+ int fds[],
+ int fds_num)
+{
+ int i;
+ int n = 0;
+ int sockfd;
+ struct mp_msghdr *msg;
+
+ if (fds_num > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ rte_errno = -E2BIG;
+ return 0;
+ }
+
+ msg = format_msg(action_name, params, len_params, fds_num);
+ if (msg == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(msg);
+ return 0;
+ }
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* broadcast to all secondaries */
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (mp_sec_sockets[i] == NULL)
+ continue;
+
+ n += send_msg(sockfd, mp_sec_sockets[i], msg, fds);
+ }
+ } else
+ n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
+
+ free(msg);
+ close(sockfd);
+ return n;
+}
+
+int
+rte_eal_mp_sendmsg(const char *action_name,
+ const void *params,
+ int len_params,
+ int fds[],
+ int fds_num)
+{
+ RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
+ return mp_send(action_name, params, len_params, fds, fds_num);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..e95399b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
return buffer;
}

+/** Path of primary/secondary communication unix socket file. */
+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)
+{
+ static char buffer[PATH_MAX]; /* static so auto-zeroed */
+ const char *directory = default_config_dir;
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
+ directory, internal_config.hugefile_prefix);
+
+ return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..e36e3b5 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
*/
struct rte_bus *rte_bus_find_by_device_name(const char *str);

+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ * 0 on success;
+ * (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 02fa109..9884c0b 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -186,6 +186,75 @@ int rte_eal_init(int argc, char **argv);
int rte_eal_primary_proc_alive(const char *config_file_path);

/**
+ * Action function typedef used by other components.
+ *
+ * As we create unix socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const void *params, int len,
+ int fds[], int fds_num);
+
+/**
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param action_name
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ * The action argument is the function pointer to the action function.
+ *
+ * @return
+ * - 0 on success.
+ * - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
+
+/**
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param action_name
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by action_name of the process on the other side.
+ *
+ * @param action_name
+ * The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ * The params argument contains the customized message.
+ *
+ * @param len_params
+ * The len_params argument is the length of the customized message.
+ *
+ * @param fds
+ * The fds argument is an array of fds sent with sendmsg.
+ *
+ * @param fds_num
+ * The fds_num argument is number of fds to be sent with sendmsg.
+ *
+ * @return
+ * - Returns the number of messages being sent successfully.
+ */
+int
+rte_eal_mp_sendmsg(const char *action_name, const void *params,
+ int len_params, int fds[], int fds_num);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..f231724 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -896,6 +896,14 @@ rte_eal_init(int argc, char **argv)

eal_check_mem_on_local_socket();

+ if (rte_eal_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
eal_thread_init_master(rte_config.master_lcore);

ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f4f46c1..5dacde5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -235,4 +235,13 @@ EXPERIMENTAL {
rte_service_set_stats_enable;
rte_service_start_with_defaults;

+} DPDK_17.08;
+
+DPDK_18.02 {
+ global:
+
+ rte_eal_mp_action_register;
+ rte_eal_mp_action_unregister;
+ rte_eal_mp_sendmsg;
+
} DPDK_17.11;

--
2.7.4

Burakov, Anatoly

2018-01-13 12:57:29 UTC

Post by Jianfeng Tan
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..d700e9e 100644

<snip>

Post by Jianfeng Tan
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)
return !!ret;
}
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+ int len = strlen(name);

Why do strlen() here? You already have MAX_ACTION_NAME_LEN, strncmp will
take care of the rest, no?

Post by Jianfeng Tan
+ struct action_entry *entry;
+
+ TAILQ_FOREACH(entry, &action_entry_list, next) {
+ if (strncmp(entry->action_name, name, len) == 0)
+ break;
+ }
+
+ return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+ struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+ if (entry == NULL) {
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+
+ if (strlen(action_name) > MAX_ACTION_NAME_LEN) {
+ rte_errno = -E2BIG;
+ return -1;
+ }

strnlen perhaps? strnlen(action_name, MAX_ACTION_NAME_LEN) ==
MAX_ACTION_NAME_LEN will be an error condition, and unlike strlen you
won't have to scan the entire memory if your string was corrupted.

Post by Jianfeng Tan
+
+ pthread_mutex_lock(&mp_mutex_action);
+ if (find_action_entry_by_name(action_name) != NULL) {
+ free(entry);
+ rte_errno = -EEXIST;
+ return -1;
+ }
+ strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+ entry->action = action;
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{

name == NULL? You do a find_action_entry_by_name with it, which calls
strlen, which IIRC segfaults on NULL pointer. Also, maybe add an strlen
(or better yet, strnlen) check here like in action_register, so that
find_action_entry_by_name doesn't need to care about string lengths and
can work off MAX_ACTION_NAME_LEN instead.

Post by Jianfeng Tan
+ struct action_entry *entry;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(name);

entry == NULL?

Post by Jianfeng Tan
+ TAILQ_REMOVE(&action_entry_list, entry, next);
+ free(entry);
+ pthread_mutex_unlock(&mp_mutex_action);
+}
+
+static int
+read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+{
+ int ret;
+ struct iovec iov;
+ struct msghdr msgh;

<snip>

Post by Jianfeng Tan
+ return -1;
+ }
+
+ params_len = len - sizeof(struct mp_msghdr);
+ ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return ret;
+

unnecessary newline.

Post by Jianfeng Tan
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+ int len;
+ int fds[SCM_MAX_FD];
+ char buf[MAX_MSG_LENGTH];
+
+ while (1) {

<snip>

Post by Jianfeng Tan
+ goto error;
+ }
+
+ return 0;
+ close(mp_fd);
+ mp_fd = -1;
+ return -1;
+}
+
+static inline struct mp_msghdr *
+format_msg(const char *act_name, const void *p, int len_params, int fds_num)

The name is slightly misleading, as this function actually *creates* a
message, not just formats it. create_msg? alloc_msg?

Post by Jianfeng Tan
+{
+ int len_msg;
+ struct mp_msghdr *msg;
+
+ len_msg = sizeof(struct mp_msghdr) + len_params;
+ if (len_msg > MAX_MSG_LENGTH) {
+ RTE_LOG(ERR, EAL, "Message is too long\n");
+ rte_errno = -EINVAL;
+ return NULL;
+ }
+
+ msg = malloc(len_msg);
+ if (!msg) {
+ RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+ rte_errno = -ENOMEM;
+ return NULL;
+ }
+ memset(msg, 0, len_msg);
+ strcpy(msg->action_name, act_name);

strncpy?

Post by Jianfeng Tan
+ msg->fds_num = fds_num;
+ msg->len_params = len_params;
+ memcpy(msg->params, p, len_params);
+ return msg;
+}
+
+static int
+send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
+{
+ int ret;
+ struct msghdr msgh;
+ struct iovec iov;
+ size_t fd_size = msg->fds_num * sizeof(int);

<snip>

Post by Jianfeng Tan
+ return mp_send(action_name, params, len_params, fds, fds_num);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..e95399b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
return buffer;
}
+/** Path of primary/secondary communication unix socket file. */
+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)

perhaps eal_mp_socket_path would've been more descriptive? API doesn't
need to care what kind of socket it is.

Post by Jianfeng Tan
+{
+ static char buffer[PATH_MAX]; /* static so auto-zeroed */
+ const char *directory = default_config_dir;
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;

<snip>

--
Thanks,
Anatoly

Ananyev, Konstantin

2018-01-15 19:52:07 UTC

Hi Jianfeng,

Post by Jianfeng Tan
Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.
It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to response immediately.
This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.
1. rte_eal_mp_action_register() is used to register an action,
indexed by a string, when a component at the receiver side would like
to respond to messages from the peer process.
2. rte_eal_mp_action_unregister() is used to unregister the action
if the calling component does not want to response the messages.
3. rte_eal_mp_sendmsg() is used to send a message, and returns
immediately. If there are 1:n primary:secondary processes, the
primary process will send n messages.
---
lib/librte_eal/common/eal_common_proc.c | 388 ++++++++++++++++++++++++++++++++
lib/librte_eal/common/eal_filesystem.h | 17 ++
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 69 ++++++
lib/librte_eal/linuxapp/eal/eal.c | 8 +
lib/librte_eal/rte_eal_version.map | 9 +
6 files changed, 501 insertions(+)
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..d700e9e 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -5,11 +5,55 @@
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <limits.h>
+#include <unistd.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <pthread.h>
+
+#include <rte_log.h>
#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include "eal_private.h"
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
+#define MAX_SECONDARY_PROCS 8
+#define MAX_ACTION_NAME_LEN 64
+#define MAX_UNIX_PATH_LEN 104

Why do you need this?
Why not just PATH_MAX?

Post by Jianfeng Tan
+#define MAX_MSG_LENGTH 1024
+#define SCM_MAX_FD 253 /* The max amount of fds */
+
+static int mp_fd = -1;
+static char *mp_sec_sockets[MAX_SECONDARY_PROCS];

Who will initialize it, and why can there be only 8?

Post by Jianfeng Tan
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+ TAILQ_ENTRY(action_entry) next; /**< Next attached action entry */
+
+#define MAX_ACTION_NAME_LEN 64
+ char action_name[MAX_ACTION_NAME_LEN];
+ rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+ TAILQ_HEAD_INITIALIZER(action_entry_list);
+
+struct mp_msghdr {
+ char action_name[MAX_ACTION_NAME_LEN];
+ int fds_num;
+ int len_params;
+ char params[0];
+} __rte_packed;
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -31,3 +75,347 @@ rte_eal_primary_proc_alive(const char *config_file_path)
return !!ret;
}
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+ int len = strlen(name);
+ struct action_entry *entry;
+
+ TAILQ_FOREACH(entry, &action_entry_list, next) {
+ if (strncmp(entry->action_name, name, len) == 0)

I think it has to be just strcmp() here.

Post by Jianfeng Tan
+ break;
+ }
+
+ return entry;
+}
+
+int
+rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action)
+{
+ struct action_entry *entry = malloc(sizeof(struct action_entry));
+
+ if (entry == NULL) {
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+
+ if (strlen(action_name) > MAX_ACTION_NAME_LEN) {

No space left for '\0'.
Either use >= MAX_ACTION_NAME_LEN, or make entry.name[MAX_ACTION_NAME_LEN + 1];
Even better, just:
- allocate new action_entry.
if (snprintf(action->name, sizeof(action->name), "%s", action_name) >= sizeof(action->name)) {
free(action);
return -E2BIG;
}

Post by Jianfeng Tan
+ rte_errno = -E2BIG;
+ return -1;
+ }
+
+ pthread_mutex_lock(&mp_mutex_action);
+ if (find_action_entry_by_name(action_name) != NULL) {
+ free(entry);

Forgot to do mutex_unlock().

Post by Jianfeng Tan
+ rte_errno = -EEXIST;
+ return -1;
+ }
+ strncpy(entry->action_name, action_name, MAX_ACTION_NAME_LEN);
+ entry->action = action;
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+ struct action_entry *entry;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(name);
+ TAILQ_REMOVE(&action_entry_list, entry, next);
+ free(entry);

Better to do free() after releasing the mutex.

Post by Jianfeng Tan
+ pthread_mutex_unlock(&mp_mutex_action);
+}
+
+static int
+read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+{
+ int ret;
+ struct iovec iov;
+ struct msghdr msgh;
+ size_t fdsize = fds_num * sizeof(int);
+ char control[CMSG_SPACE(fdsize)];
+ struct cmsghdr *cmsg;
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = buf;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ ret = recvmsg(fd, &msgh, 0);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+ return -1;
+ }
+
+ if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
+ RTE_LOG(ERR, EAL, "truncted msg\n");
+ return -1;
+ }
+
+ /* read auxiliary FDs if any */
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(fds, CMSG_DATA(cmsg), fdsize);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int
+process_msg(struct mp_msghdr *hdr, int len, int fds[])
+{
+ int ret;
+ int params_len;
+ struct action_entry *entry;
+
+ RTE_LOG(DEBUG, EAL, "msg: %s\n", hdr->action_name);
+
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(hdr->action_name);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "cannot find action by: %s\n",
+ hdr->action_name);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return -1;

If no action is specified for that message - who will free it?
If action() exists, is it the responsibility of action() to free the msg?

Post by Jianfeng Tan
+ }
+
+ params_len = len - sizeof(struct mp_msghdr);
+ ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);

Do you really need to do action() with lock held?

Post by Jianfeng Tan
+ pthread_mutex_unlock(&mp_mutex_action);
+ return ret;
+
+}
+
+static void *

Why not just 'void' here?

Post by Jianfeng Tan
+mp_handle(void *arg __rte_unused)
+{
+ int len;
+ int fds[SCM_MAX_FD];
+ char buf[MAX_MSG_LENGTH];
+
+ while (1) {
+ len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
+ if (len > 0)
+ process_msg((struct mp_msghdr *)buf, len, fds);
+ }
+
+ return NULL;
+}
+
+static inline const char *
+get_unix_path(int is_server)
+{
+ static char unix_path[MAX_UNIX_PATH_LEN];

PATH_MAX?

Why not just make that function accept char path[PATH_MAX] as a parameter?

Post by Jianfeng Tan
+ const char *prefix = eal_mp_unix_path();
+ const char *suffix = (is_server) ? "" : "_c";
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s", prefix, suffix);
+ else
+ snprintf(unix_path, MAX_UNIX_PATH_LEN, "%s%s_%d",
+ prefix, suffix, getpid());
+ return unix_path;
+}
+
+static int
+open_unix_fd(int is_server)
+{
+ int fd;
+ struct sockaddr_un un;
+
+ fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ snprintf(un.sun_path, MAX_UNIX_PATH_LEN, "%s",
+ get_unix_path(is_server));
+ unlink(un.sun_path); /* May still exist since last run */
+ if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+ RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+ un.sun_path, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ RTE_LOG(INFO, EAL, "bind to %s\n", un.sun_path);
+ return fd;
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+ pthread_t tid;
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+ mp_fd = open_unix_fd(1);
+ if (mp_fd < 0)
+ return -1;
+
+ if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
+ strerror(errno));
+ goto error;
+ }
+
+ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+ if (rte_thread_setname(tid, thread_name) < 0) {
+ RTE_LOG(ERR, EAL, "failed to set thead name\n");

Forgot to terminate thread?

Post by Jianfeng Tan
+ goto error;

As a nit - can we reorder code a bit to avoid 'goto's?

Post by Jianfeng Tan
+ }
+
+ return 0;
+ close(mp_fd);
+ mp_fd = -1;
+ return -1;
+}
+
+static inline struct mp_msghdr *
+format_msg(const char *act_name, const void *p, int len_params, int fds_num)
+{
+ int len_msg;
+ struct mp_msghdr *msg;
+
+ len_msg = sizeof(struct mp_msghdr) + len_params;
+ if (len_msg > MAX_MSG_LENGTH) {
+ RTE_LOG(ERR, EAL, "Message is too long\n");
+ rte_errno = -EINVAL;
+ return NULL;
+ }
+
+ msg = malloc(len_msg);
+ if (!msg) {
+ RTE_LOG(ERR, EAL, "Cannot alloc memory for msg\n");
+ rte_errno = -ENOMEM;
+ return NULL;
+ }
+ memset(msg, 0, len_msg);
+ strcpy(msg->action_name, act_name);
+ msg->fds_num = fds_num;
+ msg->len_params = len_params;
+ memcpy(msg->params, p, len_params);
+ return msg;
+}
+
+static int
+send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
+{
+ int ret;
+ struct msghdr msgh;
+ struct iovec iov;
+ size_t fd_size = msg->fds_num * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+ struct cmsghdr *cmsg;
+ struct sockaddr_un dst;
+
+ memset(&dst, 0, sizeof(dst));
+ dst.sun_family = AF_UNIX;
+ snprintf(dst.sun_path, MAX_UNIX_PATH_LEN, "%s", dst_path);
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = (uint8_t *)msg;
+ iov.iov_len = sizeof(struct mp_msghdr) + msg->len_params;
+
+ msgh.msg_name = &dst;
+ msgh.msg_namelen = sizeof(dst);
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), fds, fd_size);
+
+ do {
+ ret = sendmsg(fd, &msgh, 0);
+ } while (ret < 0 && errno == EINTR);
+
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
+ dst_path);
+ else if (!rte_eal_primary_proc_alive(NULL))
+ RTE_LOG(ERR, EAL, "primary process exited\n");

So secondary-to-secondary messages are not allowed?

Post by Jianfeng Tan
+
+ return 0;
+ }
+
+ return 1;
+}
+
+static int
+mp_send(const char *action_name,
+ const void *params,
+ int len_params,
+ int fds[],
+ int fds_num)
+{
+ int i;
+ int n = 0;
+ int sockfd;
+ struct mp_msghdr *msg;
+
+ if (fds_num > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ rte_errno = -E2BIG;
+ return 0;
+ }
+
+ msg = format_msg(action_name, params, len_params, fds_num);
+ if (msg == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(msg);
+ return 0;
+ }
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* broadcast to all secondaries */
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (mp_sec_sockets[i] == NULL)
+ continue;
+
+ n += send_msg(sockfd, mp_sec_sockets[i], msg, fds);
+ }
+ } else
+ n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
+
+ free(msg);
+ close(sockfd);
+ return n;
+}
+
+int
+rte_eal_mp_sendmsg(const char *action_name,
+ const void *params,
+ int len_params,
+ int fds[],
+ int fds_num)
+{
+ RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
+ return mp_send(action_name, params, len_params, fds, fds_num);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..e95399b 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
return buffer;
}
+/** Path of primary/secondary communication unix socket file. */
+#define MP_UNIX_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_unix_path(void)
+{
+ static char buffer[PATH_MAX]; /* static so auto-zeroed */
+ const char *directory = default_config_dir;
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, sizeof(buffer) - 1, MP_UNIX_PATH_FMT,
+ directory, internal_config.hugefile_prefix);
+
+ return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"
diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..e36e3b5 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
*/
struct rte_bus *rte_bus_find_by_device_name(const char *str);
+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * 0 on success;
+ * (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 02fa109..9884c0b 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -186,6 +186,75 @@ int rte_eal_init(int argc, char **argv);
int rte_eal_primary_proc_alive(const char *config_file_path);
/**
+ * Action function typedef used by other components.
+ *
+ * As we create unix socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const void *params, int len,
+ int fds[], int fds_num);
+
+/**
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ * The action argument is the function pointer to the action function.
+ *
+ * - 0 on success.
+ * - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *action_name, rte_eal_mp_t action);
+
+/**
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * The action_name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by action_name of the process on the other side.
+ *
+ * The action_name argument is used to identify which action will be used.
+ *
+ * The params argument contains the customized message.
+ *
+ * The len_params argument is the length of the customized message.
+ *
+ * The fds argument is an array of fds sent with sendmsg.
+ *
+ * The fds_num argument is number of fds to be sent with sendmsg.
+ *
+ * - Returns the number of messages being sent successfully.
+ */
+int
+rte_eal_mp_sendmsg(const char *action_name, const void *params,
+ int len_params, int fds[], int fds_num);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..f231724 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -896,6 +896,14 @@ rte_eal_init(int argc, char **argv)
eal_check_mem_on_local_socket();
+ if (rte_eal_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
eal_thread_init_master(rte_config.master_lcore);
ret = eal_thread_dump_affinity(cpuset, RTE_CPU_AFFINITY_STR_LEN);
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index f4f46c1..5dacde5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -235,4 +235,13 @@ EXPERIMENTAL {
rte_service_set_stats_enable;
rte_service_start_with_defaults;
+} DPDK_17.08;
+
+DPDK_18.02 {
+
+ rte_eal_mp_action_register;
+ rte_eal_mp_action_unregister;
+ rte_eal_mp_sendmsg;
+
} DPDK_17.11;
--
2.7.4

Jianfeng Tan

2018-01-11 04:07:32 UTC

Using the multi-process channel, we add an mp action named "proc".

When a secondary process starts, it sends a "proc add" message to
the primary.

When the primary fails to send a message to a specific secondary
process, that secondary process is treated as exited, and we remove
it from the secondary array by sending a "proc del" message to the
primary itself.

Test:
1. Start the primary and the secondary process
$ (testpmd) -c 0x3 -n 4 -- -i
$ (helloworld) -c 0xc -n 4 --proc-type=auto --

2. Check the log of testpmd:
...
EAL: bind to /var/run/.rte_unix
...
EAL: add secondary: /var/run/.testpmd_unix_(xxx)
...

3. Check the log of helloworld:
...
EAL: bind to /var/run/.testpmd_unix_xxx
EAL: bind to /var/run/.testpmd_unix_c_xxx
...

Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/common/eal_common_proc.c | 88 ++++++++++++++++++++++++++++++++-
1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index d700e9e..70519cc 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -54,6 +54,13 @@ struct mp_msghdr {
char params[0];
} __rte_packed;

+struct proc_request {
+#define MP_PROC_ADD 0
+#define MP_PROC_DEL 1
+ int type;
+ char path[MAX_UNIX_PATH_LEN];
+};
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -214,6 +221,58 @@ mp_handle(void *arg __rte_unused)
return NULL;
}

+static int
+add_sec_proc(const char *path)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (mp_sec_sockets[i] == NULL)
+ break;
+ if (i < MAX_SECONDARY_PROCS)
+ mp_sec_sockets[i] = strdup(path);
+
+ return i < MAX_SECONDARY_PROCS;
+}
+
+static int
+del_sec_proc(const char *path)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (!strcmp(mp_sec_sockets[i], path)) {
+ free(mp_sec_sockets[i]);
+ mp_sec_sockets[i] = NULL;
+ break;
+ }
+ }
+
+ return i < MAX_SECONDARY_PROCS;
+}
+
+static int
+mp_primary_proc(const void *params,
+ int len __rte_unused,
+ int fds[] __rte_unused,
+ int fds_num __rte_unused)
+{
+ const struct proc_request *r = (const struct proc_request *)params;
+
+ switch (r->type) {
+ case MP_PROC_ADD:
+ RTE_LOG(INFO, EAL, "add secondary: %s\n", r->path);
+ return add_sec_proc(r->path);
+ case MP_PROC_DEL:
+ RTE_LOG(INFO, EAL, "del secondary: %s\n", r->path);
+ return del_sec_proc(r->path);
+ default:
+ RTE_LOG(ERR, EAL, "invalid type: %d\n", r->type);
+ }
+
+ return -1;
+}
+
static inline const char *
get_unix_path(int is_server)
{
@@ -267,6 +326,22 @@ rte_eal_mp_channel_init(void)
if (mp_fd < 0)
return -1;

+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ if (rte_eal_mp_action_register("proc", mp_primary_proc) < 0) {
+ RTE_LOG(ERR, EAL, "failed to register handler\n");
+ goto error;
+ }
+ } else {
+ struct proc_request r;
+
+ r.type = MP_PROC_ADD;
+ snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", get_unix_path(1));
+ if (rte_eal_mp_sendmsg("proc", &r, sizeof(r), NULL, 0) < 0) {
+ RTE_LOG(ERR, EAL, "failed to add into primary\n");
+ goto error;
+ }
+ }
+
if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
strerror(errno));
@@ -354,10 +429,19 @@ send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
if (ret < 0) {
RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));

- if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ struct proc_request r;
+
RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
dst_path);
- else if (!rte_eal_primary_proc_alive(NULL))
+ r.type = MP_PROC_DEL;
+ snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", dst_path);
+ if (rte_eal_mp_sendmsg("proc", &r,
+ sizeof(r), NULL, 0) < 0)
+ RTE_LOG(ERR, EAL,
+ "failed to del secondary %s\n",
+ dst_path);
+ } else if (!rte_eal_primary_proc_alive(NULL))
RTE_LOG(ERR, EAL, "primary process exited\n");

return 0;

--
2.7.4

Burakov, Anatoly

2018-01-13 13:11:59 UTC

Post by Jianfeng Tan
By the multi-process channel, we add an mp action named "proc".
As a secondary process starts, it sends a "proc add" message to
the primary.
As the primary finds a failure in sending message to a specific
secondary process, that secondary process is treated as exited;
and we remove it from the secondary array by sending a "proc del"
message to the primary itself.
1. Start the primary and the secondary process
$ (testpmd) -c 0x3 -n 4 -- -i
$ (helloworld) -c 0xc -n 4 --proc-type=auto --
...
EAL: bind to /var/run/.rte_unix
...
EAL: add secondary: /var/run/.testpmd_unix_(xxx)
...
...
EAL: bind to /var/run/.testpmd_unix_xxx
EAL: bind to /var/run/.testpmd_unix_c_xxx
...

it says "unix" all over the place, but that's an internal implementation
detail. "mp_socket" or similar should do, no?

Post by Jianfeng Tan
---
lib/librte_eal/common/eal_common_proc.c | 88 ++++++++++++++++++++++++++++++++-
1 file changed, 86 insertions(+), 2 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index d700e9e..70519cc 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -54,6 +54,13 @@ struct mp_msghdr {
char params[0];
} __rte_packed;
+struct proc_request {
+#define MP_PROC_ADD 0
+#define MP_PROC_DEL 1
+ int type;
+ char path[MAX_UNIX_PATH_LEN];
+};
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -214,6 +221,58 @@ mp_handle(void *arg __rte_unused)
return NULL;
}
+static int
+add_sec_proc(const char *path)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (mp_sec_sockets[i] == NULL)
+ break;
+ if (i < MAX_SECONDARY_PROCS)
+ mp_sec_sockets[i] = strdup(path);
+
+ return i < MAX_SECONDARY_PROCS;
+}

While it's equivalent, the intent behind this isn't clear; it
needlessly complicates the more common idiom of
for (i = 0; i < MAX; i++) {}
if (i == MAX)
return error;
do_something;
return success;

Post by Jianfeng Tan
+
+static int
+del_sec_proc(const char *path)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (!strcmp(mp_sec_sockets[i], path)) {
+ free(mp_sec_sockets[i]);
+ mp_sec_sockets[i] = NULL;
+ break;
+ }
+ }
+
+ return i < MAX_SECONDARY_PROCS;
+}

Same as above - maybe rewrite it as a more commonly used idiom. Also,
you probably want to use strncmp(), and check for NULL pointers - IIRC
passing NULL to strcmp()/strncmp() is undefined behavior.

Post by Jianfeng Tan
+
+static int
+mp_primary_proc(const void *params,
+ int len __rte_unused,
+ int fds[] __rte_unused,
+ int fds_num __rte_unused)
+{
+ const struct proc_request *r = (const struct proc_request *)params;
+
+ switch (r->type) {
+ RTE_LOG(INFO, EAL, "add secondary: %s\n", r->path);
+ return add_sec_proc(r->path);
+ RTE_LOG(INFO, EAL, "del secondary: %s\n", r->path);
+ return del_sec_proc(r->path);
+ RTE_LOG(ERR, EAL, "invalid type: %d\n", r->type);
+ }
+
+ return -1;
+}
+
static inline const char *
get_unix_path(int is_server)
{
@@ -267,6 +326,22 @@ rte_eal_mp_channel_init(void)
if (mp_fd < 0)
return -1;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ if (rte_eal_mp_action_register("proc", mp_primary_proc) < 0) {
+ RTE_LOG(ERR, EAL, "failed to register handler\n");
+ goto error;
+ }
+ } else {
+ struct proc_request r;
+
+ r.type = MP_PROC_ADD;
+ snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", get_unix_path(1));

Nitpicking, but maybe just send PID instead of the whole path?
Primary/secondary share their prefix and most of their socket path
anyway, so the real difference is the PID. This would also eliminate the
need for using strings in many places.

Post by Jianfeng Tan
+ if (rte_eal_mp_sendmsg("proc", &r, sizeof(r), NULL, 0) < 0) {
+ RTE_LOG(ERR, EAL, "failed to add into primary\n");
+ goto error;
+ }
+ }
+
if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
strerror(errno));
@@ -354,10 +429,19 @@ send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
if (ret < 0) {
RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
- if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ struct proc_request r;
+
RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
dst_path);
- else if (!rte_eal_primary_proc_alive(NULL))
+ r.type = MP_PROC_DEL;
+ snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", dst_path);
+ if (rte_eal_mp_sendmsg("proc", &r,
+ sizeof(r), NULL, 0) < 0)
+ RTE_LOG(ERR, EAL,
+ "failed to del secondary %s\n",
+ dst_path);
+ } else if (!rte_eal_primary_proc_alive(NULL))
RTE_LOG(ERR, EAL, "primary process exited\n");
return 0;

--
Thanks,
Anatoly

Ananyev, Konstantin

2018-01-15 21:45:53 UTC

-----Original Message-----
From: Tan, Jianfeng
Sent: Thursday, January 11, 2018 4:08 AM
Subject: [PATCH v2 2/4] eal: add and del secondary processes in the primary
By the multi-process channel, we add an mp action named "proc".
As a secondary process starts, it sends a "proc add" message to
the primary.
As the primary finds a failure in sending message to a specific
secondary process, that secondary process is treated as exited;
and we remove it from the secondary array by sending a "proc del"
message to the primary itself.
1. Start the primary and the secondary process
$ (testpmd) -c 0x3 -n 4 -- -i
$ (helloworld) -c 0xc -n 4 --proc-type=auto --
...
EAL: bind to /var/run/.rte_unix
...
EAL: add secondary: /var/run/.testpmd_unix_(xxx)
...
...
EAL: bind to /var/run/.testpmd_unix_xxx
EAL: bind to /var/run/.testpmd_unix_c_xxx
...
---
lib/librte_eal/common/eal_common_proc.c | 88 ++++++++++++++++++++++++++++++++-
1 file changed, 86 insertions(+), 2 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index d700e9e..70519cc 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -54,6 +54,13 @@ struct mp_msghdr {
char params[0];
} __rte_packed;
+struct proc_request {
+#define MP_PROC_ADD 0
+#define MP_PROC_DEL 1
+ int type;
+ char path[MAX_UNIX_PATH_LEN];
+};
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -214,6 +221,58 @@ mp_handle(void *arg __rte_unused)
return NULL;
}
+static int
+add_sec_proc(const char *path)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (mp_sec_sockets[i] == NULL)
+ break;
+ if (i < MAX_SECONDARY_PROCS)
+ mp_sec_sockets[i] = strdup(path);
+
+ return i < MAX_SECONDARY_PROCS;
+}
+
+static int
+del_sec_proc(const char *path)
+{
+ int i;
+
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
+ if (!strcmp(mp_sec_sockets[i], path)) {
+ free(mp_sec_sockets[i]);
+ mp_sec_sockets[i] = NULL;
+ break;
+ }
+ }
+
+ return i < MAX_SECONDARY_PROCS;
+}

I am not sure we really need all these add/del messages and mp_sec_sockets[]...
For broadcast - why can't we just scan the contents of our home dir for all open client sockets
and send a message to each such socket found?
Konstantin

+
+static int
+mp_primary_proc(const void *params,
+ int len __rte_unused,
+ int fds[] __rte_unused,
+ int fds_num __rte_unused)
+{
+ const struct proc_request *r = (const struct proc_request *)params;
+
+ switch (r->type) {
+ RTE_LOG(INFO, EAL, "add secondary: %s\n", r->path);
+ return add_sec_proc(r->path);
+ RTE_LOG(INFO, EAL, "del secondary: %s\n", r->path);
+ return del_sec_proc(r->path);
+ RTE_LOG(ERR, EAL, "invalid type: %d\n", r->type);
+ }
+
+ return -1;
+}
+
static inline const char *
get_unix_path(int is_server)
{
@@ -267,6 +326,22 @@ rte_eal_mp_channel_init(void)
if (mp_fd < 0)
return -1;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ if (rte_eal_mp_action_register("proc", mp_primary_proc) < 0) {
+ RTE_LOG(ERR, EAL, "failed to register handler\n");
+ goto error;
+ }
+ } else {
+ struct proc_request r;
+
+ r.type = MP_PROC_ADD;
+ snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", get_unix_path(1));
+ if (rte_eal_mp_sendmsg("proc", &r, sizeof(r), NULL, 0) < 0) {
+ RTE_LOG(ERR, EAL, "failed to add into primary\n");
+ goto error;
+ }
+ }
+
if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
RTE_LOG(ERR, EAL, "failed to create mp handle thead: %s\n",
strerror(errno));
@@ -354,10 +429,19 @@ send_msg(int fd, const char *dst_path, struct mp_msghdr *msg, int fds[])
if (ret < 0) {
RTE_LOG(ERR, EAL, "failed to send msg: %s\n", strerror(errno));
- if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ struct proc_request r;
+
RTE_LOG(ERR, EAL, "secondary process (%s) exited\n",
dst_path);
- else if (!rte_eal_primary_proc_alive(NULL))
+ r.type = MP_PROC_DEL;
+ snprintf(r.path, MAX_UNIX_PATH_LEN, "%s", dst_path);
+ if (rte_eal_mp_sendmsg("proc", &r,
+ sizeof(r), NULL, 0) < 0)
+ RTE_LOG(ERR, EAL,
+ "failed to del secondary %s\n",
+ dst_path);
+ } else if (!rte_eal_primary_proc_alive(NULL))
RTE_LOG(ERR, EAL, "primary process exited\n");
return 0;
--
2.7.4

Jianfeng Tan

2018-01-11 04:07:33 UTC

We need a synchronous way for multi-process communication,
i.e., blocking while waiting for a reply message when we send a request
to the peer process.

We add two APIs, rte_eal_mp_request() and rte_eal_mp_reply(), for
this use case. By invoking rte_eal_mp_request(), a request message
is sent out, and the caller then waits for a reply message. The
timeout is hard-coded to 5 seconds. The reply message is copied
into the parameters of this API so that the caller can decide how
to interpret that information (including params and fds). Note that
if a primary process owns multiple secondary processes, this API
will fail.

The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply to.

We use mutex in rte_eal_mp_request() to guarantee that only one
request is on the fly for one pair of processes.

Suggested-by: Anatoly Burakov <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/common/eal_common_proc.c | 144 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 73 +++++++++++++++-
lib/librte_eal/rte_eal_version.map | 2 +
3 files changed, 206 insertions(+), 13 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 70519cc..f194a52 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -32,6 +32,7 @@
static int mp_fd = -1;
static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t mp_mutex_request = PTHREAD_MUTEX_INITIALIZER;

struct action_entry {
TAILQ_ENTRY(action_entry) next; /**< Next attached action entry */
@@ -49,6 +50,10 @@ static struct action_entry_list action_entry_list =

struct mp_msghdr {
char action_name[MAX_ACTION_NAME_LEN];
+#define MP_MSG 0 /* Share message with peers, will not block */
+#define MP_REQ 1 /* Request for information, Will block for a reply */
+#define MP_REP 2 /* Reply to previously-received request */
+ int type;
int fds_num;
int len_params;
char params[0];
@@ -138,7 +143,8 @@ rte_eal_mp_action_unregister(const char *name)
}

static int
-read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+read_msg(int fd, char *buf, int buflen,
+ int *fds, int fds_num, struct sockaddr_un *s)
{
int ret;
struct iovec iov;
@@ -151,6 +157,8 @@ read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
iov.iov_base = buf;
iov.iov_len = buflen;

+ msgh.msg_name = s;
+ msgh.msg_namelen = sizeof(*s);
msgh.msg_iov = &iov;
msgh.msg_iovlen = 1;
msgh.msg_control = control;
@@ -181,7 +189,7 @@ read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
}

static int
-process_msg(struct mp_msghdr *hdr, int len, int fds[])
+process_msg(struct mp_msghdr *hdr, int len, int fds[], struct sockaddr_un *s)
{
int ret;
int params_len;
@@ -199,10 +207,10 @@ process_msg(struct mp_msghdr *hdr, int len, int fds[])
}

params_len = len - sizeof(struct mp_msghdr);
- ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
+ ret = entry->action(hdr->params, params_len,
+ fds, hdr->fds_num, s->sun_path);
pthread_mutex_unlock(&mp_mutex_action);
return ret;
-
}

static void *
@@ -211,11 +219,12 @@ mp_handle(void *arg __rte_unused)
int len;
int fds[SCM_MAX_FD];
char buf[MAX_MSG_LENGTH];
+ struct sockaddr_un sa;

while (1) {
- len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
+ len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD, &sa);
if (len > 0)
- process_msg((struct mp_msghdr *)buf, len, fds);
+ process_msg((struct mp_msghdr *)buf, len, fds, &sa);
}

return NULL;
@@ -255,7 +264,8 @@ static int
mp_primary_proc(const void *params,
int len __rte_unused,
int fds[] __rte_unused,
- int fds_num __rte_unused)
+ int fds_num __rte_unused,
+ const void *peer __rte_unused)
{
const struct proc_request *r = (const struct proc_request *)params;

@@ -362,7 +372,8 @@ rte_eal_mp_channel_init(void)
}

static inline struct mp_msghdr *
-format_msg(const char *act_name, const void *p, int len_params, int fds_num)
+format_msg(const char *act_name, const void *p,
+ int len_params, int fds_num, int type)
{
int len_msg;
struct mp_msghdr *msg;
@@ -384,6 +395,7 @@ format_msg(const char *act_name, const void *p, int len_params, int fds_num)
strcpy(msg->action_name, act_name);
msg->fds_num = fds_num;
msg->len_params = len_params;
+ msg->type = type;
memcpy(msg->params, p, len_params);
return msg;
}
@@ -455,7 +467,9 @@ mp_send(const char *action_name,
const void *params,
int len_params,
int fds[],
- int fds_num)
+ int fds_num,
+ int type,
+ const void *peer)
{
int i;
int n = 0;
@@ -468,7 +482,7 @@ mp_send(const char *action_name,
return 0;
}

- msg = format_msg(action_name, params, len_params, fds_num);
+ msg = format_msg(action_name, params, len_params, fds_num, type);
if (msg == NULL)
return 0;

@@ -477,6 +491,11 @@ mp_send(const char *action_name,
return 0;
}

+ if (peer) {
+ n += send_msg(sockfd, peer, msg, fds);
+ goto ret;
+ }
+
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
/* broadcast to all secondaries */
for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
@@ -488,6 +507,7 @@ mp_send(const char *action_name,
} else
n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);

+ret:
free(msg);
close(sockfd);
return n;
@@ -501,5 +521,107 @@ rte_eal_mp_sendmsg(const char *action_name,
int fds_num)
{
RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
- return mp_send(action_name, params, len_params, fds, fds_num);
+ return mp_send(action_name, params, len_params,
+ fds, fds_num, MP_MSG, NULL);
+}
+
+int
+rte_eal_mp_request(const char *action_name,
+ void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ int fds_out)
+{
+ int i, j;
+ int sockfd;
+ int nprocs;
+ int ret = 0;
+ struct mp_msghdr *req;
+ struct timeval tv;
+ char buf[MAX_MSG_LENGTH];
+ struct mp_msghdr *hdr;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+ if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ rte_errno = -E2BIG;
+ return 0;
+ }
+
+ req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+ if (req == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(req);
+ return 0;
+ }
+
+ tv.tv_sec = 5; /* 5 Secs Timeout */
+ tv.tv_usec = 0;
+ if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+ (const void *)&tv, sizeof(struct timeval)) < 0)
+ RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
+
+ /* Only allow one req at a time */
+ pthread_mutex_lock(&mp_mutex_request);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ nprocs = 0;
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (!mp_sec_sockets[i]) {
+ j = i;
+ nprocs++;
+ }
+
+ if (nprocs > 1) {
+ RTE_LOG(ERR, EAL,
+ "multi secondary processes not supported\n");
+ goto free_and_ret;
+ }
+
+ ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);
+ } else
+ ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
+
+ if (ret == 0) {
+ RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
+ ret = -1;
+ goto free_and_ret;
+ }
+
+ ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);
+ if (ret > 0) {
+ hdr = (struct mp_msghdr *)buf;
+ if (hdr->len_params == len_p)
+ memcpy(params, hdr->params, len_p);
+ else {
+ RTE_LOG(ERR, EAL, "invalid reply\n");
+ ret = 0;
+ }
+ }
+
+free_and_ret:
+ free(req);
+ close(sockfd);
+ pthread_mutex_unlock(&mp_mutex_request);
+ return ret;
+}
+
+int
+rte_eal_mp_reply(const char *action_name,
+ const void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ const void *peer)
+{
+ RTE_LOG(DEBUG, EAL, "reply: %s\n", action_name);
+ if (peer == NULL) {
+ RTE_LOG(ERR, EAL, "peer is not specified\n");
+ return 0;
+ }
+ return mp_send(action_name, params, len_p, fds, fds_in, MP_REP, peer);
}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 9884c0b..2690a77 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -192,7 +192,7 @@ int rte_eal_primary_proc_alive(const char *config_file_path);
* this function typedef to register action for coming messages.
*/
typedef int (*rte_eal_mp_t)(const void *params, int len,
- int fds[], int fds_num);
+ int fds[], int fds_num, const void *peer);

/**
* Register an action function for primary/secondary communication.
@@ -245,7 +245,7 @@ void rte_eal_mp_action_unregister(const char *name);
* The fds argument is an array of fds sent with sendmsg.
*
* @param fds_num
- * The fds_num argument is number of fds to be sent with sendmsg.
+ * The number of fds to be sent with sendmsg.
*
* @return
* - Returns the number of messages being sent successfully.
@@ -255,6 +255,75 @@ rte_eal_mp_sendmsg(const char *action_name, const void *params,
int len_params, int fds[], int fds_num);

/**
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process. Note:
+ * this does not work for the primary process sending requests to its
+ * multiple (>1) secondary processes.
+ *
+ * @param action_name
+ * The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ * The params argument contains the customized message; as the reply is
+ * received, the replied params will be copied to this pointer.
+ *
+ * @param len_p
+ * The length of the customized message.
+ *
+ * @param fds
+ * The fds argument is an array of fds sent with sendmsg; as the reply
+ * is received, the replied fds will be copied into this array.
+ *
+ * @param fds_in
+ * The number of fds to be sent.
+ *
+ * @param fds_out
+ * The number of fds to be received.
+ *
+ * @return
+ * - (1) on success;
+ * - (0) on sending request successfully but no valid reply received.
+ * - (<0) on failing to sending request.
+ */
+int
+rte_eal_mp_request(const char *action_name, void *params,
+ int len_p, int fds[], int fds_in, int fds_out);
+
+/**
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param action_name
+ * The action_name argument is used to identify which action will be used.
+ *
+ * @param params
+ * The params argument contains the customized message.
+ *
+ * @param len_p
+ * The length of the customized message.
+ *
+ * @param fds
+ * The fds argument is an array of fds sent with sendmsg.
+ *
+ * @param fds_in
+ * The number of fds to be sent with sendmsg.
+ *
+ * @param peer
+ * The peer argument is the address of the peer process to send the reply to.
+ *
+ * @return
+ * - (1) on success;
+ * - (0) on failure.
+ */
+int
+rte_eal_mp_reply(const char *action_name, const void *params,
+ int len_p, int fds[], int fds_in, const void *peer);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 5dacde5..068ac0b 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -243,5 +243,7 @@ DPDK_18.02 {
rte_eal_mp_action_register;
rte_eal_mp_action_unregister;
rte_eal_mp_sendmsg;
+ rte_eal_mp_request;
+ rte_eal_mp_reply;

} DPDK_17.11;

--
2.7.4

Burakov, Anatoly

2018-01-13 13:41:50 UTC

Post by Jianfeng Tan
---
lib/librte_eal/common/eal_common_proc.c | 144 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 73 +++++++++++++++-
lib/librte_eal/rte_eal_version.map | 2 +
3 files changed, 206 insertions(+), 13 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 70519cc..f194a52 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -32,6 +32,7 @@
static int mp_fd = -1;
static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t mp_mutex_request = PTHREAD_MUTEX_INITIALIZER;
struct action_entry {
TAILQ_ENTRY(action_entry) next; /**< Next attached action entry */
@@ -49,6 +50,10 @@ static struct action_entry_list action_entry_list =
struct mp_msghdr {
char action_name[MAX_ACTION_NAME_LEN];
+#define MP_MSG 0 /* Share message with peers, will not block */
+#define MP_REQ 1 /* Request for information, Will block for a reply */
+#define MP_REP 2 /* Reply to previously-received request */

nitpicking, but... response instead of reply?

Post by Jianfeng Tan
+ int type;
int fds_num;
int len_params;
char params[0];
@@ -138,7 +143,8 @@ rte_eal_mp_action_unregister(const char *name)
}
static int
-read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+read_msg(int fd, char *buf, int buflen,
+ int *fds, int fds_num, struct sockaddr_un *s)

<snip>

Post by Jianfeng Tan
+ return mp_send(action_name, params, len_params,
+ fds, fds_num, MP_MSG, NULL);
+}
+
+int
+rte_eal_mp_request(const char *action_name,
+ void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ int fds_out)

name == NULL? name too long?

Post by Jianfeng Tan
+{
+ int i, j;
+ int sockfd;
+ int nprocs;
+ int ret = 0;
+ struct mp_msghdr *req;
+ struct timeval tv;
+ char buf[MAX_MSG_LENGTH];
+ struct mp_msghdr *hdr;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+ if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ rte_errno = -E2BIG;

(this also applies to previous patches) you set rte_errno to -EINVAL in
format_msg when message with parameters is too big - should that be
setting -E2BIG as well? Also, maybe not set rte_errno in multiple
places, and put all parameter checking (or at least errno setting) in
rte_eal_mp_* functions?

Post by Jianfeng Tan
+ return 0;
+ }
+
+ req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+ if (req == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(req);
+ return 0;
+ }
+
+ tv.tv_sec = 5; /* 5 Secs Timeout */
+ tv.tv_usec = 0;
+ if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+ (const void *)&tv, sizeof(struct timeval)) < 0)
+ RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
+
+ /* Only allow one req at a time */
+ pthread_mutex_lock(&mp_mutex_request);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ nprocs = 0;
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)

What follows is a bit confusing; some comments explaining what happens
and maybe more informative variable names would've been helpful.

Post by Jianfeng Tan
+ if (!mp_sec_sockets[i]) {
+ j = i;
+ nprocs++;
+ }
+
+ if (nprocs > 1) {
+ RTE_LOG(ERR, EAL,
+ "multi secondary processes not supported\n");
+ goto free_and_ret;
+ }
+

<snip>

--
Thanks,
Anatoly

Ananyev, Konstantin

2018-01-16 00:00:43 UTC

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The
timeout is hard-coded 5 Sec. And the replied message will be copied
in the parameters of this API so that the caller can decide how
to translate those information (including params and fds). Note
if a primary process owns multiple secondary processes, this API
will fail.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
We use mutex in rte_eal_mp_request() to guarantee that only one
request is on the fly for one pair of processes.

You don't need to do things in such a strange and restrictive way.
Instead you can do something like this:
1) Introduce a new struct, a list for it, and a mutex
struct sync_request {
int reply_received;
char dst[PATH_MAX];
char reply[...];
LIST_ENTRY(sync_request) next;
};

static struct {
LIST_HEAD(list, sync_request) list;
pthread_mutex_t lock;
pthread_cond_t cond;
} sync_requests;

2) then at request() call:
Grab sync_requests.lock
Check whether we already have a pending request for that destination.
If yes - then release the lock and return with an error.
- allocate and init a new sync_request struct, set reply_received=0
- do send_msg()
- then in a cycle:
pthread_cond_timedwait(&sync_requests.cond, &sync_requests.lock, &timespec);
- at return from it check if sync_request.reply_received == 1; if not,
check if the timeout expired and either return a failure or go to the start of the cycle.

3) at mp_handler() if REPLY received - grab sync_requests.lock,
search through sync_requests.list for dst[];
if found, then set its reply_received=1, copy the received message into reply
and call pthread_cond_broadcast(&sync_requests.cond);

Post by Jianfeng Tan
---
lib/librte_eal/common/eal_common_proc.c | 144 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 73 +++++++++++++++-
lib/librte_eal/rte_eal_version.map | 2 +
3 files changed, 206 insertions(+), 13 deletions(-)
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 70519cc..f194a52 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -32,6 +32,7 @@
static int mp_fd = -1;
static char *mp_sec_sockets[MAX_SECONDARY_PROCS];
static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t mp_mutex_request = PTHREAD_MUTEX_INITIALIZER;
struct action_entry {
TAILQ_ENTRY(action_entry) next; /**< Next attached action entry */
@@ -49,6 +50,10 @@ static struct action_entry_list action_entry_list =
struct mp_msghdr {
char action_name[MAX_ACTION_NAME_LEN];
+#define MP_MSG 0 /* Share message with peers, will not block */
+#define MP_REQ 1 /* Request for information, Will block for a reply */
+#define MP_REP 2 /* Reply to previously-received request */

As a nit - please use enum {} instead for the above macros.

Post by Jianfeng Tan
+ int type;
int fds_num;
int len_params;
char params[0];
@@ -138,7 +143,8 @@ rte_eal_mp_action_unregister(const char *name)
}
static int
-read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
+read_msg(int fd, char *buf, int buflen,
+ int *fds, int fds_num, struct sockaddr_un *s)
{
int ret;
struct iovec iov;
@@ -151,6 +157,8 @@ read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
iov.iov_base = buf;
iov.iov_len = buflen;
+ msgh.msg_name = s;
+ msgh.msg_namelen = sizeof(*s);
msgh.msg_iov = &iov;
msgh.msg_iovlen = 1;
msgh.msg_control = control;
@@ -181,7 +189,7 @@ read_msg(int fd, char *buf, int buflen, int *fds, int fds_num)
}
static int
-process_msg(struct mp_msghdr *hdr, int len, int fds[])
+process_msg(struct mp_msghdr *hdr, int len, int fds[], struct sockaddr_un *s)
{
int ret;
int params_len;
@@ -199,10 +207,10 @@ process_msg(struct mp_msghdr *hdr, int len, int fds[])
}
params_len = len - sizeof(struct mp_msghdr);
- ret = entry->action(hdr->params, params_len, fds, hdr->fds_num);
+ ret = entry->action(hdr->params, params_len,
+ fds, hdr->fds_num, s->sun_path);
pthread_mutex_unlock(&mp_mutex_action);
return ret;
-
}
static void *
@@ -211,11 +219,12 @@ mp_handle(void *arg __rte_unused)
int len;
int fds[SCM_MAX_FD];
char buf[MAX_MSG_LENGTH];
+ struct sockaddr_un sa;
while (1) {
- len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD);
+ len = read_msg(mp_fd, buf, MAX_MSG_LENGTH, fds, SCM_MAX_FD, &sa);
if (len > 0)
- process_msg((struct mp_msghdr *)buf, len, fds);
+ process_msg((struct mp_msghdr *)buf, len, fds, &sa);
}
return NULL;
@@ -255,7 +264,8 @@ static int
mp_primary_proc(const void *params,
int len __rte_unused,
int fds[] __rte_unused,
- int fds_num __rte_unused)
+ int fds_num __rte_unused,
+ const void *peer __rte_unused)
{
const struct proc_request *r = (const struct proc_request *)params;
@@ -362,7 +372,8 @@ rte_eal_mp_channel_init(void)
}
static inline struct mp_msghdr *
-format_msg(const char *act_name, const void *p, int len_params, int fds_num)
+format_msg(const char *act_name, const void *p,
+ int len_params, int fds_num, int type)
{
int len_msg;
struct mp_msghdr *msg;
@@ -384,6 +395,7 @@ format_msg(const char *act_name, const void *p, int len_params, int fds_num)
strcpy(msg->action_name, act_name);
msg->fds_num = fds_num;
msg->len_params = len_params;
+ msg->type = type;
memcpy(msg->params, p, len_params);
return msg;
}
@@ -455,7 +467,9 @@ mp_send(const char *action_name,
const void *params,
int len_params,
int fds[],
- int fds_num)
+ int fds_num,
+ int type,
+ const void *peer)
{
int i;
int n = 0;
@@ -468,7 +482,7 @@ mp_send(const char *action_name,
return 0;
}
- msg = format_msg(action_name, params, len_params, fds_num);
+ msg = format_msg(action_name, params, len_params, fds_num, type);
if (msg == NULL)
return 0;
@@ -477,6 +491,11 @@ mp_send(const char *action_name,
return 0;
}
+ if (peer) {
+ n += send_msg(sockfd, peer, msg, fds);
+ goto ret;
+ }
+
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
/* broadcast to all secondaries */
for (i = 0; i < MAX_SECONDARY_PROCS; ++i) {
@@ -488,6 +507,7 @@ mp_send(const char *action_name,
} else
n += send_msg(sockfd, eal_mp_unix_path(), msg, fds);
free(msg);
close(sockfd);
return n;
@@ -501,5 +521,107 @@ rte_eal_mp_sendmsg(const char *action_name,
int fds_num)
{
RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", action_name);
- return mp_send(action_name, params, len_params, fds, fds_num);
+ return mp_send(action_name, params, len_params,
+ fds, fds_num, MP_MSG, NULL);
+}
+
+int
+rte_eal_mp_request(const char *action_name,
+ void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ int fds_out)
+{
+ int i, j;
+ int sockfd;
+ int nprocs;
+ int ret = 0;
+ struct mp_msghdr *req;
+ struct timeval tv;
+ char buf[MAX_MSG_LENGTH];
+ struct mp_msghdr *hdr;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+ if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ rte_errno = -E2BIG;
+ return 0;
+ }
+
+ req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+ if (req == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(req);
+ return 0;
+ }
+
+ tv.tv_sec = 5; /* 5 Secs Timeout */
+ tv.tv_usec = 0;
+ if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+ (const void *)&tv, sizeof(struct timeval)) < 0)
+ RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
+
+ /* Only allow one req at a time */
+ pthread_mutex_lock(&mp_mutex_request);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ nprocs = 0;
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (!mp_sec_sockets[i]) {
+ j = i;
+ nprocs++;
+ }
+
+ if (nprocs > 1) {
+ RTE_LOG(ERR, EAL,
+ "multi secondary processes not supported\n");
+ goto free_and_ret;
+ }
+
+ ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);
+ } else
+ ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
+
+ if (ret == 0) {
+ RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
+ ret = -1;
+ goto free_and_ret;
+ }
+
+ ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);
+ if (ret > 0) {
+ hdr = (struct mp_msghdr *)buf;
+ if (hdr->len_params == len_p)
+ memcpy(params, hdr->params, len_p);
+ else {
+ RTE_LOG(ERR, EAL, "invalid reply\n");
+ ret = 0;
+ }
+ }
+
+ free(req);
+ close(sockfd);
+ pthread_mutex_unlock(&mp_mutex_request);
+ return ret;
+}
+
+int
+rte_eal_mp_reply(const char *action_name,
+ const void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ const void *peer)
+{
+ RTE_LOG(DEBUG, EAL, "reply: %s\n", action_name);
+ if (peer == NULL) {
+ RTE_LOG(ERR, EAL, "peer is not specified\n");
+ return 0;
+ }
+ return mp_send(action_name, params, len_p, fds, fds_in, MP_REP, peer);
}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 9884c0b..2690a77 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -192,7 +192,7 @@ int rte_eal_primary_proc_alive(const char *config_file_path);
* this function typedef to register action for coming messages.
*/
typedef int (*rte_eal_mp_t)(const void *params, int len,
- int fds[], int fds_num);
+ int fds[], int fds_num, const void *peer);
/**
* Register an action function for primary/secondary communication.
@@ -245,7 +245,7 @@ void rte_eal_mp_action_unregister(const char *name);
* The fds argument is an array of fds sent with sendmsg.
*
- * The fds_num argument is number of fds to be sent with sendmsg.
+ * The number of fds to be sent with sendmsg.
*
* - Returns the number of messages being sent successfully.
@@ -255,6 +255,75 @@ rte_eal_mp_sendmsg(const char *action_name, const void *params,
int len_params, int fds[], int fds_num);
/**
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until a reply is received or the wait times out. Note that
+ * this does not work for the primary process sending requests to its
+ * multiple (>1) secondary processes.
+ *
+ * The action_name argument is used to identify which action will be used.
+ *
+ * The params argument contains the customized message; as the reply is
+ * received, the replied params will be copied to this pointer.
+ *
+ * The length of the customized message.
+ *
+ * The fds argument is an array of fds sent with sendmsg; as the reply
+ * is received, the replied fds will be copied into this array.
+ *
+ * The number of fds to be sent.
+ *
+ * The number of fds to be received.
+ *
+ * - (1) on success;
+ * - (0) on sending request successfully but no valid reply received.
+ * - (<0) on failure to send the request.
+ */
+int
+rte_eal_mp_request(const char *action_name, void *params,
+ int len_p, int fds[], int fds_in, int fds_out);
+
+/**
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * The action_name argument is used to identify which action will be used.
+ *
+ * The params argument contains the customized message.
+ *
+ * The length of the customized message.
+ *
+ * The fds argument is an array of fds sent with sendmsg.
+ *
+ * The number of fds to be sent with sendmsg.
+ *
+ * The peer argument is the address of the requesting process to reply to, as passed to the action handler.
+ *
+ * - (1) on success;
+ * - (0) on failure.
+ */
+int
+rte_eal_mp_reply(const char *action_name, const void *params,
+ int len_p, int fds[], int fds_in, const void *peer);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 5dacde5..068ac0b 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -243,5 +243,7 @@ DPDK_18.02 {
rte_eal_mp_action_register;
rte_eal_mp_action_unregister;
rte_eal_mp_sendmsg;
+ rte_eal_mp_request;
+ rte_eal_mp_reply;
} DPDK_17.11;
--
2.7.4

Tan, Jianfeng

2018-01-16 08:10:31 UTC

Thank you, Konstantin and Anatoly firstly. Other comments are well
received and I'll send out a new version.

Post by Ananyev, Konstantin

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The
timeout is hard-coded 5 Sec. And the replied message will be copied
in the parameters of this API so that the caller can decide how
to translate those information (including params and fds). Note
if a primary process owns multiple secondary processes, this API
will fail.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
We use mutex in rte_eal_mp_request() to guarantee that only one
request is on the fly for one pair of processes.

You don't need to do things in such a strange and restrictive way.
1) Introduce a new struct, a list for it, and a mutex:
struct sync_request {
int reply_received;
char dst[PATH_MAX];
char reply[...];
LIST_ENTRY(sync_request) next;
};
static struct {
LIST_HEAD(list, sync_request);
pthread_mutex_t lock;
pthread_cond_t cond;
} sync_requests;
2) In rte_eal_mp_request():
- grab sync_requests.lock
- check whether we already have a pending request for that destination;
if yes - release the lock and return with an error.
- allocate and init a new sync_request struct, set reply_received=0
- do send_msg()
- then, in a cycle, call
pthread_cond_timedwait(&sync_requests.cond, &sync_requests.lock, &timespec);
- on return from it check if sync_request.reply_received == 1; if not,
check if the timeout has expired and either return a failure or go to the start of the cycle.
3) In mp_handler(), if a REPLY is received - grab sync_requests.lock,
search through sync_requests.list for dst[],
if found, then set its reply_received=1, copy the received message into reply
and call pthread_cond_broadcast(&sync_requests.cond);

The only benefit I can see is that now the sender can request to
multiple receivers at the same time. And it makes things more
complicated. Do we really need this?

Thanks,
Jianfeng

Ananyev, Konstantin

2018-01-16 11:12:47 UTC

Hi Jianfeng,

-----Original Message-----
From: Tan, Jianfeng
Sent: Tuesday, January 16, 2018 8:11 AM
Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
Thank you, Konstantin and Anatoly firstly. Other comments are well
received and I'll send out a new version.

Post by Ananyev, Konstantin

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The
timeout is hard-coded 5 Sec. And the replied message will be copied
in the parameters of this API so that the caller can decide how
to translate those information (including params and fds). Note
if a primary process owns multiple secondary processes, this API
will fail.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
We use mutex in rte_eal_mp_request() to guarantee that only one
request is on the fly for one pair of processes.

You don't need to do things in such a strange and restrictive way.
1) Introduce a new struct, a list for it, and a mutex:
struct sync_request {
int reply_received;
char dst[PATH_MAX];
char reply[...];
LIST_ENTRY(sync_request) next;
};
static struct {
LIST_HEAD(list, sync_request);
pthread_mutex_t lock;
pthread_cond_t cond;
} sync_requests;
2) In rte_eal_mp_request():
- grab sync_requests.lock
- check whether we already have a pending request for that destination;
if yes - release the lock and return with an error.
- allocate and init a new sync_request struct, set reply_received=0
- do send_msg()
- then, in a cycle, call
pthread_cond_timedwait(&sync_requests.cond, &sync_requests.lock, &timespec);
- on return from it check if sync_request.reply_received == 1; if not,
check if the timeout has expired and either return a failure or go to the start of the cycle.
3) In mp_handler(), if a REPLY is received - grab sync_requests.lock,
search through sync_requests.list for dst[],
if found, then set its reply_received=1, copy the received message into reply
and call pthread_cond_broadcast(&sync_requests.cond);

The only benefit I can see is that now the sender can request to
multiple receivers at the same time. And it makes things more
complicated. Do we really need this?

The benefit is that one thread is blocked waiting for response,
your mp_handler can still receive and handle other messages.
Plus as you said - other threads can keep sending messages.
Konstantin

Thanks,
Jianfeng

Tan, Jianfeng

2018-01-16 16:47:46 UTC

Post by Burakov, Anatoly
Hi Jianfeng,

-----Original Message-----
From: Tan, Jianfeng
Sent: Tuesday, January 16, 2018 8:11 AM
Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
Thank you, Konstantin and Anatoly firstly. Other comments are well
received and I'll send out a new version.

Post by Ananyev, Konstantin

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The
timeout is hard-coded 5 Sec. And the replied message will be copied
in the parameters of this API so that the caller can decide how
to translate those information (including params and fds). Note
if a primary process owns multiple secondary processes, this API
will fail.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
We use mutex in rte_eal_mp_request() to guarantee that only one
request is on the fly for one pair of processes.

You don't need to do things in such a strange and restrictive way.
1) Introduce a new struct, a list for it, and a mutex:
struct sync_request {
int reply_received;
char dst[PATH_MAX];
char reply[...];
LIST_ENTRY(sync_request) next;
};
static struct {
LIST_HEAD(list, sync_request);
pthread_mutex_t lock;
pthread_cond_t cond;
} sync_requests;
2) In rte_eal_mp_request():
- grab sync_requests.lock
- check whether we already have a pending request for that destination;
if yes - release the lock and return with an error.
- allocate and init a new sync_request struct, set reply_received=0
- do send_msg()
- then, in a cycle, call
pthread_cond_timedwait(&sync_requests.cond, &sync_requests.lock, &timespec);
- on return from it check if sync_request.reply_received == 1; if not,
check if the timeout has expired and either return a failure or go to the start of the cycle.
3) In mp_handler(), if a REPLY is received - grab sync_requests.lock,
search through sync_requests.list for dst[],
if found, then set its reply_received=1, copy the received message into reply
and call pthread_cond_broadcast(&sync_requests.cond);

The only benefit I can see is that now the sender can request to
multiple receivers at the same time. And it makes things more
complicated. Do we really need this?

The benefit is that one thread is blocked waiting for response,
your mp_handler can still receive and handle other messages.

This can already be done in the original implementation. mp_handler
listens for msg, request from the other peer(s), and replies the
requests, which is not affected.

Post by Burakov, Anatoly
Plus as you said - other threads can keep sending messages.

For this one, in the original implementation, other threads can still
send msg, but not request. I suppose the request is not in a fast path,
why we care to make it fast?

Thanks,
Jianfeng

Post by Burakov, Anatoly
Konstantin

Thanks,
Jianfeng

Ananyev, Konstantin

2018-01-17 10:50:22 UTC

Post by Tan, Jianfeng

Post by Burakov, Anatoly
Hi Jianfeng,

-----Original Message-----
From: Tan, Jianfeng
Sent: Tuesday, January 16, 2018 8:11 AM
Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
Thank you, Konstantin and Anatoly firstly. Other comments are well
received and I'll send out a new version.

Post by Ananyev, Konstantin

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The
timeout is hard-coded 5 Sec. And the replied message will be copied
in the parameters of this API so that the caller can decide how
to translate those information (including params and fds). Note
if a primary process owns multiple secondary processes, this API
will fail.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
We use mutex in rte_eal_mp_request() to guarantee that only one
request is on the fly for one pair of processes.

You don't need to do things in such a strange and restrictive way.
1) Introduce a new struct, a list for it, and a mutex:
struct sync_request {
int reply_received;
char dst[PATH_MAX];
char reply[...];
LIST_ENTRY(sync_request) next;
};
static struct {
LIST_HEAD(list, sync_request);
pthread_mutex_t lock;
pthread_cond_t cond;
} sync_requests;
2) In rte_eal_mp_request():
- grab sync_requests.lock
- check whether we already have a pending request for that destination;
if yes - release the lock and return with an error.
- allocate and init a new sync_request struct, set reply_received=0
- do send_msg()
- then, in a cycle, call
pthread_cond_timedwait(&sync_requests.cond, &sync_requests.lock, &timespec);
- on return from it check if sync_request.reply_received == 1; if not,
check if the timeout has expired and either return a failure or go to the start of the cycle.
3) In mp_handler(), if a REPLY is received - grab sync_requests.lock,
search through sync_requests.list for dst[],
if found, then set its reply_received=1, copy the received message into reply
and call pthread_cond_broadcast(&sync_requests.cond);

The only benefit I can see is that now the sender can request to
multiple receivers at the same time. And it makes things more
complicated. Do we really need this?

The benefit is that one thread is blocked waiting for response,
your mp_handler can still receive and handle other messages.

This can already be done in the original implementation. mp_handler
listens for msg, request from the other peer(s), and replies the
requests, which is not affected.

Post by Burakov, Anatoly
Plus as you said - other threads can keep sending messages.

For this one, in the original implementation, other threads can still
send msg, but not request. I suppose the request is not in a fast path,
why we care to make it fast?

+int
+rte_eal_mp_request(const char *action_name,
+ void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ int fds_out)
+{
+ int i, j;
+ int sockfd;
+ int nprocs;
+ int ret = 0;
+ struct mp_msghdr *req;
+ struct timeval tv;
+ char buf[MAX_MSG_LENGTH];
+ struct mp_msghdr *hdr;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+ if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ rte_errno = -E2BIG;
+ return 0;
+ }
+
+ req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+ if (req == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(req);
+ return 0;
+ }
+
+ tv.tv_sec = 5; /* 5 Secs Timeout */
+ tv.tv_usec = 0;
+ if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+ (const void *)&tv, sizeof(struct timeval)) < 0)
+ RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");

If you set it just for one call, why do you not restore it?
Also I don't think it is a good idea to change it here -
if you'll make timeout a parameter value - then it could be overwritten
by different threads.

+
+ /* Only allow one req at a time */
+ pthread_mutex_lock(&mp_mutex_request);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ nprocs = 0;
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (!mp_sec_sockets[i]) {
+ j = i;
+ nprocs++;
+ }
+
+ if (nprocs > 1) {
+ RTE_LOG(ERR, EAL,
+ "multi secondary processes not supported\n");
+ goto free_and_ret;
+ }
+
+ ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);

As I remember - sendmsg() is also a blocking call, so under some conditions you can stall
there forever.
As mp_mutex_request is still held - the next rte_eal_mp_request() will also block forever here.

+ } else
+ ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
+
+ if (ret == 0) {
+ RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
+ ret = -1;
+ goto free_and_ret;
+ }
+
+ ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);

if the message you receive is not a reply you are expecting -
it will be simply dropped - mp_handler() would never process it.

+ if (ret > 0) {
+ hdr = (struct mp_msghdr *)buf;
+ if (hdr->len_params == len_p)
+ memcpy(params, hdr->params, len_p);
+ else {
+ RTE_LOG(ERR, EAL, "invalid reply\n");
+ ret = 0;
+ }
+ }
+
+free_and_ret:
+ free(req);
+ close(sockfd);
+ pthread_mutex_unlock(&mp_mutex_request);
+ return ret;
+}

All of the above makes me think that current implementation is erroneous
and needs to be reworked.
Konstantin

Tan, Jianfeng

2018-01-17 13:09:22 UTC

Post by Jianfeng Tan

Post by Tan, Jianfeng

Post by Burakov, Anatoly
Hi Jianfeng,

-----Original Message-----
From: Tan, Jianfeng
Sent: Tuesday, January 16, 2018 8:11 AM
Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
Thank you, Konstantin and Anatoly firstly. Other comments are well
received and I'll send out a new version.

Post by Ananyev, Konstantin

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The
timeout is hard-coded 5 Sec. And the replied message will be copied
in the parameters of this API so that the caller can decide how
to translate those information (including params and fds). Note
if a primary process owns multiple secondary processes, this API
will fail.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
We use mutex in rte_eal_mp_request() to guarantee that only one
request is on the fly for one pair of processes.

You don't need to do things in such a strange and restrictive way.
1) Introduce a new struct, a list for it, and a mutex:
struct sync_request {
int reply_received;
char dst[PATH_MAX];
char reply[...];
LIST_ENTRY(sync_request) next;
};
static struct {
LIST_HEAD(list, sync_request);
pthread_mutex_t lock;
pthread_cond_t cond;
} sync_requests;
2) In rte_eal_mp_request():
- grab sync_requests.lock
- check whether we already have a pending request for that destination;
if yes - release the lock and return with an error.
- allocate and init a new sync_request struct, set reply_received=0
- do send_msg()
- then, in a cycle, call
pthread_cond_timedwait(&sync_requests.cond, &sync_requests.lock, &timespec);
- on return from it check if sync_request.reply_received == 1; if not,
check if the timeout has expired and either return a failure or go to the start of the cycle.
3) In mp_handler(), if a REPLY is received - grab sync_requests.lock,
search through sync_requests.list for dst[],
if found, then set its reply_received=1, copy the received message into reply
and call pthread_cond_broadcast(&sync_requests.cond);

The only benefit I can see is that now the sender can request to
multiple receivers at the same time. And it makes things more
complicated. Do we really need this?

The benefit is that one thread is blocked waiting for response,
your mp_handler can still receive and handle other messages.

This can already be done in the original implementation. mp_handler
listens for msg, request from the other peer(s), and replies the
requests, which is not affected.

Post by Burakov, Anatoly
Plus as you said - other threads can keep sending messages.

For this one, in the original implementation, other threads can still
send msg, but not request. I suppose the request is not in a fast path,
why we care to make it fast?

+int
+rte_eal_mp_request(const char *action_name,
+ void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ int fds_out)
+{
+ int i, j;
+ int sockfd;
+ int nprocs;
+ int ret = 0;
+ struct mp_msghdr *req;
+ struct timeval tv;
+ char buf[MAX_MSG_LENGTH];
+ struct mp_msghdr *hdr;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+ if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ rte_errno = -E2BIG;
+ return 0;
+ }
+
+ req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+ if (req == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(req);
+ return 0;
+ }
+
+ tv.tv_sec = 5; /* 5 Secs Timeout */
+ tv.tv_usec = 0;
+ if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+ (const void *)&tv, sizeof(struct timeval)) < 0)
+ RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
If you set it just for one call, why do you not restore it?

Yes, original code is buggy, I should have put it into the critical section.

Do you mean we just create once and use for ever? if yes, we could put
the open and setting into mp_init().

Post by Jianfeng Tan
Also I don't think it is a good idea to change it here -
if you'll make timeout a parameter value - then it could be overwritten
by different threads.

For simplicity, I'm not inclined to put the timeout as an parameter
exposing to caller. So if you agree, I'll put it into the mp_init() with
open.

Post by Jianfeng Tan
+
+ /* Only allow one req at a time */
+ pthread_mutex_lock(&mp_mutex_request);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ nprocs = 0;
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (!mp_sec_sockets[i]) {
+ j = i;
+ nprocs++;
+ }
+
+ if (nprocs > 1) {
+ RTE_LOG(ERR, EAL,
+ "multi secondary processes not supported\n");
+ goto free_and_ret;
+ }
+
+ ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);
As I remember - sendmsg() is also a blocking call, so under some conditions you can stall
there forever.

From Linux's unix_dgram_sendmsg(), we see:
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

I assume it will not block for a datagram unix socket in Linux. But I'm
not sure how it behaves in FreeBSD.

Anyway, better to add an explicit setsockopt() to make it not blocking.

Post by Jianfeng Tan
As mp_mutex_request is still held - the next rte_eal_mp_request() will also block forever here.
+ } else
+ ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
+
+ if (ret == 0) {
+ RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
+ ret = -1;
+ goto free_and_ret;
+ }
+
+ ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);
if the message you receive is not a reply you are expecting -
it will be simply dropped - mp_handler() would never process it.

We cannot tell with absolute certainty whether it's the right reply; we can only
check the action_name, which means we may still get a wrong reply
if multiple requests share the same action_name.

Is just comparing the action_name acceptable?

Post by Jianfeng Tan
+ if (ret > 0) {
+ hdr = (struct mp_msghdr *)buf;
+ if (hdr->len_params == len_p)
+ memcpy(params, hdr->params, len_p);
+ else {
+ RTE_LOG(ERR, EAL, "invalid reply\n");
+ ret = 0;
+ }
+ }
+
+ free(req);
+ close(sockfd);
+ pthread_mutex_unlock(&mp_mutex_request);
+ return ret;
+}
All of the above makes me think that current implementation is erroneous
and needs to be reworked.

Thank you for your review. I'll work on a new version.

Thanks,
Jianfeng

Post by Jianfeng Tan
Konstantin

Tan, Jianfeng

2018-01-17 13:15:53 UTC

[...]

Post by Tan, Jianfeng

Post by Jianfeng Tan
+int
+rte_eal_mp_request(const char *action_name,
+ void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ int fds_out)
+{
+ int i, j;
+ int sockfd;
+ int nprocs;
+ int ret = 0;
+ struct mp_msghdr *req;
+ struct timeval tv;
+ char buf[MAX_MSG_LENGTH];
+ struct mp_msghdr *hdr;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+ if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
SCM_MAX_FD);
+ rte_errno = -E2BIG;
+ return 0;
+ }
+
+ req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+ if (req == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(req);
+ return 0;
+ }
+
+ tv.tv_sec = 5; /* 5 Secs Timeout */
+ tv.tv_usec = 0;
+ if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+ (const void *)&tv, sizeof(struct timeval)) < 0)
+ RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
If you set it just for one call, why do you not restore it?

Yes, original code is buggy, I should have put it into the critical section.
Do you mean we just create once and use for ever? if yes, we could put
the open and setting into mp_init().

On second thought, we should not put the setting into mp_init(). The socket needs to be
non-blocking when sending a msg, but blocking when receiving a msg.

Thanks,
Jianfeng

Ananyev, Konstantin

2018-01-17 17:20:38 UTC

Post by Tan, Jianfeng

Post by Jianfeng Tan

Post by Tan, Jianfeng

Post by Burakov, Anatoly
Hi Jianfeng,

-----Original Message-----
From: Tan, Jianfeng
Sent: Tuesday, January 16, 2018 8:11 AM
Subject: Re: [PATCH v2 3/4] eal: add synchronous multi-process communication
Thank you, Konstantin and Anatoly firstly. Other comments are well
received and I'll send out a new version.

Post by Ananyev, Konstantin

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The
timeout is hard-coded 5 Sec. And the replied message will be copied
in the parameters of this API so that the caller can decide how
to translate those information (including params and fds). Note
if a primary process owns multiple secondary processes, this API
will fail.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
We use mutex in rte_eal_mp_request() to guarantee that only one
request is on the fly for one pair of processes.

You don't need to do things in such a strange and restrictive way.
1) Introduce a new struct, a list for it, and a mutex:
struct sync_request {
int reply_received;
char dst[PATH_MAX];
char reply[...];
LIST_ENTRY(sync_request) next;
};
static struct {
LIST_HEAD(list, sync_request);
pthread_mutex_t lock;
pthread_cond_t cond;
} sync_requests;
2) In rte_eal_mp_request():
- grab sync_requests.lock
- check whether we already have a pending request for that destination;
if yes - release the lock and return with an error.
- allocate and init a new sync_request struct, set reply_received=0
- do send_msg()
- then, in a cycle, call
pthread_cond_timedwait(&sync_requests.cond, &sync_requests.lock, &timespec);
- on return from it check if sync_request.reply_received == 1; if not,
check if the timeout has expired and either return a failure or go to the start of the cycle.
3) In mp_handler(), if a REPLY is received - grab sync_requests.lock,
search through sync_requests.list for dst[],
if found, then set its reply_received=1, copy the received message into reply
and call pthread_cond_broadcast(&sync_requests.cond);

The only benefit I can see is that now the sender can request to
multiple receivers at the same time. And it makes things more
complicated. Do we really need this?

The benefit is that one thread is blocked waiting for response,
your mp_handler can still receive and handle other messages.

This can already be done in the original implementation. mp_handler
listens for msg, request from the other peer(s), and replies the
requests, which is not affected.

Post by Burakov, Anatoly
Plus as you said - other threads can keep sending messages.

For this one, in the original implementation, other threads can still
send msg, but not request. I suppose the request is not in a fast path,
why we care to make it fast?

+int
+rte_eal_mp_request(const char *action_name,
+ void *params,
+ int len_p,
+ int fds[],
+ int fds_in,
+ int fds_out)
+{
+ int i, j;
+ int sockfd;
+ int nprocs;
+ int ret = 0;
+ struct mp_msghdr *req;
+ struct timeval tv;
+ char buf[MAX_MSG_LENGTH];
+ struct mp_msghdr *hdr;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", action_name);
+
+ if (fds_in > SCM_MAX_FD || fds_out > SCM_MAX_FD) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n", SCM_MAX_FD);
+ rte_errno = -E2BIG;
+ return 0;
+ }
+
+ req = format_msg(action_name, params, len_p, fds_in, MP_REQ);
+ if (req == NULL)
+ return 0;
+
+ if ((sockfd = open_unix_fd(0)) < 0) {
+ free(req);
+ return 0;
+ }
+
+ tv.tv_sec = 5; /* 5 Secs Timeout */
+ tv.tv_usec = 0;
+ if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO,
+ (const void *)&tv, sizeof(struct timeval)) < 0)
+ RTE_LOG(INFO, EAL, "Failed to set recv timeout\n");
If you set it just for one call, why do you not restore it?

Yes, original code is buggy, I should have put it into the critical section.
Do you mean we just create once and use for ever? if yes, we could put
the open and setting into mp_init().

Post by Jianfeng Tan
Also I don't think it is a good idea to change it here -
if you'll make timeout a parameter value - then it could be overwritten
by different threads.

For simplicity, I'm not inclined to put the timeout as an parameter
exposing to caller. So if you agree, I'll put it into the mp_init() with
open.

My preference would be to have timeout value on a per call basis.
For one request user would like to wait no more than 5sec,
for another one user would probably be ok to wait forever.

Post by Tan, Jianfeng

Post by Jianfeng Tan
+
+ /* Only allow one req at a time */
+ pthread_mutex_lock(&mp_mutex_request);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ nprocs = 0;
+ for (i = 0; i < MAX_SECONDARY_PROCS; ++i)
+ if (!mp_sec_sockets[i]) {
+ j = i;
+ nprocs++;
+ }
+
+ if (nprocs > 1) {
+ RTE_LOG(ERR, EAL,
+ "multi secondary processes not supported\n");
+ goto free_and_ret;
+ }
+
+ ret = send_msg(sockfd, mp_sec_sockets[j], req, fds);
As I remember - sendmsg() is also a blocking call, so under some conditions you can stall
there forever.

timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);

Ok, but it would have effect only if (msg->msg_flags & MSG_DONTWAIT) != 0.
And for that, as I remember you need your socket in non-blocking mode, no?

Post by Tan, Jianfeng
I assume it will not block for a datagram unix socket in Linux. But I'm
not sure how it behaves in FreeBSD.
Anyway, better to add an explicit setsockopt() to make it not blocking.

You can't do that - at the same moment another thread might call your sendmsg()
and it might expect it to be blocking call.

Post by Tan, Jianfeng

Post by Jianfeng Tan
As mp_mutex_request is still held - the next rte_eal_mp_request() will also block forever here.
+ } else
+ ret = send_msg(sockfd, eal_mp_unix_path(), req, fds);
+
+ if (ret == 0) {
+ RTE_LOG(ERR, EAL, "failed to send request: %s\n", action_name);
+ ret = -1;
+ goto free_and_ret;
+ }
+
+ ret = read_msg(sockfd, buf, MAX_MSG_LENGTH, fds, fds_out, NULL);
if the message you receive is not a reply you are expecting -
it will be simply dropped - mp_handler() would never process it.

We cannot tell with absolute certainty whether it's the right reply; we can only
check the action_name, which means we may still get a wrong reply
if multiple requests share the same action_name.
Is just comparing the action_name acceptable?

As far as I can see, the main issue here is that you can call recvmsg() from 2 different
points and they are not synchronised:
1. your mp_handler() isn't aware of the reply you are waiting for and does not
have any handler associated with it.
So if mp_handler() receives a reply it will just drop it.
2. your reply() is not aware of any other messages and associated actions -
so again it can't handle them properly (and probably shouldn't).

The simplest (and most common) way is to always call recvmsg() from one place -
mp_handler() - and have a special action for reply msg.
As I wrote before, that action will just find the appropriate buffer provided
by reply(), copy the message into it and signal the thread waiting in reply() that
it can proceed.

Konstantin

Post by Tan, Jianfeng

Post by Jianfeng Tan
+ if (ret > 0) {
+ hdr = (struct mp_msghdr *)buf;
+ if (hdr->len_params == len_p)
+ memcpy(params, hdr->params, len_p);
+ else {
+ RTE_LOG(ERR, EAL, "invalid reply\n");
+ ret = 0;
+ }
+ }
+
+ free(req);
+ close(sockfd);
+ pthread_mutex_unlock(&mp_mutex_request);
+ return ret;
+}
All of the above makes me think that current implementation is erroneous
and needs to be reworked.

Thank you for your review. I'll work on a new version.
Thanks,
Jianfeng

Post by Jianfeng Tan
Konstantin

Jianfeng Tan

2018-01-11 04:07:34 UTC

Previously, vfio uses its own private channel for the secondary
process to get container fd and group fd from the primary process.

This patch changes to use the generic mp channel.

Test:
1. Bind two NICs to vfio-pci.

2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 --num-procs=2 --proc-id=1

Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal.c | 14 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 133 ++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 409 ++++---------------------
4 files changed, 94 insertions(+), 477 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index f231724..d4b45a2 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -713,18 +713,8 @@ static int rte_eal_vfio_setup(void)
return -1;
vfio_enabled = rte_vfio_is_enabled("vfio");

- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }
+ if (vfio_enabled && vfio_mp_sync_setup() < 0)
+ return -1;

return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 3036f60..2ff40f7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -39,9 +39,11 @@ int
vfio_get_group_fd(int iommu_group_no)
{
int i;
+ int ret;
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct vfio_mp_param p;

/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,49 +103,21 @@ vfio_get_group_fd(int iommu_group_no)
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
- * process via our socket
+ * process via mp channel
*/
- else {
- int socket_fd, ret;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
+ p.req = SOCKET_REQ_GROUP;
+ p.group_no = iommu_group_no;

- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- close(socket_fd);
- return 0;
- case SOCKET_OK:
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_no = iommu_group_no;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- default:
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
- }
+ vfio_group_fd = -1;
+ ret = rte_eal_mp_request("vfio", &p, sizeof(p), &vfio_group_fd, 0, 1);
+ if (ret > 0 && p.result == SOCKET_OK) {
+ cur_grp->group_no = iommu_group_no;
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
+ return vfio_group_fd;
}
+
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
return -1;
}

@@ -200,7 +174,8 @@ int
clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ int ret;
+ struct vfio_mp_param p;

if (internal_config.process_type == RTE_PROC_PRIMARY) {

@@ -214,43 +189,14 @@ clear_group(int vfio_group_fd)
return 0;
}

- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
- }
+ p.req = SOCKET_CLR_GROUP;
+ p.group_no = vfio_group_fd;

- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- case SOCKET_OK:
- close(socket_fd);
+ ret = rte_eal_mp_request("vfio", &p, sizeof(p), NULL, 0, 0);
+ if (ret > 0 && p.result == SOCKET_OK)
return 0;
- case SOCKET_ERR:
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- default:
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
+
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
return -1;
}

@@ -561,6 +507,7 @@ int
vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct vfio_mp_param p;

/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +538,19 @@ vfio_get_container_fd(void)
}

return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p.req = SOCKET_REQ_CONTAINER;

- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
- }
- close(socket_fd);
+ vfio_container_fd = -1;
+ ret = rte_eal_mp_request("vfio", &p, sizeof(p), &vfio_container_fd, 0, 1);
+ if (ret > 0 && p.result == SOCKET_OK)
return vfio_container_fd;
- }

+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
return -1;
}

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index b34d5d0..a14b168 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS 64

/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -161,6 +152,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF

+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_no;
+};
+
#endif /* VFIO_PRESENT */

#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 9b474dc..ea1a6a7 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -2,31 +2,13 @@
* Copyright(c) 2010-2014 Intel Corporation
*/

-#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
+#include <unistd.h>

#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>

-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"

/**
* @file
@@ -37,360 +19,75 @@

#ifdef VFIO_PRESENT

-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const void *params,
+ int len,
+ int fds[] __rte_unused,
+ int fds_num __rte_unused,
+ const void *peer)
{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
+ int fd;
int ret;
+ int num = 0;
+ const struct vfio_mp_param *p = params;
+ struct vfio_mp_param r;

- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (len != sizeof(*p)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
-
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- case SOCKET_REQ_CONTAINER:
- fd = vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- case SOCKET_REQ_GROUP:
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = vfio_get_group_fd(vfio_data);
-
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (p->req) {
+ case SOCKET_REQ_GROUP:
+ r.req = SOCKET_REQ_GROUP;
+ r.group_no = p->group_no;
+ fd = vfio_get_group_fd(p->group_no);
+ if (fd < 0)
+ r.result = SOCKET_ERR;
+ else if (fd == 0)
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r.result = SOCKET_NO_FD;
+ else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- case SOCKET_CLR_GROUP:
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- default:
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r.result = SOCKET_OK;
+ num = 1;
}
- close(conn_sock);
- }
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
- return -1;
- }
-
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
- }
-
- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
+ break;
+ case SOCKET_CLR_GROUP:
+ r.req = SOCKET_CLR_GROUP;
+ r.group_no = p->group_no;
+ if (clear_group(p->group_no) < 0)
+ r.result = SOCKET_NO_FD;
+ else
+ r.result = SOCKET_OK;
+ break;
+ case SOCKET_REQ_CONTAINER:
+ r.req = SOCKET_REQ_CONTAINER;
+ fd = vfio_get_container_fd();
+ if (fd < 0)
+ r.result = SOCKET_ERR;
+ else {
+ r.result = SOCKET_OK;
+ num = 1;
+ }
+ break;
+ default:
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
-
- return 0;
+ ret = rte_eal_mp_reply("vfio", &r, sizeof(r), &fd, num, peer);
+ if (p->req == SOCKET_REQ_CONTAINER && num == 1)
+ close(fd);
+ return ret;
}

-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_eal_mp_action_register("vfio", vfio_mp_primary);

return 0;
}
-
#endif

--
2.7.4

Burakov, Anatoly

2018-01-13 14:03:07 UTC

On 11-Jan-18 4:07 AM, Jianfeng Tan wrote:

<snip>

Post by Jianfeng Tan
- }
- /* fall-through on error */
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
- }
+ vfio_group_fd = -1;
+ ret = rte_eal_mp_request("vfio", &p, sizeof(p), &vfio_group_fd, 0, 1);
+ if (ret > 0 && p.result == SOCKET_OK) {

Thanks, this looks much clearer than the previous revision! In an
ideal world we would've been able to have separate request and response
(as it's perfectly possible to imagine a situation where the request
would be small but the response would be huge), but for now this works
as well. Maybe put this API under the EXPERIMENTAL tag? (btw, isn't
this official policy now?)

Post by Jianfeng Tan
+ cur_grp->group_no = iommu_group_no;
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
+ return vfio_group_fd;
}
+
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
return -1;

Check for SOCKET_NO_FD? Previously, that branch returned 0; now it will
return -1.

Post by Jianfeng Tan
}
@@ -200,7 +174,8 @@ int
clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ int ret;
+ struct vfio_mp_param p;
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -214,43 +189,14 @@ clear_group(int vfio_group_fd)
return 0;
}
- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
- }
+ p.req = SOCKET_CLR_GROUP;
+ p.group_no = vfio_group_fd;
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- close(socket_fd);
+ ret = rte_eal_mp_request("vfio", &p, sizeof(p), NULL, 0, 0);
+ if (ret > 0 && p.result == SOCKET_OK)
return 0;
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
+
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");

The error message lumps together two cases - bad VFIO group fd, and a
socket error.

Post by Jianfeng Tan
return -1;
}
@@ -561,6 +507,7 @@ int
vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct vfio_mp_param p;
/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +538,19 @@ vfio_get_container_fd(void)
}

<snip>

--
Thanks,
Anatoly

Jianfeng Tan

2018-03-04 14:57:36 UTC

Previously, vfio used its own private channel for the secondary
process to get container fd and group fd from the primary process.

This patch changes vfio to use the generic mp channel.

Test:
1. Bind two NICs to vfio-pci.

2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1

Cc: ***@intel.com

Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal.c | 14 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 172 +++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 409 ++++---------------------
4 files changed, 136 insertions(+), 474 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 38306bf..4ca06f4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -695,18 +695,8 @@ static int rte_eal_vfio_setup(void)
return -1;
vfio_enabled = rte_vfio_is_enabled("vfio");

- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }
+ if (vfio_enabled && vfio_mp_sync_setup() < 0)
+ return -1;

return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..d905e8e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#include <string.h>
@@ -42,6 +42,10 @@ vfio_get_group_fd(int iommu_group_no)
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;

/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +105,31 @@ vfio_get_group_fd(int iommu_group_no)
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
- * process via our socket
+ * process via mp channel
*/
- else {
- int socket_fd, ret;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- close(socket_fd);
- return 0;
- case SOCKET_OK:
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_no = iommu_group_no;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- default:
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ p->req = SOCKET_REQ_GROUP;
+ p->group_no = iommu_group_no;
+ strcpy(mp_req.name, "vfio");
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cur_grp->group_no = iommu_group_no;
+ vfio_group_fd = mp_rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
}
+ free(mp_reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;
}

@@ -200,7 +185,10 @@ int
rte_vfio_clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;

if (internal_config.process_type == RTE_PROC_PRIMARY) {

@@ -214,43 +202,24 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}

- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
+ p->req = SOCKET_CLR_GROUP;
+ p->group_no = vfio_group_fd;
+ strcpy(mp_req.name, "vfio");
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;

- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK) {
+ free(mp_reply.msgs);
+ return 0;
+ }
+ free(mp_reply.msgs);
}

- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- case SOCKET_OK:
- close(socket_fd);
- return 0;
- case SOCKET_ERR:
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- default:
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
return -1;
}

@@ -561,6 +530,11 @@ int
vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+

/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +565,29 @@ vfio_get_container_fd(void)
}

return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(mp_req.name, "vfio");
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
}
- close(socket_fd);
- return vfio_container_fd;
+ free(mp_reply.msgs);
}

+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
return -1;
}

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..6b48969 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS

/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -157,6 +148,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF

+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_no;
+};
+
#endif /* VFIO_PRESENT */

#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..c61cdb9 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,15 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

+#include <unistd.h>
#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif

#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>

-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"

/**
* @file
@@ -37,358 +20,80 @@

#ifdef VFIO_PRESENT

-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
+ int fd;
+ int num;
int ret;
+ struct rte_mp_msg reply;
+ struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+ const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;

- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (msg->len_param != sizeof(*m)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
-
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- case SOCKET_REQ_CONTAINER:
- fd = vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- case SOCKET_REQ_GROUP:
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = vfio_get_group_fd(vfio_data);
+ memset(&reply, 0, sizeof(reply));

- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (m->req) {
+ case SOCKET_REQ_GROUP:
+ r->req = SOCKET_REQ_GROUP;
+ r->group_no = m->group_no;
+ fd = vfio_get_group_fd(m->group_no);
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else if (fd == 0)
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r->result = SOCKET_NO_FD;
+ else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- case SOCKET_CLR_GROUP:
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = rte_vfio_clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- default:
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r->result = SOCKET_OK;
+ num = 1;
}
- close(conn_sock);
- }
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
- return -1;
- }
-
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
+ break;
+ case SOCKET_CLR_GROUP:
+ r->req = SOCKET_CLR_GROUP;
+ r->group_no = m->group_no;
+ if (rte_vfio_clear_group(m->group_no) < 0)
+ r->result = SOCKET_NO_FD;
+ else
+ r->result = SOCKET_OK;
+ break;
+ case SOCKET_REQ_CONTAINER:
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ num = 1;
+ }
+ break;
+ default:
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
+ if (num == 1) {
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
}
+ strcpy(reply.name, "vfio");
+ reply.len_param = sizeof(*r);

- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
-
- return 0;
+ ret = rte_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && num == 1)
+ close(fd);
+ return ret;
}

-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_mp_action_register("vfio", vfio_mp_primary);

return 0;
}

--
2.7.4

Burakov, Anatoly

2018-03-14 13:27:17 UTC

Post by Jianfeng Tan
Previously, vfio uses its own private channel for the secondary
process to get container fd and group fd from the primary process.
This patch changes to use the generic mp channel.
1. Bind two NICs to vfio-pci.
2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1
---

<...>

Post by Jianfeng Tan
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- close(socket_fd);
- return 0;
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_no = iommu_group_no;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ p->req = SOCKET_REQ_GROUP;
+ p->group_no = iommu_group_no;
+ strcpy(mp_req.name, "vfio");

"vfio" should probably be a #define. Also, i think the identifier is too
short. Maybe "eal_vfio_mp_sync" or at least "eal_vfio" would be better?

Post by Jianfeng Tan
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cur_grp->group_no = iommu_group_no;
+ vfio_group_fd = mp_rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
}
+ free(mp_reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;

p->result can be SOCKET_NO_FD, in which case returned value should be
zero. I think this is missing from this code. There probably should be
an "else if (p->result == SOCKET_NO_FD)" clause that sets return value to 0.

You should be able to test this by trying to set up a device for VFIO
that isn't bound to VFIO driver, in a secondary process.

Post by Jianfeng Tan
}
@@ -200,7 +185,10 @@ int
rte_vfio_clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -214,43 +202,24 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}
- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
+ p->req = SOCKET_CLR_GROUP;
+ p->group_no = vfio_group_fd;
+ strcpy(mp_req.name, "vfio");

Same here, please use a #define here.

Post by Jianfeng Tan
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK) {
+ free(mp_reply.msgs);
+ return 0;
+ }
+ free(mp_reply.msgs);
}
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- close(socket_fd);
- return 0;
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");

Old error messages distinguished between "bad VFIO group fd" and other
errors. You should probably only output this message if the response was
SOCKET_NO_FD, and output another message in case of other errors.

Post by Jianfeng Tan
return -1;
}
@@ -561,6 +530,11 @@ int
vfio_get_container_fd(void)
{

<...>

Post by Jianfeng Tan
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(mp_req.name, "vfio");

Same here, please use #define here.

Post by Jianfeng Tan
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
}
- close(socket_fd);
- return vfio_container_fd;
+ free(mp_reply.msgs);

<...>

Post by Jianfeng Tan
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
- return -1;
- }
-
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
+ break;
+ r->req = SOCKET_CLR_GROUP;
+ r->group_no = m->group_no;
+ if (rte_vfio_clear_group(m->group_no) < 0)
+ r->result = SOCKET_NO_FD;
+ else
+ r->result = SOCKET_OK;
+ break;
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ num = 1;
+ }
+ break;
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}
- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
+ if (num == 1) {
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
}

You're not saving any lines of code with this, but you are sacrificing
code clarity :) I think this should be done inline, e.g. in "else"
clause of SOCKET_REQ_CONTAINER and SOCKET_REQ_GROUP.

Post by Jianfeng Tan
+ strcpy(reply.name, "vfio");

Same here, please use #define.

Post by Jianfeng Tan
+ reply.len_param = sizeof(*r);
- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
-
- return 0;
+ ret = rte_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && num == 1)

Why not just "fd >= 0"? No need for num variable then.

Post by Jianfeng Tan
+ close(fd);
+ return ret;
}
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_mp_action_register("vfio", vfio_mp_primary);

Same here, please use #define.

Post by Jianfeng Tan
return 0;
}

Thanks for doing this patch!

--
Thanks,
Anatoly

Tan, Jianfeng

2018-03-19 06:53:33 UTC

Hi Anatoly,

Thank you for the review. All your comments will be addressed in next version, except for below concern which might be taken care of in another patch if it also concerns you.

-----Original Message-----
From: Burakov, Anatoly
Sent: Wednesday, March 14, 2018 9:27 PM
Subject: Re: [PATCH v5] vfio: change to use generic multi-process channel

[...]

Post by Jianfeng Tan
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cur_grp->group_no = iommu_group_no;
+ vfio_group_fd = mp_rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
}
+ free(mp_reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;

p->result can be SOCKET_NO_FD, in which case returned value should be
zero. I think this is missing from this code. There probably should be
an "else if (p->result == SOCKET_NO_FD)" clause that sets return value to 0.
You should be able to test this by trying to set up a device for VFIO
that isn't bound to VFIO driver, in a secondary process.

OK, I will fix this.

But really, "zero" could be ambiguous as a fd could, theoretically, be zero too.

Burakov, Anatoly

2018-03-20 10:33:00 UTC

Post by Tan, Jianfeng
Hi Anatoly,
Thank you for the review. All your comments will be addressed in next version, except for below concern which might be taken care of in another patch if it also concerns you.

-----Original Message-----
From: Burakov, Anatoly
Sent: Wednesday, March 14, 2018 9:27 PM
Subject: Re: [PATCH v5] vfio: change to use generic multi-process channel

[...]

Post by Jianfeng Tan
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cur_grp->group_no = iommu_group_no;
+ vfio_group_fd = mp_rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
}
+ free(mp_reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;

p->result can be SOCKET_NO_FD, in which case returned value should be
zero. I think this is missing from this code. There probably should be
an "else if (p->result == SOCKET_NO_FD)" clause that sets return value to 0.
You should be able to test this by trying to set up a device for VFIO
that isn't bound to VFIO driver, in a secondary process.

OK, I will fix this.
But really, "zero" could be ambiguous as a fd could, theoretically, be zero too.

You're correct. Maybe return 0/-1 in case of success/failure and put fd
into a pointer? i.e.

int func(int *vfio_group_fd) {
<...>
*vfio_group_fd = fd;
return 0;
}

Post by Tan, Jianfeng
Thanks,
Jianfeng

--
Thanks,
Anatoly

Burakov, Anatoly

2018-03-20 10:56:28 UTC

Post by Burakov, Anatoly

Post by Tan, Jianfeng
Hi Anatoly,
Thank you for the review. All your comments will be addressed in next
version, except for below concern which might be taken care of in
another patch if it also concerns you.

-----Original Message-----
From: Burakov, Anatoly
Sent: Wednesday, March 14, 2018 9:27 PM
Subject: Re: [PATCH v5] vfio: change to use generic multi-process channel

[...]

+    mp_req.len_param = sizeof(*p);
+    mp_req.num_fds = 0;
+
+    vfio_group_fd = -1;
+    if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+        mp_reply.nb_received == 1) {
+        mp_rep = &mp_reply.msgs[0];
+        p = (struct vfio_mp_param *)mp_rep->param;
+        if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+            cur_grp->group_no = iommu_group_no;
+            vfio_group_fd = mp_rep->fds[0];
+            cur_grp->fd = vfio_group_fd;
+            vfio_cfg.vfio_active_groups++;
           }
+        free(mp_reply.msgs);
       }
-    return -1;
+
+    if (vfio_group_fd < 0)
+        RTE_LOG(ERR, EAL, " cannot request group fd\n");
+    return vfio_group_fd;

p->result can be SOCKET_NO_FD, in which case returned value should be
zero. I think this is missing from this code. There probably should be
an "else if (p->result == SOCKET_NO_FD)" clause that sets return value to 0.
You should be able to test this by trying to set up a device for VFIO
that isn't bound to VFIO driver, in a secondary process.

OK, I will fix this.
But really, "zero" could be ambiguous as a fd could, theoretically, be zero too.

You're correct. Maybe return 0/-1 in case of success/failure and put fd
into a pointer? i.e.
int func(int *vfio_group_fd) {
<...>
*vfio_group_fd = fd;
return 0;
}

Or rather return 1/0/-1 depending on whether we got SOCKET_OK,
SOCKET_NO_FD or SOCKET_ERR.

Post by Burakov, Anatoly

Post by Tan, Jianfeng
Thanks,
Jianfeng

--
Thanks,
Anatoly

Jianfeng Tan

2018-03-20 08:50:09 UTC

Previously, vfio uses its own private channel for the secondary
process to get container fd and group fd from the primary process.

This patch changes to use the generic mp channel.

Test:
1. Bind two NICs to vfio-pci.

2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1

Cc: ***@intel.com

Signed-off-by: Jianfeng Tan <***@intel.com>
---
v5->v6: (Address comments from Anatoly)
- Naming, return checking, logging.
- Move vfio action register after rte_bus_probe().
lib/librte_eal/linuxapp/eal/eal.c | 22 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 176 +++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 17 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 ++++---------------------
4 files changed, 145 insertions(+), 478 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 38306bf..fb41e97 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -689,24 +689,8 @@ rte_eal_iopl_init(void)
#ifdef VFIO_PRESENT
static int rte_eal_vfio_setup(void)
{
- int vfio_enabled = 0;
-
if (rte_vfio_enable("vfio"))
return -1;
- vfio_enabled = rte_vfio_is_enabled("vfio");
-
- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }

return 0;
}
@@ -950,6 +934,12 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+#ifdef VFIO_PRESENT
+ /* Register mp action after probe() so that we got enough info */
+ if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+ return -1;
+#endif
+
/* initialize default service/lcore mappings and start running. Ignore
* -ENOTSUP, as it indicates no service coremask passed to EAL.
*/
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..9b97e5b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#include <string.h>
@@ -42,6 +42,10 @@ vfio_get_group_fd(int iommu_group_no)
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;

/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +105,34 @@ vfio_get_group_fd(int iommu_group_no)
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
- * process via our socket
+ * process via mp channel
*/
- else {
- int socket_fd, ret;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- close(socket_fd);
- return 0;
- case SOCKET_OK:
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_no = iommu_group_no;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- default:
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ p->req = SOCKET_REQ_GROUP;
+ p->group_no = iommu_group_no;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cur_grp->group_no = iommu_group_no;
+ vfio_group_fd = mp_rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
+ } else if (p->result == SOCKET_NO_FD) {
+ RTE_LOG(ERR, EAL, " bad VFIO group fd\n");
+ vfio_group_fd = 0;
}
+ free(mp_reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;
}

@@ -200,7 +188,10 @@ int
rte_vfio_clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;

if (internal_config.process_type == RTE_PROC_PRIMARY) {

@@ -214,43 +205,27 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}

- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
+ p->req = SOCKET_CLR_GROUP;
+ p->group_no = vfio_group_fd;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;

- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK) {
+ free(mp_reply.msgs);
+ return 0;
+ } else if (p->result == SOCKET_NO_FD)
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
+ else
+ RTE_LOG(ERR, EAL, " no such VFIO group fd!\n");

- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
+ free(mp_reply.msgs);
}

- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- case SOCKET_OK:
- close(socket_fd);
- return 0;
- case SOCKET_ERR:
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- default:
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
return -1;
}

@@ -561,6 +536,11 @@ int
vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+

/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +571,29 @@ vfio_get_container_fd(void)
}

return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
}
- close(socket_fd);
- return vfio_container_fd;
+ free(mp_reply.msgs);
}

+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
return -1;
}

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..be2a79b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS

/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -150,6 +141,8 @@ vfio_get_group_fd(int iommu_group_no);

int vfio_mp_sync_setup(void);

+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
#define SOCKET_CLR_GROUP 0x300
@@ -157,6 +150,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF

+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_no;
+};
+
#endif /* VFIO_PRESENT */

#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..afa556f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,15 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

+#include <unistd.h>
#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif

#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>

-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"

/**
* @file
@@ -37,358 +20,77 @@

#ifdef VFIO_PRESENT

-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
+ int fd = -1;
int ret;
+ struct rte_mp_msg reply;
+ struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+ const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;

- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (msg->len_param != sizeof(*m)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
+ memset(&reply, 0, sizeof(reply));

-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- case SOCKET_REQ_CONTAINER:
- fd = vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- case SOCKET_REQ_GROUP:
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = vfio_get_group_fd(vfio_data);
-
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (m->req) {
+ case SOCKET_REQ_GROUP:
+ r->req = SOCKET_REQ_GROUP;
+ r->group_no = m->group_no;
+ fd = vfio_get_group_fd(m->group_no);
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else if (fd == 0)
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r->result = SOCKET_NO_FD;
+ else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- case SOCKET_CLR_GROUP:
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = rte_vfio_clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- default:
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
}
- close(conn_sock);
- }
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
- return -1;
- }
-
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
- }
-
- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
+ break;
+ case SOCKET_CLR_GROUP:
+ r->req = SOCKET_CLR_GROUP;
+ r->group_no = m->group_no;
+ if (rte_vfio_clear_group(m->group_no) < 0)
+ r->result = SOCKET_NO_FD;
+ else
+ r->result = SOCKET_OK;
+ break;
+ case SOCKET_REQ_CONTAINER:
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
+ default:
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
+ strcpy(reply.name, EAL_VFIO_MP);
+ reply.len_param = sizeof(*r);

- return 0;
+ ret = rte_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+ close(fd);
+ return ret;
}

-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);

return 0;
}

--
2.7.4

Tan, Jianfeng

2018-04-05 14:26:24 UTC

Hi Anatoly,

An obvious action would be to change rte_mp_request to
rte_mp_request_sync(). Before I send out the new patch, do you have any
other comments on this patch?

Hi Thomas,

Several patches will change vfio; may I know your preferred order for
applying them? (I'm trying to find out which patch I should rebase on; of
course, I can wait until the other patches are applied.)

- http://dpdk.org/dev/patchwork/patch/37258/
- http://dpdk.org/dev/patchwork/patch/37152/
- http://dpdk.org/dev/patchwork/patch/37082/
- http://dpdk.org/dev/patchwork/patch/37047/

Thanks,
Jianfeng

Post by Jianfeng Tan
Previously, vfio used its own private channel for the secondary
process to get the container fd and group fd from the primary process.
This patch changes it to use the generic mp channel.
1. Bind two NICs to vfio-pci.
2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1
---
v5->v6: (Address comments from Anatoly)
- Naming, return checking, logging.
- Move vfio action register after rte_bus_probe().
lib/librte_eal/linuxapp/eal/eal.c | 22 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 176 +++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 17 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 ++++---------------------
4 files changed, 145 insertions(+), 478 deletions(-)
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 38306bf..fb41e97 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -689,24 +689,8 @@ rte_eal_iopl_init(void)
#ifdef VFIO_PRESENT
static int rte_eal_vfio_setup(void)
{
- int vfio_enabled = 0;
-
if (rte_vfio_enable("vfio"))
return -1;
- vfio_enabled = rte_vfio_is_enabled("vfio");
-
- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }
return 0;
}
@@ -950,6 +934,12 @@ rte_eal_init(int argc, char **argv)
return -1;
}
+#ifdef VFIO_PRESENT
+ /* Register mp action after probe() so that we got enough info */
+ if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+ return -1;
+#endif
+
/* initialize default service/lcore mappings and start running. Ignore
* -ENOTSUP, as it indicates no service coremask passed to EAL.
*/
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..9b97e5b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/
#include <string.h>
@@ -42,6 +42,10 @@ vfio_get_group_fd(int iommu_group_no)
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +105,34 @@ vfio_get_group_fd(int iommu_group_no)
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
- * process via our socket
+ * process via mp channel
*/
- else {
- int socket_fd, ret;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- close(socket_fd);
- return 0;
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_no = iommu_group_no;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ p->req = SOCKET_REQ_GROUP;
+ p->group_no = iommu_group_no;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cur_grp->group_no = iommu_group_no;
+ vfio_group_fd = mp_rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
+ } else if (p->result == SOCKET_NO_FD) {
+ RTE_LOG(ERR, EAL, " bad VFIO group fd\n");
+ vfio_group_fd = 0;
}
+ free(mp_reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;
}
@@ -200,7 +188,10 @@ int
rte_vfio_clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -214,43 +205,27 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}
- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
+ p->req = SOCKET_CLR_GROUP;
+ p->group_no = vfio_group_fd;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK) {
+ free(mp_reply.msgs);
+ return 0;
+ } else if (p->result == SOCKET_NO_FD)
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
+ else
+ RTE_LOG(ERR, EAL, " no such VFIO group fd!\n");
- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
+ free(mp_reply.msgs);
}
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- close(socket_fd);
- return 0;
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
return -1;
}
@@ -561,6 +536,11 @@ int
vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +571,29 @@ vfio_get_container_fd(void)
}
return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ if (rte_mp_request(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
}
- close(socket_fd);
- return vfio_container_fd;
+ free(mp_reply.msgs);
}
+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
return -1;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..be2a79b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -150,6 +141,8 @@ vfio_get_group_fd(int iommu_group_no);
int vfio_mp_sync_setup(void);
+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
#define SOCKET_CLR_GROUP 0x300
@@ -157,6 +150,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF
+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_no;
+};
+
#endif /* VFIO_PRESENT */
#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..afa556f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,15 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/
+#include <unistd.h>
#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>
-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"
/**
@@ -37,358 +20,77 @@
#ifdef VFIO_PRESENT
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
+ int fd = -1;
int ret;
+ struct rte_mp_msg reply;
+ struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+ const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (msg->len_param != sizeof(*m)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
+ memset(&reply, 0, sizeof(reply));
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- fd = vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = vfio_get_group_fd(vfio_data);
-
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (m->req) {
+ r->req = SOCKET_REQ_GROUP;
+ r->group_no = m->group_no;
+ fd = vfio_get_group_fd(m->group_no);
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else if (fd == 0)
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r->result = SOCKET_NO_FD;
+ else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = rte_vfio_clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
}
- close(conn_sock);
- }
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
- return -1;
- }
-
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
- }
-
- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
+ break;
+ r->req = SOCKET_CLR_GROUP;
+ r->group_no = m->group_no;
+ if (rte_vfio_clear_group(m->group_no) < 0)
+ r->result = SOCKET_NO_FD;
+ else
+ r->result = SOCKET_OK;
+ break;
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}
- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
+ strcpy(reply.name, EAL_VFIO_MP);
+ reply.len_param = sizeof(*r);
- return 0;
+ ret = rte_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+ close(fd);
+ return ret;
}
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
return 0;
}

Burakov, Anatoly

2018-04-05 14:39:29 UTC

Post by Tan, Jianfeng
Hi Anatoly,
An obvious action would be to change rte_mp_request to
rte_mp_request_sync(). Before I send out the new patch, do you have any
other comments on this patch?

Hi Jianfeng,

I don't think I do, but I'll have another look at it just in case, when
I get time.

Post by Tan, Jianfeng
Hi Thomas,
Several patches will change vfio; may I know your preferred order for
applying them? (I'm trying to find out which patch I should rebase on; of
course, I can wait until the other patches are applied.)
- http://dpdk.org/dev/patchwork/patch/37258/
- http://dpdk.org/dev/patchwork/patch/37152/
- http://dpdk.org/dev/patchwork/patch/37082/
- http://dpdk.org/dev/patchwork/patch/37047/
Thanks,
Jianfeng

--
Thanks,
Anatoly

Thomas Monjalon

2018-04-12 23:27:34 UTC

Post by Tan, Jianfeng
Hi Anatoly,
An obvious action would be to change rte_mp_request to
rte_mp_request_sync(). Before I send out the new patch, do you have any
other comments on this patch?
Hi Thomas,
Several patches will change vfio; may I know your preferred order for
applying them? (I'm trying to find out which patch I should rebase on; of
course, I can wait until the other patches are applied.)
- http://dpdk.org/dev/patchwork/patch/37258/
- http://dpdk.org/dev/patchwork/patch/37152/
- http://dpdk.org/dev/patchwork/patch/37082/
- http://dpdk.org/dev/patchwork/patch/37047/

All but the first one are applied now.
I guess you can rebase on master.

Burakov, Anatoly

2018-04-12 15:26:08 UTC

Post by Jianfeng Tan
Previously, vfio used its own private channel for the secondary
process to get the container fd and group fd from the primary process.
This patch changes it to use the generic mp channel.
1. Bind two NICs to vfio-pci.
2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1
---
v5->v6: (Address comments from Anatoly)
- Naming, return checking, logging.
- Move vfio action register after rte_bus_probe().

Acked-by: Anatoly Burakov <***@intel.com>

--
Thanks,
Anatoly

Jianfeng Tan

2018-04-15 15:06:19 UTC

Previously, vfio used its own private channel for the secondary
process to get the container fd and group fd from the primary process.

This patch changes it to use the generic mp channel.

Test:
1. Bind two NICs to vfio-pci.

2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1

Cc: ***@intel.com

Signed-off-by: Jianfeng Tan <***@intel.com>
Acked-by: Anatoly Burakov <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal.c | 22 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 178 +++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 17 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
4 files changed, 148 insertions(+), 479 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 99c2242..21afa73 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -696,24 +696,8 @@ rte_eal_iopl_init(void)
#ifdef VFIO_PRESENT
static int rte_eal_vfio_setup(void)
{
- int vfio_enabled = 0;
-
if (rte_vfio_enable("vfio"))
return -1;
- vfio_enabled = rte_vfio_is_enabled("vfio");
-
- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }

return 0;
}
@@ -970,6 +954,12 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+#ifdef VFIO_PRESENT
+ /* Register mp action after probe() so that we got enough info */
+ if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+ return -1;
+#endif
+
/* initialize default service/lcore mappings and start running. Ignore
* -ENOTSUP, as it indicates no service coremask passed to EAL.
*/
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 16ee730..957a537 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#include <inttypes.h>
@@ -290,6 +290,10 @@ rte_vfio_get_group_fd(int iommu_group_num)
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;

/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -350,50 +354,34 @@ rte_vfio_get_group_fd(int iommu_group_num)
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
- * process via our socket
+ * process via mp channel.
*/
- else {
- int socket_fd, ret;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_num) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- close(socket_fd);
- return 0;
- case SOCKET_OK:
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_num = iommu_group_num;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- default:
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ p->req = SOCKET_REQ_GROUP;
+ p->group_num = iommu_group_num;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cur_grp->group_num = iommu_group_num;
+ vfio_group_fd = mp_rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
+ } else if (p->result == SOCKET_NO_FD) {
+ RTE_LOG(ERR, EAL, " bad VFIO group fd\n");
+ vfio_group_fd = 0;
}
+ free(mp_reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;
}

@@ -481,7 +469,10 @@ int
rte_vfio_clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;

if (internal_config.process_type == RTE_PROC_PRIMARY) {

@@ -495,43 +486,27 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}

- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
+ p->req = SOCKET_CLR_GROUP;
+ p->group_num = vfio_group_fd;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK) {
+ free(mp_reply.msgs);
+ return 0;
+ } else if (p->result == SOCKET_NO_FD)
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
+ else
+ RTE_LOG(ERR, EAL, " no such VFIO group fd!\n");

- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
+ free(mp_reply.msgs);
}

- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- case SOCKET_OK:
- close(socket_fd);
- return 0;
- case SOCKET_ERR:
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- default:
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
return -1;
}

@@ -924,6 +899,11 @@ int
rte_vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+

/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -954,33 +934,29 @@ rte_vfio_get_container_fd(void)
}

return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
}
- close(socket_fd);
- return vfio_container_fd;
+ free(mp_reply.msgs);
}

+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
return -1;
}

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index c788bba..c8c6ee4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -84,15 +84,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS

/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -141,6 +132,8 @@ vfio_has_supported_extensions(int vfio_container_fd);

int vfio_mp_sync_setup(void);

+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
#define SOCKET_CLR_GROUP 0x300
@@ -148,6 +141,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF

+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_num;
+};
+
#endif /* VFIO_PRESENT */

#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index e19b571..9c202bb 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,16 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

+#include <unistd.h>
#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif

+#include <rte_compat.h>
#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>

-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"

/**
* @file
@@ -37,358 +21,78 @@

#ifdef VFIO_PRESENT

-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
+ int fd = -1;
int ret;
+ struct rte_mp_msg reply;
+ struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+ const struct vfio_mp_param *m =
+ (const struct vfio_mp_param *)msg->param;

- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (msg->len_param != sizeof(*m)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
+ memset(&reply, 0, sizeof(reply));

-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- case SOCKET_REQ_CONTAINER:
- fd = rte_vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- case SOCKET_REQ_GROUP:
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = rte_vfio_get_group_fd(vfio_data);
-
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (m->req) {
+ case SOCKET_REQ_GROUP:
+ r->req = SOCKET_REQ_GROUP;
+ r->group_num = m->group_num;
+ fd = rte_vfio_get_group_fd(m->group_num);
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else if (fd == 0)
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r->result = SOCKET_NO_FD;
+ else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- case SOCKET_CLR_GROUP:
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = rte_vfio_clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- default:
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
}
- close(conn_sock);
- }
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
- return -1;
- }
-
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
- }
-
- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
+ break;
+ case SOCKET_CLR_GROUP:
+ r->req = SOCKET_CLR_GROUP;
+ r->group_num = m->group_num;
+ if (rte_vfio_clear_group(m->group_num) < 0)
+ r->result = SOCKET_NO_FD;
+ else
+ r->result = SOCKET_OK;
+ break;
+ case SOCKET_REQ_CONTAINER:
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = rte_vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
+ default:
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
+ strcpy(reply.name, EAL_VFIO_MP);
+ reply.len_param = sizeof(*r);

- return 0;
+ ret = rte_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+ close(fd);
+ return ret;
}

-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);

return 0;
}

--
2.7.4

Tan, Jianfeng

2018-04-15 15:10:44 UTC

Sorry, I forgot the version change log. FYI:

v6->v7:
- Rebase on master.
v5->v6: (Address comments from Anatoly)
- Naming, return checking, logging.
- Move vfio action register after rte_bus_probe().

Post by Jianfeng Tan
Previously, vfio used its own private channel for the secondary
process to get container fd and group fd from the primary process.
This patch changes it to use the generic mp channel.
1. Bind two NICs to vfio-pci.
2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1
---
lib/librte_eal/linuxapp/eal/eal.c | 22 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 178 +++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 17 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
4 files changed, 148 insertions(+), 479 deletions(-)
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 99c2242..21afa73 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -696,24 +696,8 @@ rte_eal_iopl_init(void)
#ifdef VFIO_PRESENT
static int rte_eal_vfio_setup(void)
{
- int vfio_enabled = 0;
-
if (rte_vfio_enable("vfio"))
return -1;
- vfio_enabled = rte_vfio_is_enabled("vfio");
-
- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }
return 0;
}
@@ -970,6 +954,12 @@ rte_eal_init(int argc, char **argv)
return -1;
}
+#ifdef VFIO_PRESENT
+ /* Register mp action after probe() so that we got enough info */
+ if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+ return -1;
+#endif
+
/* initialize default service/lcore mappings and start running. Ignore
* -ENOTSUP, as it indicates no service coremask passed to EAL.
*/
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 16ee730..957a537 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/
#include <inttypes.h>
@@ -290,6 +290,10 @@ rte_vfio_get_group_fd(int iommu_group_num)
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -350,50 +354,34 @@ rte_vfio_get_group_fd(int iommu_group_num)
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
- * process via our socket
+ * process via mp channel.
*/
- else {
- int socket_fd, ret;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_num) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- close(socket_fd);
- return 0;
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_num = iommu_group_num;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ p->req = SOCKET_REQ_GROUP;
+ p->group_num = iommu_group_num;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ cur_grp->group_num = iommu_group_num;
+ vfio_group_fd = mp_rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
+ } else if (p->result == SOCKET_NO_FD) {
+ RTE_LOG(ERR, EAL, " bad VFIO group fd\n");
+ vfio_group_fd = 0;
}
+ free(mp_reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;
}
@@ -481,7 +469,10 @@ int
rte_vfio_clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -495,43 +486,27 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}
- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
+ p->req = SOCKET_CLR_GROUP;
+ p->group_num = vfio_group_fd;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK) {
+ free(mp_reply.msgs);
+ return 0;
+ } else if (p->result == SOCKET_NO_FD)
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
+ else
+ RTE_LOG(ERR, EAL, " no such VFIO group fd!\n");
- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
+ free(mp_reply.msgs);
}
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- close(socket_fd);
- return 0;
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
return -1;
}
@@ -924,6 +899,11 @@ int
rte_vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -954,33 +934,29 @@ rte_vfio_get_container_fd(void)
}
return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
}
- close(socket_fd);
- return vfio_container_fd;
+ free(mp_reply.msgs);
}
+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
return -1;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index c788bba..c8c6ee4 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -84,15 +84,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -141,6 +132,8 @@ vfio_has_supported_extensions(int vfio_container_fd);
int vfio_mp_sync_setup(void);
+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
#define SOCKET_CLR_GROUP 0x300
@@ -148,6 +141,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF
+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_num;
+};
+
#endif /* VFIO_PRESENT */
#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index e19b571..9c202bb 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,16 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/
+#include <unistd.h>
#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif
+#include <rte_compat.h>
#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>
-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"
/**
@@ -37,358 +21,78 @@
#ifdef VFIO_PRESENT
-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
+ int fd = -1;
int ret;
+ struct rte_mp_msg reply;
+ struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+ const struct vfio_mp_param *m =
+ (const struct vfio_mp_param *)msg->param;
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (msg->len_param != sizeof(*m)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
+ memset(&reply, 0, sizeof(reply));
-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- fd = rte_vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = rte_vfio_get_group_fd(vfio_data);
-
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (m->req) {
+ r->req = SOCKET_REQ_GROUP;
+ r->group_num = m->group_num;
+ fd = rte_vfio_get_group_fd(m->group_num);
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else if (fd == 0)
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r->result = SOCKET_NO_FD;
+ else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = rte_vfio_clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
}
- close(conn_sock);
- }
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
- return -1;
- }
-
- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
- }
-
- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
+ break;
+ r->req = SOCKET_CLR_GROUP;
+ r->group_num = m->group_num;
+ if (rte_vfio_clear_group(m->group_num) < 0)
+ r->result = SOCKET_NO_FD;
+ else
+ r->result = SOCKET_OK;
+ break;
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = rte_vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}
- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
+ strcpy(reply.name, EAL_VFIO_MP);
+ reply.len_param = sizeof(*r);
- return 0;
+ ret = rte_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+ close(fd);
+ return ret;
}
-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
return 0;
}

Thomas Monjalon

2018-04-17 23:04:45 UTC

Post by Jianfeng Tan
Previously, vfio uses its own private channel for the secondary
process to get container fd and group fd from the primary process.
This patch changes to use the generic mp channel.
1. Bind two NICs to vfio-pci.
2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1

Applied, thanks

Jianfeng Tan

2018-01-25 04:16:20 UTC

v2->v3:
- Add pre-checks for each API.
- Remove the limitation of 8 secondary processes: discard the original
register/unregister mechanism for secondary processes; instead, the primary
discovers secondary processes by scanning the folder for names matching a pattern.
- The previous implementation used two sockets for msg and request; this version
uses just one socket and receives all kinds of messages in the mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
- Use datagram unix socket to supersede stream unix socket + epoll.
- Change the secondary add/del mechanism as now we use connection-less channel.
- Add mp_mutex_action to sync action register/unregister/reference.
- Limit max length of action name to 64B.
- New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
- Formalize the errno handle.
- Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and how-to;
Patch 2: add a synchronous way for the requests which need an immediate response.
Patch 3: Rework vfio to use this generic communication channel.

Jianfeng Tan (3):
eal: add channel for multi-process communication
eal: add synchronous multi-process communication
vfio: use the generic multi-process channel

doc/guides/rel_notes/release_18_02.rst | 15 +
lib/librte_eal/common/eal_common_proc.c | 593 ++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 17 +
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 131 ++++++
lib/librte_eal/linuxapp/eal/eal.c | 22 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 172 +++----
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 +++--------------
lib/librte_eal/rte_eal_version.map | 6 +
10 files changed, 915 insertions(+), 474 deletions(-)

--
2.7.4

Jianfeng Tan

2018-01-25 04:16:21 UTC

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
a. Secondary wants to send info to primary, for example, secondary
would like to send a request (about some specific vdev) to primary.
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

1. rte_eal_mp_action_register() is used to register an action,
indexed by a string, when a component at the receiver side would like
to respond to the messages from the peer process.
2. rte_eal_mp_action_unregister() is used to unregister the action
if the calling component does not want to respond to the messages.
3. rte_eal_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.

Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/common/eal_common_proc.c | 390 +++++++++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 17 ++
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 75 ++++++
lib/librte_eal/linuxapp/eal/eal.c | 8 +
lib/librte_eal/rte_eal_version.map | 3 +
6 files changed, 502 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..baeb7d1 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -2,14 +2,48 @@
* Copyright(c) 2016 Intel Corporation
*/

-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
#include <fcntl.h>
+#include <fnmatch.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_log.h>
#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_common.h>

+#include "eal_private.h"
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"

+static int mp_fd = -1;
+static char mp_filter[PATH_MAX]; /* Filter for secondary process sockets */
+static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+ TAILQ_ENTRY(action_entry) next;
+ char action_name[RTE_MP_MAX_NAME_LEN];
+ rte_eal_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+ TAILQ_HEAD_INITIALIZER(action_entry_list);
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -31,3 +65,357 @@ rte_eal_primary_proc_alive(const char *config_file_path)

return !!ret;
}
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+ struct action_entry *entry;
+
+ TAILQ_FOREACH(entry, &action_entry_list, next) {
+ if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0)
+ break;
+ }
+
+ return entry;
+}
+
+static bool
+validate_action_name(const char *name)
+{
+ if (name == NULL) {
+ RTE_LOG(ERR, EAL, "Action name cannot be NULL\n");
+ rte_errno = -EINVAL;
+ return false;
+ }
+ if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) {
+ RTE_LOG(ERR, EAL, "Length of action name is zero\n");
+ rte_errno = -EINVAL;
+ return false;
+ }
+ if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) {
+ rte_errno = -E2BIG;
+ return false;
+ }
+ return true;
+}
+
+int
+rte_eal_mp_action_register(const char *name, rte_eal_mp_t action)
+{
+ struct action_entry *entry;
+
+ if(!validate_action_name(name))
+ return -1;
+
+ entry = malloc(sizeof(struct action_entry));
+ if (entry == NULL) {
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+ strcpy(entry->action_name, name);
+ entry->action = action;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ if (find_action_entry_by_name(name) != NULL) {
+ pthread_mutex_unlock(&mp_mutex_action);
+ rte_errno = -EEXIST;
+ free(entry);
+ return -1;
+ }
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return 0;
+}
+
+void
+rte_eal_mp_action_unregister(const char *name)
+{
+ struct action_entry *entry;
+
+ if(!validate_action_name(name))
+ return;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(name);
+ if (entry == NULL) {
+ pthread_mutex_unlock(&mp_mutex_action);
+ return;
+ }
+ TAILQ_REMOVE(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ free(entry);
+}
+
+static int
+read_msg(struct rte_mp_msg *msg)
+{
+ int msglen;
+ struct iovec iov;
+ struct msghdr msgh;
+ char control[CMSG_SPACE(sizeof(msg->fds))];
+ struct cmsghdr *cmsg;
+ int buflen = sizeof(*msg) - sizeof(msg->fds);
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = msg;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ msglen = recvmsg(mp_fd, &msgh, 0);
+ if (msglen < 0) {
+ RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+ return -1;
+ }
+
+ if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ RTE_LOG(ERR, EAL, "truncted msg\n");
+ return -1;
+ }
+
+ /* read auxiliary FDs if any */
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void
+process_msg(struct rte_mp_msg *msg)
+{
+ struct action_entry *entry;
+ rte_eal_mp_t action = NULL;
+
+ RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(msg->name);
+ if (entry != NULL)
+ action = entry->action;
+ pthread_mutex_unlock(&mp_mutex_action);
+
+ if (!action)
+ RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
+ else if (action(msg) < 0)
+ RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+ struct rte_mp_msg msg;
+
+ while (1) {
+ if (read_msg(&msg) == 0)
+ process_msg(&msg);
+ }
+
+ return NULL;
+}
+
+static int
+open_socket_fd(void)
+{
+ struct sockaddr_un un;
+ const char *prefix = eal_mp_socket_path();
+
+ mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+ if (mp_fd < 0) {
+ RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix);
+ else
+ snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d",
+ prefix, getpid());
+ unlink(un.sun_path); /* May still exist since last run */
+ if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+ RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+ un.sun_path, strerror(errno));
+ close(mp_fd);
+ return -1;
+ }
+
+ RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
+ return mp_fd;
+}
+
+static void
+unlink_sockets(void)
+{
+ int dir_fd;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+ return;
+ }
+ dir_fd = dirfd(mp_dir);
+
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) == 0)
+ unlinkat(dir_fd, ent->d_name, 0);
+ }
+
+ closedir(mp_dir);
+}
+
+int
+rte_eal_mp_channel_init(void)
+{
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ char *path;
+ pthread_t tid;
+
+ snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+ internal_config.hugefile_prefix);
+
+ path = strdup(eal_mp_socket_path());
+ snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+ free(path);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ unlink_sockets();
+
+ if (open_socket_fd() < 0)
+ return -1;
+
+ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+
+ if (pthread_create(&tid, NULL, mp_handle, NULL) == 0) {
+ /* try best to set thread name */
+ rte_thread_setname(tid, thread_name);
+ return 0;
+ }
+
+ RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", strerror(errno));
+ close(mp_fd);
+ mp_fd = -1;
+ return -1;
+}
+
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+ int snd;
+ struct iovec iov;
+ struct msghdr msgh;
+ struct cmsghdr *cmsg;
+ struct sockaddr_un dst;
+ int fd_size = msg->num_fds * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+
+ memset(&dst, 0, sizeof(dst));
+ dst.sun_family = AF_UNIX;
+ snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = msg;
+ iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+
+ msgh.msg_name = &dst;
+ msgh.msg_namelen = sizeof(dst);
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), msg->fds, fd_size);
+
+ do {
+ snd = sendmsg(mp_fd, &msgh, 0);
+ } while (snd < 0 && errno == EINTR);
+
+ if (snd > 0)
+ return 1;
+
+ RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n",
+ dst_path, strerror(errno));
+ return 0;
+}
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+ int n = 0;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* broadcast to all secondary processes */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+ mp_dir_path);
+ return 0;
+ }
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ n += send_msg(ent->d_name, msg);
+ }
+ closedir(mp_dir);
+ } else
+ n += send_msg(eal_mp_socket_path(), msg);
+
+ return n;
+}
+
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+ if (msg == NULL) {
+ RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+ rte_errno = -EINVAL;
+ return false;
+ }
+
+ if (!validate_action_name(msg->name))
+ return false;
+
+ if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+ RTE_LOG(ERR, EAL, "Message data is too long\n");
+ rte_errno = -E2BIG;
+ return false;
+ }
+
+ if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+ RTE_MP_MAX_FD_NUM);
+ rte_errno = -E2BIG;
+ return false;
+ }
+
+ return true;
+}
+
+int
+rte_eal_mp_sendmsg(struct rte_mp_msg *msg)
+{
+ if (!check_input(msg))
+ return -1;
+
+ RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
+ return mp_send(msg);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..3b2929d 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
return buffer;
}

+/** Path of primary/secondary communication unix socket file. */
+#define MP_SOCKET_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_socket_path(void)
+{
+ static char buffer[PATH_MAX]; /* static so auto-zeroed */
+ const char *directory = default_config_dir;
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT,
+ directory, internal_config.hugefile_prefix);
+
+ return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..e36e3b5 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
*/
struct rte_bus *rte_bus_find_by_device_name(const char *str);

+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ * 0 on success;
+ * (<0) on failure.
+ */
+
+int rte_eal_mp_channel_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 2aba2c8..9a1aac2 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -186,6 +186,81 @@ int rte_eal_init(int argc, char **argv);
*/
int rte_eal_primary_proc_alive(const char *config_file_path);

+#define RTE_MP_MAX_FD_NUM 8 /* The max amount of fds */
+#define RTE_MP_MAX_NAME_LEN 64 /* The max length of action name */
+#define RTE_MP_MAX_PARAM_LEN 256 /* The max length of param */
+struct rte_mp_msg {
+ char name[RTE_MP_MAX_NAME_LEN];
+ int len_param;
+ int num_fds;
+ uint8_t param[RTE_MP_MAX_PARAM_LEN];
+ int fds[RTE_MP_MAX_FD_NUM];
+};
+
+/**
+ * Action function typedef used by other components.
+ *
+ * As we create socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_eal_mp_t)(const struct rte_mp_msg *msg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param name
+ * The name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ * The action argument is the function pointer to the action function.
+ *
+ * @return
+ * - 0 on success.
+ * - (<0) on failure.
+ */
+int rte_eal_mp_action_register(const char *name, rte_eal_mp_t action);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param name
+ * The name argument plays as the nonredundant key to find the action.
+ *
+ */
+void rte_eal_mp_action_unregister(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by name in the peer process.
+ *
+ * @param msg
+ * The msg argument contains the customized message.
+ *
+ * @return
+ * - (<0) on invalid parameters;
+ * - (>=0) as the number of messages being sent successfully.
+ */
+int rte_eal_mp_sendmsg(struct rte_mp_msg *msg);
+
/**
* Usage function typedef used by the application usage function.
*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..ad44ab5 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -852,6 +852,14 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ if (rte_eal_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
#ifdef VFIO_PRESENT
if (rte_eal_vfio_setup() < 0) {
rte_eal_init_alert("Cannot init VFIO\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7088b72..adeadfb 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -217,6 +217,9 @@ EXPERIMENTAL {
rte_eal_devargs_remove;
rte_eal_hotplug_add;
rte_eal_hotplug_remove;
+ rte_eal_mp_action_register;
+ rte_eal_mp_action_unregister;
+ rte_eal_mp_sendmsg;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Thomas Monjalon

2018-01-25 10:41:24 UTC

Post by Jianfeng Tan
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
+ rte_eal_mp_action_register;
+ rte_eal_mp_action_unregister;
+ rte_eal_mp_sendmsg;

Just a naming comment:
I think you can drop "eal" in function names.
"rte_mp_" is a good prefix for multi-process management.

Burakov, Anatoly

2018-01-25 11:27:35 UTC

Overall on this patch:

Reviewed-by: Anatoly Burakov <***@intel.com>

There are a few nitpicks below in comments.

Also, as a general note, i would prefer for sendmsg API's to return 0 on
success and -1 on failure, as number of sent messages is not only
meaningless to the user (since there's no way to tell if the value
returned is the value we expected), but also makes the API unintuitive
and prone to usage errors when using common "if (sendmsg()) {// error}"
idiom. However, i'm fine with leaving it as is, if everyone else is.
It's an experimental API, so we can change it later if need be.

Post by Jianfeng Tan
Previouly, there are three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.
It'd be good to have a generic communication channel for multi-process
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to response immediately.
This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.
1. rte_eal_mp_action_register() is used to register an action,
indexed by a string, when a component at receiver side would like
to response the messages from the peer processe.
2. rte_eal_mp_action_unregister() is used to unregister the action
if the calling component does not want to response the messages.
3. rte_eal_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.
---
lib/librte_eal/common/eal_common_proc.c | 390 +++++++++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 17 ++
lib/librte_eal/common/eal_private.h | 10 +
lib/librte_eal/common/include/rte_eal.h | 75 ++++++
lib/librte_eal/linuxapp/eal/eal.c | 8 +
lib/librte_eal/rte_eal_version.map | 3 +
6 files changed, 502 insertions(+), 1 deletion(-)
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..baeb7d1 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -2,14 +2,48 @@
* Copyright(c) 2016 Intel Corporation

Nitpicking - making substantial changes to this file should probably
be accompanied by an update of the copyright year (2016-2018?).

Post by Jianfeng Tan
*/
-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
#include <fcntl.h>
+#include <fnmatch.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+

<snip>

Post by Jianfeng Tan
+int
+rte_eal_mp_channel_init(void)
+{
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ char *path;
+ pthread_t tid;
+
+ snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+ internal_config.hugefile_prefix);
+
+ path = strdup(eal_mp_socket_path());
+ snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+ free(path);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ unlink_sockets();
+
+ if (open_socket_fd() < 0)
+ return -1;
+
+ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+
+ if (pthread_create(&tid, NULL, mp_handle, NULL) == 0) {
+ /* try best to set thread name */
+ rte_thread_setname(tid, thread_name);
+ return 0;
+ }
+
+ RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n", strerror(errno));
+ close(mp_fd);
+ mp_fd = -1;
+ return -1;

Nitpicking: looks weird, usually early exit is for failures, not
success. Maybe move the error part under (pthread_create() != 0).

Post by Jianfeng Tan
+}
+
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+ int snd;
+ struct iovec iov;
+ struct msghdr msgh;
+ struct cmsghdr *cmsg;
+ struct sockaddr_un dst;
+ int fd_size = msg->num_fds * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+
+ memset(&dst, 0, sizeof(dst));

--
Thanks,
Anatoly

Thomas Monjalon

2018-01-25 11:34:46 UTC

Post by Burakov, Anatoly
Also, as a general note, i would prefer for sendmsg API's to return 0 on
success and -1 on failure, as number of sent messages is not only
meaningless to the user (since there's no way to tell if the value
returned is the value we expected), but also makes the API unintuitive
and prone to usage errors when using common "if (sendmsg()) {// error}"
idiom. However, i'm fine with leaving it as is, if everyone else is.

I have not reviewed it, but I feel you are right.

Ananyev, Konstantin

2018-01-25 12:21:45 UTC

Post by Jianfeng Tan
Previouly, there are three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.
It'd be good to have a generic communication channel for multi-process
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to response immediately.
This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.
1. rte_eal_mp_action_register() is used to register an action,
indexed by a string, when a component at receiver side would like
to response the messages from the peer processe.
2. rte_eal_mp_action_unregister() is used to unregister the action
if the calling component does not want to response the messages.
3. rte_eal_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.
---

Acked-by: Konstantin Ananyev <***@intel.com>

Jianfeng Tan

2018-01-25 04:16:22 UTC

We need a synchronous way for multi-process communication,
i.e., blocking while waiting for a reply message when we send a request
to the peer process.

We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.

The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.

sender-process receiver-process
---------------------- ----------------

thread-n
|_rte_eal_mp_request() ----------> mp-thread
|_timedwait() |_process_msg()
|_action()
|_rte_eal_mp_reply()
mp_thread <---------------------|
|_process_msg()
|_signal(send_thread)
thread-m <----------|
|_collect-reply

* A secondary process is only allowed to talk to the primary process.
* If there are multiple secondary processes for the primary process,
it will send request to peer1, collect response from peer1; then
send request to peer2, collect response from peer2, and so on.
* When thread-n is sending request, thread-m of that process can send
request at the same time.
* For pair <action_name, peer>, we guarantee that only one such request
is on the fly.

Suggested-by: Anatoly Burakov <***@intel.com>
Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
---
doc/guides/rel_notes/release_18_02.rst | 15 ++
lib/librte_eal/common/eal_common_proc.c | 237 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 58 +++++++-
lib/librte_eal/rte_eal_version.map | 3 +
4 files changed, 295 insertions(+), 18 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 00b3224..f6ed666 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -151,6 +151,21 @@ New Features
renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
to PMD agnostic ``eventdev_pipeline``.

+* **Added new multi-process communication channel**
+
+ Added a generic channel in EAL for multi-process (primary/secondary) synchronous
+ and asynchronous communication. Each component that wants to respond to a message
+ shall register the action; and each process has a thread to receive the message
+ and invokes the registered action. The list of new APIs:
+
+ * ``rte_eal_mp_action_register``
+ * ``rte_eal_mp_action_unregister``
+ * ``rte_eal_mp_sendmsg``
+ * ``rte_eal_mp_request``
+ * ``rte_eal_mp_reply``
+
+ Note as we changed to use the new channel for communication, applications cannot
+ talk with old version through the old (private) communication channel.

API Changes
-----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index baeb7d1..69df943 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
@@ -44,6 +45,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
static struct action_entry_list action_entry_list =
TAILQ_HEAD_INITIALIZER(action_entry_list);

+enum mp_type {
+ MP_MSG, /* Share message with peers, will not block */
+ MP_REQ, /* Request for information, Will block for a reply */
+ MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+ int type;
+ struct rte_mp_msg msg;
+};
+
+struct sync_request {
+ int reply_received;
+ char dst[PATH_MAX];
+ struct rte_mp_msg *request;
+ struct rte_mp_msg *reply;
+ pthread_cond_t cond;
+ TAILQ_ENTRY(sync_request) next;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+ struct sync_request_list requests;
+ pthread_mutex_t lock;
+} sync_requests = {
+ .requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+ struct sync_request *r;
+
+ TAILQ_FOREACH(r, &sync_requests.requests, next) {
+ if (!strcmp(r->dst, dst) &&
+ !strcmp(r->request->name, act_name))
+ break;
+ }
+
+ return r;
+}
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -147,19 +192,21 @@ rte_eal_mp_action_unregister(const char *name)
}

static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
int msglen;
struct iovec iov;
struct msghdr msgh;
- char control[CMSG_SPACE(sizeof(msg->fds))];
+ char control[CMSG_SPACE(sizeof(m->msg.fds))];
struct cmsghdr *cmsg;
- int buflen = sizeof(*msg) - sizeof(msg->fds);
+ int buflen = sizeof(*m) - sizeof(m->msg.fds);

memset(&msgh, 0, sizeof(msgh));
- iov.iov_base = msg;
+ iov.iov_base = m;
iov.iov_len = buflen;

+ msgh.msg_name = s;
+ msgh.msg_namelen = sizeof(*s);
msgh.msg_iov = &iov;
msgh.msg_iovlen = 1;
msgh.msg_control = control;
@@ -181,7 +228,7 @@ read_msg(struct rte_mp_msg *msg)
cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
if ((cmsg->cmsg_level == SOL_SOCKET) &&
(cmsg->cmsg_type == SCM_RIGHTS)) {
- memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
break;
}
}
@@ -190,12 +237,28 @@ read_msg(struct rte_mp_msg *msg)
}

static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
+ struct sync_request *sync_req;
struct action_entry *entry;
+ struct rte_mp_msg *msg = &m->msg;
rte_eal_mp_t action = NULL;

RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+ if (m->type == MP_REP) {
+ pthread_mutex_lock(&sync_requests.lock);
+ sync_req = find_sync_request(s->sun_path, msg->name);
+ if (sync_req) {
+ memcpy(sync_req->reply, msg, sizeof(*msg));
+ sync_req->reply_received = 1;
+ pthread_cond_signal(&sync_req->cond);
+ } else
+ RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+ pthread_mutex_unlock(&sync_requests.lock);
+ return;
+ }
+
pthread_mutex_lock(&mp_mutex_action);
entry = find_action_entry_by_name(msg->name);
if (entry != NULL)
@@ -204,18 +267,19 @@ process_msg(struct rte_mp_msg *msg)

if (!action)
RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
- else if (action(msg) < 0)
+ else if (action(msg, s->sun_path) < 0)
RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
}

static void *
mp_handle(void *arg __rte_unused)
{
- struct rte_mp_msg msg;
+ struct mp_msg_internal msg;
+ struct sockaddr_un sa;

while (1) {
- if (read_msg(&msg) == 0)
- process_msg(&msg);
+ if (read_msg(&msg, &sa) == 0)
+ process_msg(&msg, &sa);
}

return NULL;
@@ -309,16 +373,20 @@ rte_eal_mp_channel_init(void)
}

static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
{
int snd;
struct iovec iov;
struct msghdr msgh;
struct cmsghdr *cmsg;
struct sockaddr_un dst;
+ struct mp_msg_internal m;
int fd_size = msg->num_fds * sizeof(int);
char control[CMSG_SPACE(fd_size)];

+ m.type = type;
+ memcpy(&m.msg, msg, sizeof(*msg));
+
memset(&dst, 0, sizeof(dst));
dst.sun_family = AF_UNIX;
snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -326,8 +394,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
memset(&msgh, 0, sizeof(msgh));
memset(control, 0, sizeof(control));

- iov.iov_base = msg;
- iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+ iov.iov_base = &m;
+ iov.iov_len = sizeof(m) - sizeof(msg->fds);

msgh.msg_name = &dst;
msgh.msg_namelen = sizeof(dst);
@@ -355,12 +423,16 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
}

static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
{
int n = 0;
DIR *mp_dir;
struct dirent *ent;

+
+ if (peer)
+ return send_msg(peer, msg, type);
+
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
/* broadcast to all secondary processes */
mp_dir = opendir(mp_dir_path);
@@ -373,11 +445,11 @@ mp_send(struct rte_mp_msg *msg)
if (fnmatch(mp_filter, ent->d_name, 0) != 0)
continue;

- n += send_msg(ent->d_name, msg);
+ n += send_msg(ent->d_name, msg, type);
}
closedir(mp_dir);
} else
- n += send_msg(eal_mp_socket_path(), msg);
+ n += send_msg(eal_mp_socket_path(), msg, type);

return n;
}
@@ -417,5 +489,136 @@ rte_eal_mp_sendmsg(struct rte_mp_msg *msg)
return -1;

RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
- return mp_send(msg);
+ return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+ struct rte_mp_reply *reply, const struct timespec *ts)
+{
+ struct timeval now;
+ struct rte_mp_msg msg, *tmp;
+ struct sync_request sync_req, *exist;
+
+ sync_req.reply_received = 0;
+ strcpy(sync_req.dst, dst);
+ sync_req.request = req;
+ sync_req.reply = &msg;
+ pthread_cond_init(&sync_req.cond, NULL);
+
+ pthread_mutex_lock(&sync_requests.lock);
+ exist = find_sync_request(dst, req->name);
+ if (!exist)
+ TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+ if (exist) {
+ RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+ return 0;
+ }
+
+ if (send_msg(dst, req, MP_REQ) != 1) {
+ RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+ dst, req->name);
+ return 0;
+ }
+
+ pthread_mutex_lock(&sync_requests.lock);
+ do {
+ pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+ /* Check spurious wakeups */
+ if (sync_req.reply_received == 1)
+ break;
+ /* Check if time is out */
+ if (gettimeofday(&now, NULL) < 0)
+ break;
+ if (now.tv_sec < ts->tv_sec)
+ break;
+ else if (now.tv_sec == ts->tv_sec &&
+ now.tv_usec * 1000 < ts->tv_nsec)
+ break;
+ } while (1);
+ /* We got the lock now */
+ TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+
+ if (sync_req.reply_received == 0) {
+ RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+ dst, req->name);
+ return 1;
+ }
+
+ tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_msgs + 1));
+ if (!tmp) {
+ RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+ dst, req->name);
+ return 1;
+ }
+ memcpy(&tmp[reply->nb_msgs], &msg, sizeof(msg));
+ reply->msgs = tmp;
+ reply->nb_msgs++;
+ return 1;
+}
+
+int
+rte_eal_mp_request(struct rte_mp_msg *req,
+ struct rte_mp_reply *reply,
+ const struct timespec *ts)
+{
+ DIR *mp_dir;
+ struct dirent *ent;
+ int nb_snds = 0;
+ struct timeval now;
+ struct timespec end;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+ if (check_input(req) == false)
+ return -1;
+ if (gettimeofday(&now, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Faile to get current time\n");
+ return -1;
+ }
+ end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+ end.tv_sec = now.tv_sec + ts->tv_sec +
+ (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+ reply->nb_msgs = 0;
+ reply->msgs = NULL;
+
+ /* for secondary process, send request to the primary process only */
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+ /* for primary process, broadcast request, and collect reply 1 by 1 */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+ return -1;
+ }
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ nb_snds += mp_request_one(ent->d_name, req, reply, &end);
+ }
+ closedir(mp_dir);
+
+ return nb_snds;
+}
+
+int
+rte_eal_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+ RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+ if (check_input(msg) == false)
+ return -1;
+
+ if (peer == NULL) {
+ RTE_LOG(ERR, EAL, "peer is not specified\n");
+ return -1;
+ }
+
+ return mp_send(msg, peer, MP_REP);
}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 9a1aac2..8e234e0 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@

#include <stdint.h>
#include <sched.h>
+#include <time.h>

#include <rte_config.h>
#include <rte_per_lcore.h>
@@ -197,13 +198,18 @@ struct rte_mp_msg {
int fds[RTE_MP_MAX_FD_NUM];
};

+struct rte_mp_reply {
+ int nb_msgs;
+ struct rte_mp_msg *msgs;
+};
+
/**
* Action function typedef used by other components.
*
* As we create socket channel for primary/secondary communication, use
* this function typedef to register action for coming messages.
*/
-typedef int (*rte_eal_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_eal_mp_t)(const struct rte_mp_msg *msg, const void *peer);

/**
* @warning
@@ -262,6 +268,56 @@ void rte_eal_mp_action_unregister(const char *name);
int rte_eal_mp_sendmsg(struct rte_mp_msg *msg);

/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ * The req argument contains the customized request message.
+ *
+ * @param reply
+ * The reply argument is used to store all the reply messages;
+ * the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ * The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ * - (<0) on invalid parameters;
+ * - (>=0) as the number of messages being sent successfully.
+ */
+int rte_eal_mp_request(struct rte_mp_msg *req,
+ struct rte_mp_reply *reply, const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ * The msg argument contains the customized message.
+ *
+ * @param peer
+ * The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ * - (1) on success;
+ * - (0) on failure;
+ * - (<0) on invalid parameters.
+ */
+int rte_eal_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index adeadfb..3015bc6 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,9 @@ EXPERIMENTAL {
rte_eal_mp_action_register;
rte_eal_mp_action_unregister;
rte_eal_mp_sendmsg;
+ rte_eal_mp_request;
+ rte_eal_mp_reply;
+ rte_eal_mp_sendmsg;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Burakov, Anatoly

2018-01-25 12:00:12 UTC

On the overall patch,

Reviewed-by: Anatoly Burakov <***@intel.com>

For request(), returning number of replies received actually makes
sense, because now we can use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Few comments below.

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
sender-process receiver-process
---------------------- ----------------
thread-n
|_rte_eal_mp_request() ----------> mp-thread
|_timedwait() |_process_msg()
|_action()
|_rte_eal_mp_reply()
mp_thread <---------------------|
|_process_msg()
|_signal(send_thread)
thread-m <----------|
|_collect-reply
* A secondary process is only allowed to talk to the primary process.
* If there are multiple secondary processes for the primary proces,
it will send request to peer1, collect response from peer1; then
send request to peer2, collect reponse from peer2, and so on.
* When thread-n is sending request, thread-m of that process can send
request at the same time.
* For pair <action_name, peer>, we guarantee that only one such request
is on the fly.
---
doc/guides/rel_notes/release_18_02.rst | 15 ++
lib/librte_eal/common/eal_common_proc.c | 237 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 58 +++++++-
lib/librte_eal/rte_eal_version.map | 3 +
4 files changed, 295 insertions(+), 18 deletions(-)
diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 00b3224..f6ed666 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -151,6 +151,21 @@ New Features
renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
to PMD agnostic ``eventdev_pipeline``.
+* **Added new multi-process communication channel**
+
+ Added a generic channel in EAL for multi-process (primary/secondary) synchronous
+ and asynchronous communication. Each component who wants to reponse a message
+ shall register the action; and each process has a thread to receive the message
+
+ * ``rte_eal_mp_register``
+ * ``rte_eal_mp_unregister``
+ * ``rte_eal_mp_sendmsg``
+ * ``rte_eal_mp_request``
+ * ``rte_eal_mp_reply``
+
+ Note as we changed to use the new channel for communication, applications cannot
+ talk with old version through the old (private) communication channel.

Some of this should've probably been added into previous patch.

Post by Jianfeng Tan
API Changes
-----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index baeb7d1..69df943 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
@@ -44,6 +45,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
static struct action_entry_list action_entry_list =
TAILQ_HEAD_INITIALIZER(action_entry_list);

<snip>

Post by Jianfeng Tan
+ return 0;
+ }
+
+ if (send_msg(dst, req, MP_REQ) != 1) {
+ RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+ dst, req->name);
+ return 0;
+ }
+
+ pthread_mutex_lock(&sync_requests.lock);
+ do {
+ pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+ /* Check spurious wakeups */
+ if (sync_req.reply_received == 1)
+ break;
+ /* Check if time is out */
+ if (gettimeofday(&now, NULL) < 0)
+ break;
+ if (now.tv_sec < ts->tv_sec)
+ break;
+ else if (now.tv_sec == ts->tv_sec &&
+ now.tv_usec * 1000 < ts->tv_nsec)
+ break;
+ } while (1);
+ /* We got the lock now */
+ TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+
+ if (sync_req.reply_received == 0) {
+ RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+ dst, req->name);
+ return 1;

Why are we returning 1 here? There was no reply, so no reply structure
was allocated. This looks like a potential buffer overflow on trying to
read replies if one of them wasn't delivered.

Post by Jianfeng Tan
+ }
+
+ tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_msgs + 1));
+ if (!tmp) {
+ RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+ dst, req->name);
+ return 1;
+ }

Same here - we couldn't allocate a reply, so it won't get to the user.
Why return 1 here?

Post by Jianfeng Tan
+ memcpy(&tmp[reply->nb_msgs], &msg, sizeof(msg));
+ reply->msgs = tmp;
+ reply->nb_msgs++;
+ return 1;
+}
+
+int
+rte_eal_mp_request(struct rte_mp_msg *req,
+ struct rte_mp_reply *reply,
+ const struct timespec *ts)
+{
+ DIR *mp_dir;
+ struct dirent *ent;
+ int nb_snds = 0;
+ struct timeval now;
+ struct timespec end;
+

<snip>

Post by Jianfeng Tan
/**
@@ -262,6 +268,56 @@ void rte_eal_mp_action_unregister(const char *name);
int rte_eal_mp_sendmsg(struct rte_mp_msg *msg);
/**
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ *
+ * The req argument contains the customized request message.
+ *
+ * The reply argument will be for storing all the replied messages;
+ * the caller is responsible for free reply->replies.
+ *
+ * The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * - (<0) on invalid parameters;
+ * - (>=0) as the number of messages being sent successfully.
+ */
+int rte_eal_mp_request(struct rte_mp_msg *req,
+ struct rte_mp_reply *reply, const struct timespec *ts);

See above: it would be much more useful to return number of replies
received, rather than number of messages sent, as that's the number we
are most interested in. Otherwise, if we e.g. sent 5 messages but
received 1 reply, you're essentially not telling the user how far can he
index the reply pointer.

Post by Jianfeng Tan
+
+/**
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * The msg argument contains the customized message.
+ *
+ * The peer argument is the pointer to the peer socket path.
+ *
+ * - (1) on success;
+ * - (0) on failure;
+ * - (<0) on invalid parameters.
+ */
+int rte_eal_mp_reply(struct rte_mp_msg *msg, const char *peer);

I don't think there's much point in making distinction between invalid
parameters and failure.

Post by Jianfeng Tan
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index adeadfb..3015bc6 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,9 @@ EXPERIMENTAL {
rte_eal_mp_action_register;
rte_eal_mp_action_unregister;
rte_eal_mp_sendmsg;
+ rte_eal_mp_request;
+ rte_eal_mp_reply;
+ rte_eal_mp_sendmsg;

You're adding rte_eal_mp_sendmsg twice.

Post by Jianfeng Tan
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
Thanks,
Anatoly

Burakov, Anatoly

2018-01-25 12:19:23 UTC

Post by Burakov, Anatoly
See above: it would be much more useful to return number of replies
received, rather than number of messages sent, as that's the number we
are most interested in. Otherwise, if we e.g. sent 5 messages but
received 1 reply, you're essentially not telling the user how far can he
index the reply pointer.

Apologies, just noticed that rte_mp_reply has nb_msgs in it. So if
we are getting number of replies along with reply, this API should also
switch to 0/-1 on success/failure respectively, as the number of sent
messages also becomes meaningless to the user.

--
Thanks,
Anatoly

Ananyev, Konstantin

2018-01-25 12:19:23 UTC

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sent requests with number of
received replies and decide whether it should be considered a failure or not.

Post by Jianfeng Tan
+ return 0;
+ }
+
+ if (send_msg(dst, req, MP_REQ) != 1) {
+ RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+ dst, req->name);
+ return 0;
+ }
+
+ pthread_mutex_lock(&sync_requests.lock);
+ do {
+ pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+ /* Check spurious wakeups */
+ if (sync_req.reply_received == 1)
+ break;
+ /* Check if time is out */
+ if (gettimeofday(&now, NULL) < 0)
+ break;
+ if (now.tv_sec < ts->tv_sec)
+ break;
+ else if (now.tv_sec == ts->tv_sec &&
+ now.tv_usec * 1000 < ts->tv_nsec)
+ break;
+ } while (1);
+ /* We got the lock now */
+ TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+
+ if (sync_req.reply_received == 0) {
+ RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+ dst, req->name);
+ return 1;

Why are we returning 1 here? There was no reply, so no reply structure
was allocated. This looks like a potential buffer overflow on trying to
read replies if one of them wasn't delivered.

As I understand it - because the return value is the number of sent requests.
Number of received replies will be available in reply->nb_msgs.
Same below.
Konstantin

Post by Jianfeng Tan
+ }
+
+ tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_msgs + 1));
+ if (!tmp) {
+ RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+ dst, req->name);
+ return 1;
+ }

Same here - we couldn't allocate a reply, so it won't get to the user.
Why return 1 here?

Post by Jianfeng Tan
+ memcpy(&tmp[reply->nb_msgs], &msg, sizeof(msg));
+ reply->msgs = tmp;
+ reply->nb_msgs++;
+ return 1;
+}
+
+int
+rte_eal_mp_request(struct rte_mp_msg *req,
+ struct rte_mp_reply *reply,
+ const struct timespec *ts)
+{
+ DIR *mp_dir;
+ struct dirent *ent;
+ int nb_snds = 0;
+ struct timeval now;
+ struct timespec end;
+

<snip>

Post by Jianfeng Tan
/**
@@ -262,6 +268,56 @@ void rte_eal_mp_action_unregister(const char *name);
int rte_eal_mp_sendmsg(struct rte_mp_msg *msg);
/**
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ *
+ * The req argument contains the customized request message.
+ *
+ * The reply argument will be for storing all the replied messages;
+ * the caller is responsible for free reply->replies.
+ *
+ * The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * - (<0) on invalid parameters;
+ * - (>=0) as the number of messages being sent successfully.
+ */
+int rte_eal_mp_request(struct rte_mp_msg *req,
+ struct rte_mp_reply *reply, const struct timespec *ts);

See above: it would be much more useful to return number of replies
received, rather than number of messages sent, as that's the number we
are most interested in. Otherwise, if we e.g. sent 5 messages but
received 1 reply, you're essentially not telling the user how far can he
index the reply pointer.

Post by Jianfeng Tan
+
+/**
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * The msg argument contains the customized message.
+ *
+ * The peer argument is the pointer to the peer socket path.
+ *
+ * - (1) on success;
+ * - (0) on failure;
+ * - (<0) on invalid parameters.
+ */
+int rte_eal_mp_reply(struct rte_mp_msg *msg, const char *peer);

I don't think there's much point in making distinction between invalid
parameters and failure.

Post by Jianfeng Tan
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index adeadfb..3015bc6 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,9 @@ EXPERIMENTAL {
rte_eal_mp_action_register;
rte_eal_mp_action_unregister;
rte_eal_mp_sendmsg;
+ rte_eal_mp_request;
+ rte_eal_mp_reply;
+ rte_eal_mp_sendmsg;

You're adding rte_eal_mp_sendmsg twice.

Post by Jianfeng Tan
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

Burakov, Anatoly

2018-01-25 12:25:58 UTC

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sended requests with number of
received replies and decide should it be considered a failure or no.

Well, OK, that might make sense. However, I think it would be of more
value to make the API consistent (0/-1 on success/failure) and put
number of sent messages into the reply, like number of received. I.e.
something like

struct reply {
int nb_sent;
int nb_received;
};

We do it for the latter already, so why not the former?

--
Thanks,
Anatoly

Ananyev, Konstantin

2018-01-25 13:00:25 UTC

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:26 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sended requests with number of
received replies and decide should it be considered a failure or no.

Well, OK, that might make sense. However, i think it would've be of more
value to make the API consistent (0/-1 on success/failure) and put
number of sent messages into the reply, like number of received. I.e.
something like
struct reply {
int nb_sent;
int nb_received;
};
We do it for the latter already, so why not the former?

The question is what to treat as success/failure?
Let's say we sent 2 requests (of 3 possible), got back 1 response...
Should we consider it as success or failure?

Burakov, Anatoly

2018-01-25 13:05:57 UTC

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:26 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sended requests with number of
received replies and decide should it be considered a failure or no.

Well, OK, that might make sense. However, i think it would've be of more
value to make the API consistent (0/-1 on success/failure) and put
number of sent messages into the reply, like number of received. I.e.
something like
struct reply {
int nb_sent;
int nb_received;
};
We do it for the latter already, so why not the former?

The question is what treat as success/failure?
Let say we sent 2 requests (of 3 possible), got back 1 response...
Should we consider it as success or failure?

I think "failure" is "something went wrong", not "secondary processes
didn't respond". For example, invalid parameters, or our socket suddenly
being closed, or some other error that prevents us from sending requests
to secondaries.

As far as i can tell from the code, there's no way to know if the
secondary process is running other than by attempting to connect to it,
and get a response. So, failed connection should not be a failure
condition, because we can't know if we *can* connect to the process
until we do. Process may have ended, but socket files will still be
around, and there's nothing we can do about that. So i wouldn't consider
inability to send a message a failure condition.

--
Thanks,
Anatoly

Burakov, Anatoly

2018-01-25 13:10:07 UTC

Post by Burakov, Anatoly

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:26 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sended requests with number of
received replies and decide should it be considered a failure or no.

Well, OK, that might make sense. However, i think it would've be of more
value to make the API consistent (0/-1 on success/failure) and put
number of sent messages into the reply, like number of received. I.e.
something like
struct reply {
int nb_sent;
int nb_received;
};
We do it for the latter already, so why not the former?

The question is what treat as success/failure?
Let say we sent 2 requests (of 3 possible), got back 1 response...
Should we consider it as success or failure?

I think "failure" is "something went wrong", not "secondary processes
didn't respond". For example, invalid parameters, or our socket suddenly
being closed, or some other error that prevents us from sending requests
to secondaries.
As far as i can tell from the code, there's no way to know if the
secondary process is running other than by attempting to connect to it,
and get a response. So, failed connection should not be a failure
condition, because we can't know if we *can* connect to the process
until we do. Process may have ended, but socket files will still be
around, and there's nothing we can do about that. So i wouldn't consider
inability to send a message a failure condition.

Just to clarify - i'm suggesting leaving this decision up to the user.
If a user expects there to be "n" processes running, but only "m"
responses were received, he could treat it as error. Another user might
simply send periodical updates/polls to secondaries, for whatever reason
(say, stats display), and won't really care if one of them just died, so
there's no error for that user.

However, all of this has nothing to do with API. If we're able to send
messages - it's not a failure. If we can't - it is. That's the part API
should be concerned about, and that's what the return value should
indicate, IMO.

--
Thanks,
Anatoly

Ananyev, Konstantin

2018-01-25 15:03:58 UTC

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 1:10 PM
Subject: Re: [dpdk-dev] [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Burakov, Anatoly

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:26 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sended requests with number of
received replies and decide should it be considered a failure or no.

Well, OK, that might make sense. However, i think it would've be of more
value to make the API consistent (0/-1 on success/failure) and put
number of sent messages into the reply, like number of received. I.e.
something like
struct reply {
int nb_sent;
int nb_received;
};
We do it for the latter already, so why not the former?

The question is what treat as success/failure?
Let say we sent 2 requests (of 3 possible), got back 1 response...
Should we consider it as success or failure?

I think "failure" is "something went wrong", not "secondary processes
didn't respond". For example, invalid parameters, or our socket suddenly
being closed, or some other error that prevents us from sending requests
to secondaries.
As far as i can tell from the code, there's no way to know if the
secondary process is running other than by attempting to connect to it,
and get a response. So, failed connection should not be a failure
condition, because we can't know if we *can* connect to the process
until we do. Process may have ended, but socket files will still be
around, and there's nothing we can do about that. So i wouldn't consider
inability to send a message a failure condition.

Just to clarify - i'm suggesting leaving this decision up to the user.
If a user expects there to be "n" processes running, but only "m"
responses were received, he could treat it as error. Another user might
simply send periodical updates/polls to secondaries, for whatever reason
(say, stats display), and won't really care if one of them just died, so
there's no error for that user.
However, all of this has nothing to do with API. If we're able to send
messages - it's not a failure. If we can't - it is. That's the part API
should be concerned about, and that's what the return value should
indicate, IMO.

Ok so to clarify, you are suggesting:
we have N peers - if send_msg() returns success for all N - return success
(no matter whether we got a reply or not)
Otherwise return a failure.
?

Burakov, Anatoly

2018-01-25 16:22:03 UTC

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 1:10 PM
Subject: Re: [dpdk-dev] [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Burakov, Anatoly

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:26 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sended requests with number of
received replies and decide should it be considered a failure or no.

Well, OK, that might make sense. However, i think it would've be of more
value to make the API consistent (0/-1 on success/failure) and put
number of sent messages into the reply, like number of received. I.e.
something like
struct reply {
int nb_sent;
int nb_received;
};
We do it for the latter already, so why not the former?

The question is what treat as success/failure?
Let say we sent 2 requests (of 3 possible), got back 1 response...
Should we consider it as success or failure?

I think "failure" is "something went wrong", not "secondary processes
didn't respond". For example, invalid parameters, or our socket suddenly
being closed, or some other error that prevents us from sending requests
to secondaries.
As far as i can tell from the code, there's no way to know if the
secondary process is running other than by attempting to connect to it,
and get a response. So, failed connection should not be a failure
condition, because we can't know if we *can* connect to the process
until we do. Process may have ended, but socket files will still be
around, and there's nothing we can do about that. So i wouldn't consider
inability to send a message a failure condition.

Just to clarify - i'm suggesting leaving this decision up to the user.
If a user expects there to be "n" processes running, but only "m"
responses were received, he could treat it as error. Another user might
simply send periodical updates/polls to secondaries, for whatever reason
(say, stats display), and won't really care if one of them just died, so
there's no error for that user.
However, all of this has nothing to do with API. If we're able to send
messages - it's not a failure. If we can't - it is. That's the part API
should be concerned about, and that's what the return value should
indicate, IMO.

we have N peers - if send_msg() returns success for all N - return success
(no matter did we get a reply or not)
Otherwise return a failure.
?
Konstantin

More along the lines of, return -1 if and only if something went wrong.
That might be invalid parameters, or that might be an error with our own
socket, or something else to that effect. In all other cases, return 0
(that includes cases where we sent N messages but received M replies,
where N != M). So, in other words, return 0 if we *could have succeeded*
if nothing went wrong on the other side, and only return -1 if something
went wrong on our side.

Post by Ananyev, Konstantin

--
Thanks,
Anatoly

--
Thanks,
Anatoly

Tan, Jianfeng

2018-01-25 17:10:34 UTC

Post by Burakov, Anatoly

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 1:10 PM
Subject: Re: [dpdk-dev] [PATCH v3 2/3] eal: add synchronous
multi-process communication

Post by Burakov, Anatoly

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:26 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sended requests with number of
received replies and decide should it be considered a failure or no.

Well, OK, that might make sense. However, i think it would've be of more
value to make the API consistent (0/-1 on success/failure) and put
number of sent messages into the reply, like number of received. I.e.
something like
struct reply {
int nb_sent;
int nb_received;
};
We do it for the latter already, so why not the former?

The question is what treat as success/failure?
Let say we sent 2 requests (of 3 possible), got back 1 response...
Should we consider it as success or failure?

I think "failure" is "something went wrong", not "secondary processes
didn't respond". For example, invalid parameters, or our socket suddenly
being closed, or some other error that prevents us from sending requests
to secondaries.
As far as i can tell from the code, there's no way to know if the
secondary process is running other than by attempting to connect to it,
and get a response. So, failed connection should not be a failure
condition, because we can't know if we *can* connect to the process
until we do. Process may have ended, but socket files will still be
around, and there's nothing we can do about that. So i wouldn't consider
inability to send a message a failure condition.

Just to clarify - i'm suggesting leaving this decision up to the user.
If a user expects there to be "n" processes running, but only "m"
responses were received, he could treat it as error. Another user might
simply send periodical updates/polls to secondaries, for whatever reason
(say, stats display), and won't really care if one of them just died, so
there's no error for that user.
However, all of this has nothing to do with API. If we're able to send
messages - it's not a failure. If we can't - it is. That's the part API
should be concerned about, and that's what the return value should
indicate, IMO.

we have N peers - if send_msg() returns success for all N - return success
(no matter did we get a reply or not)
Otherwise return a failure.
?
Konstantin

More along the lines of, return -1 if and only if something went
wrong. That might be invalid parameters, or that might be an error
with our own socket,

To check if the error is caused by our own socket, we check the errno
after sendmsg?

Like for remote socket errors, we check:
- ECONNRESET
- ECONNREFUSED
- ENOBUFS

Right?

Thanks,
Jianfeng

Post by Burakov, Anatoly
or something else to that effect. In all other cases, return 0 (that
includes cases where we sent N messages but M replies where N != M).
So, in other words, return 0 if we *could have succeeded* if nothing
went wrong on the other side, and only return -1 if something went
wrong on our side.

Post by Ananyev, Konstantin

--
Thanks,
Anatoly

Burakov, Anatoly

2018-01-25 18:02:57 UTC

Post by Tan, Jianfeng

Post by Burakov, Anatoly

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 1:10 PM
Subject: Re: [dpdk-dev] [PATCH v3 2/3] eal: add synchronous
multi-process communication

Post by Burakov, Anatoly

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:26 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process communication

Post by Ananyev, Konstantin

-----Original Message-----
From: Burakov, Anatoly
Sent: Thursday, January 25, 2018 12:00 PM
Subject: Re: [PATCH v3 2/3] eal: add synchronous multi-process
communication
On the overall patch,
For request(), returning number of replies received actually makes
sense, because now we get use the value to read our replies, if we were
a primary process sending messages to secondary processes.

Yes, I also think it is good to return number of sends.
Then caller can compare number of sended requests with number of
received replies and decide should it be considered a failure or no.

Well, OK, that might make sense. However, i think it would've be of more
value to make the API consistent (0/-1 on success/failure) and put
number of sent messages into the reply, like number of received. I.e.
something like
struct reply {
int nb_sent;
int nb_received;
};
We do it for the latter already, so why not the former?

The question is what treat as success/failure?
Let say we sent 2 requests (of 3 possible), got back 1 response...
Should we consider it as success or failure?

I think "failure" is "something went wrong", not "secondary processes
didn't respond". For example, invalid parameters, or our socket suddenly
being closed, or some other error that prevents us from sending requests
to secondaries.
As far as i can tell from the code, there's no way to know if the
secondary process is running other than by attempting to connect to it,
and get a response. So, failed connection should not be a failure
condition, because we can't know if we *can* connect to the process
until we do. Process may have ended, but socket files will still be
around, and there's nothing we can do about that. So i wouldn't consider
inability to send a message a failure condition.

Just to clarify - i'm suggesting leaving this decision up to the user.
If a user expects there to be "n" processes running, but only "m"
responses were received, he could treat it as error. Another user might
simply send periodical updates/polls to secondaries, for whatever reason
(say, stats display), and won't really care if one of them just died, so
there's no error for that user.
However, all of this has nothing to do with API. If we're able to send
messages - it's not a failure. If we can't - it is. That's the part API
should be concerned about, and that's what the return value should
indicate, IMO.

we have N peers - if send_msg() returns success for all N - return success
(no matter did we get a reply or not)
Otherwise return a failure.
?
Konstantin

More along the lines of, return -1 if and only if something went
wrong. That might be invalid parameters, or that might be an error
with our own socket,

To check if the error is caused by our own socket, we check the errno
after sendmsg?
- ECONNRESET
- ECONNREFUSED
- ENOBUFS
Right?
Thanks,
Jianfeng

Well, that was only an example. If it doesn't make much sense to do so
in this case, then don't, and only return -1 on invalid parameters.
AFAIU we're using connectionless sockets so a bunch of these errors
won't be applicable to us. Maybe -ENOBUFS, but i'm not sure it's worth
it to check for that.

Post by Tan, Jianfeng

Post by Burakov, Anatoly
or something else to that effect. In all other cases, return 0 (that
includes cases where we sent N messages but M replies where N != M).
So, in other words, return 0 if we *could have succeeded* if nothing
went wrong on the other side, and only return -1 if something went
wrong on our side.

Post by Ananyev, Konstantin

--
Thanks,
Anatoly

--
Thanks,
Anatoly

Ananyev, Konstantin

2018-01-25 12:22:32 UTC

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
sender-process receiver-process
---------------------- ----------------
thread-n
|_rte_eal_mp_request() ----------> mp-thread
|_timedwait() |_process_msg()
|_action()
|_rte_eal_mp_reply()
mp_thread <---------------------|
|_process_msg()
|_signal(send_thread)
thread-m <----------|
|_collect-reply
* A secondary process is only allowed to talk to the primary process.
* If there are multiple secondary processes for the primary process,
it will send request to peer1, collect response from peer1; then
send request to peer2, collect response from peer2, and so on.
* When thread-n is sending request, thread-m of that process can send
request at the same time.
* For pair <action_name, peer>, we guarantee that only one such request
is on the fly.
---

Acked-by: Konstantin Ananyev <***@intel.com>

Jianfeng Tan

2018-01-25 04:16:23 UTC

Previously, vfio used its own private channel for the secondary
process to get container fd and group fd from the primary process.

This patch changes it to use the generic mp channel.

Test:
1. Bind two NICs to vfio-pci.

2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 --num-procs=2 --proc-id=1

Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal.c | 14 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 172 +++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 408 ++++---------------------
4 files changed, 136 insertions(+), 473 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ad44ab5..66a79a1 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -713,18 +713,8 @@ static int rte_eal_vfio_setup(void)
return -1;
vfio_enabled = rte_vfio_is_enabled("vfio");

- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }
+ if (vfio_enabled && vfio_mp_sync_setup() < 0)
+ return -1;

return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..c2f8486 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -39,9 +39,14 @@ int
vfio_get_group_fd(int iommu_group_no)
{
int i;
+ int ret;
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct rte_mp_msg req, *rep;
+ struct rte_mp_reply reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;

/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +106,31 @@ vfio_get_group_fd(int iommu_group_no)
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
- * process via our socket
+ * process via mp channel
*/
- else {
- int socket_fd, ret;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- close(socket_fd);
- return 0;
- case SOCKET_OK:
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_no = iommu_group_no;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- default:
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ p->req = SOCKET_REQ_GROUP;
+ p->group_no = iommu_group_no;
+ strcpy(req.name, "vfio");
+ req.len_param = sizeof(*p);
+ req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ ret = rte_eal_mp_request(&req, &reply, &ts);
+ if (ret > 0 && reply.nb_msgs > 0) {
+ rep = &reply.msgs[0];
+ p = (struct vfio_mp_param *)rep->param;
+ if (p->result == SOCKET_OK && rep->num_fds == 1) {
+ cur_grp->group_no = iommu_group_no;
+ vfio_group_fd = rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
}
+ free(reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;
}

@@ -200,7 +186,11 @@ int
rte_vfio_clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ int ret;
+ struct rte_mp_msg req, *rep;
+ struct rte_mp_reply reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;

if (internal_config.process_type == RTE_PROC_PRIMARY) {

@@ -214,43 +204,24 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}

- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
+ p->req = SOCKET_CLR_GROUP;
+ p->group_no = vfio_group_fd;
+ strcpy(req.name, "vfio");
+ req.len_param = sizeof(*p);
+ req.num_fds = 0;

- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
+ ret = rte_eal_mp_request(&req, &reply, &ts);
+ if (ret > 0 && reply.nb_msgs > 0) {
+ rep = &reply.msgs[0];
+ p = (struct vfio_mp_param *)rep->param;
+ if (p->result == SOCKET_OK) {
+ free(reply.msgs);
+ return 0;
+ }
+ free(reply.msgs);
}

- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- case SOCKET_OK:
- close(socket_fd);
- return 0;
- case SOCKET_ERR:
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- default:
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
return -1;
}

@@ -561,6 +532,11 @@ int
vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct rte_mp_msg req, *rep;
+ struct rte_mp_reply reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
+

/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +567,29 @@ vfio_get_container_fd(void)
}

return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(req.name, "vfio");
+ req.len_param = sizeof(*p);
+ req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ ret = rte_eal_mp_request(&req, &reply, &ts);
+ if (ret > 0 && reply.nb_msgs > 0) {
+ rep = &reply.msgs[0];
+ p = (struct vfio_mp_param *)rep->param;
+ if (p->result == SOCKET_OK && rep->num_fds == 1) {
+ free(reply.msgs);
+ return rep->fds[0];
}
- close(socket_fd);
- return vfio_container_fd;
+ free(reply.msgs);
}

+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
return -1;
}

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..6b48969 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS

/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -157,6 +148,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF

+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_no;
+};
+
#endif /* VFIO_PRESENT */

#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..8c2f409 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -2,31 +2,14 @@
* Copyright(c) 2010-2014 Intel Corporation
*/

+#include <unistd.h>
#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif

#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>

-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"

/**
* @file
@@ -37,360 +20,81 @@

#ifdef VFIO_PRESENT

-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
+ int fd;
+ int num;
int ret;
+ struct rte_mp_msg reply;
+ struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+ const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;

- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (msg->len_param != sizeof(*m)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
+ memset(&reply, 0, sizeof(reply));

-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- case SOCKET_REQ_CONTAINER:
- fd = vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- case SOCKET_REQ_GROUP:
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = vfio_get_group_fd(vfio_data);
-
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (m->req) {
+ case SOCKET_REQ_GROUP:
+ r->req = SOCKET_REQ_GROUP;
+ r->group_no = m->group_no;
+ fd = vfio_get_group_fd(m->group_no);
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else if (fd == 0)
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r->result = SOCKET_NO_FD;
+ else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- case SOCKET_CLR_GROUP:
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = rte_vfio_clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- default:
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r->result = SOCKET_OK;
+ num = 1;
}
- close(conn_sock);
- }
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ break;
+ case SOCKET_CLR_GROUP:
+ r->req = SOCKET_CLR_GROUP;
+ r->group_no = m->group_no;
+ if (rte_vfio_clear_group(m->group_no) < 0)
+ r->result = SOCKET_NO_FD;
+ else
+ r->result = SOCKET_OK;
+ break;
+ case SOCKET_REQ_CONTAINER:
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ num = 1;
+ }
+ break;
+ default:
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
+ if (num == 1) {
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
}
+ strcpy(reply.name, "vfio");
+ reply.len_param = sizeof(*r);

- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
- }
-
- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
-
- return 0;
+ ret = rte_eal_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && num == 1)
+ close(fd);
+ return ret;
}

-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_eal_mp_action_register("vfio", vfio_mp_primary);

return 0;
}
-
#endif

--
2.7.4

Thomas Monjalon

2018-01-25 10:47:20 UTC

Post by Jianfeng Tan
Previously, vfio uses its own private channel for the secondary
process to get container fd and group fd from the primary process.
This patch changes to use the generic mp channel.

There was a private request to get it in 18.02-rc2.

I have 3 concerns:
1/ It is late
2/ It is not yet reviewed by Anatoly and Konstantin
3/ We try to not rework the existing code in RC2,
because it would totally invalidate the validation work
done for RC1.

Burakov, Anatoly

2018-01-25 10:52:41 UTC

Post by Thomas Monjalon

Post by Jianfeng Tan
Previously, vfio uses its own private channel for the secondary
process to get container fd and group fd from the primary process.
This patch changes to use the generic mp channel.

There was a private request to get it in 18.02-rc2.
1/ It is late
2/ It is not yet reviewed by Anatoly and Konstantin
3/ We try to not rework the existing code in RC2,
because it would totally invalidate the validation work
done for RC1.

Hi Thomas,

We can postpone the VFIO patch until 18.05, and integrate only the first
two patches. First two patches do not change anything in DPDK, so
validation impact should be non-existent.

--
Thanks,
Anatoly

Thomas Monjalon

2018-01-25 10:57:06 UTC

Post by Burakov, Anatoly

Post by Thomas Monjalon

Post by Jianfeng Tan
Previously, vfio uses its own private channel for the secondary
process to get container fd and group fd from the primary process.
This patch changes to use the generic mp channel.

There was a private request to get it in 18.02-rc2.
1/ It is late
2/ It is not yet reviewed by Anatoly and Konstantin
3/ We try to not rework the existing code in RC2,
because it would totally invalidate the validation work
done for RC1.

Hi Thomas,
We can postpone the VFIO patch until 18.05, and integrate only the first
two patches. First two patches do not change anything in DPDK, so
validation impact should be non-existent.

Yes, possible if it is well reviewed and all comments addressed.

Burakov, Anatoly

2018-01-25 12:15:35 UTC

Post by Thomas Monjalon

Post by Burakov, Anatoly

Post by Thomas Monjalon

Post by Jianfeng Tan
Previously, vfio uses its own private channel for the secondary
process to get container fd and group fd from the primary process.
This patch changes to use the generic mp channel.

There was a private request to get it in 18.02-rc2.
1/ It is late
2/ It is not yet reviewed by Anatoly and Konstantin
3/ We try to not rework the existing code in RC2,
because it would totally invalidate the validation work
done for RC1.

Hi Thomas,
We can postpone the VFIO patch until 18.05, and integrate only the first
two patches. First two patches do not change anything in DPDK, so
validation impact should be non-existent.

Yes, possible if it is well reviewed and all comments addressed.

OK then. Jianfeng, let's drop the VFIO patch for now, and postpone it
for 18.05?

--
Thanks,
Anatoly

Jianfeng Tan

2018-01-25 19:14:42 UTC

v3->v4:
- Drop the patch 3 on vfio communication (postponed).
- Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
- Add nb_sent and nb_received in struct rte_mp_reply.
- Standardize the return value of sendmsg, request, reply: 0 on success,
(-1) on failure.
- If we find a peer error when sending a msg in the primary, we try to
remove the secondary socket; as there is no sync mechanism there
(a socket file cannot be flock'ed like a regular file), we use a more
complex socket name (with tsc in it).
- Some other small changes.

v2->v3:
- Add a pre-check for each API.
- Remove the limitation of 8 secondary processes by discarding the original
register/unregister mechanism of secondary processes; instead, the primary
discovers secondary processes by scanning the folder for names matching a pattern.
- The previous implementation used two sockets for msg and request; this version
uses just one socket and receives all kinds of messages in the mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
- Use datagram unix socket to supersede stream unix socket + epoll.
- Change the secondary add/del mechanism as now we use connection-less channel.
- Add mp_mutex_action to sync action register/unregister/reference.
- Limit max length of action name to 64B.
- New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
- Formalize the errno handling.
- Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: addresses the purpose and how-to;
Patch 2: adds a synchronous way for requests which need an immediate response.

Jianfeng Tan (2):
eal: add synchronous multi-process communication
vfio: use the generic multi-process channel

doc/guides/rel_notes/release_18_02.rst | 2 +
lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++--
lib/librte_eal/common/include/rte_eal.h | 58 +++-
lib/librte_eal/linuxapp/eal/eal.c | 14 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 169 ++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
lib/librte_eal/rte_eal_version.map | 2 +
8 files changed, 429 insertions(+), 495 deletions(-)

--
2.7.4

Jianfeng Tan

2018-01-25 19:14:43 UTC

We need a synchronous way for multi-process communication,
i.e., blocking until a reply message arrives after we send a request
to the peer process.

We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.

The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.

sender-process receiver-process
---------------------- ----------------

thread-n
|_rte_eal_mp_request() ----------> mp-thread
|_timedwait() |_process_msg()
|_action()
|_rte_eal_mp_reply()
mp_thread <---------------------|
|_process_msg()
|_signal(send_thread)
thread-m <----------|
|_collect-reply

* A secondary process is only allowed to talk to the primary process.
* If there are multiple secondary processes for the primary process,
it will send request to peer1, collect response from peer1; then
send request to peer2, collect response from peer2, and so on.
* When thread-n is sending request, thread-m of that process can send
request at the same time.
* For each pair <action_name, peer>, we guarantee that only one such request
is in flight at a time.

Suggested-by: Anatoly Burakov <***@intel.com>
Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
Reviewed-by: Anatoly Burakov <***@intel.com>
Acked-by: Konstantin Ananyev <***@intel.com>
---
doc/guides/rel_notes/release_18_02.rst | 2 +
lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 58 +++++++-
lib/librte_eal/rte_eal_version.map | 2 +
4 files changed, 296 insertions(+), 20 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index be6ac99..39425a4 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -160,6 +160,8 @@ New Features

* ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
* ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
+ * ``rte_mp_request`` is for sending a request message and will block until
+ it gets a reply message which is sent from the peer by ``rte_mp_reply``.

API Changes
-----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index aea0829..6ad73f5 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
@@ -46,6 +47,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
static struct action_entry_list action_entry_list =
TAILQ_HEAD_INITIALIZER(action_entry_list);

+enum mp_type {
+ MP_MSG, /* Share message with peers, will not block */
+ MP_REQ, /* Request for information, Will block for a reply */
+ MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+ int type;
+ struct rte_mp_msg msg;
+};
+
+struct sync_request {
+ TAILQ_ENTRY(sync_request) next;
+ int reply_received;
+ char dst[PATH_MAX];
+ struct rte_mp_msg *request;
+ struct rte_mp_msg *reply;
+ pthread_cond_t cond;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+ struct sync_request_list requests;
+ pthread_mutex_t lock;
+} sync_requests = {
+ .requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+ struct sync_request *r;
+
+ TAILQ_FOREACH(r, &sync_requests.requests, next) {
+ if (!strcmp(r->dst, dst) &&
+ !strcmp(r->request->name, act_name))
+ break;
+ }
+
+ return r;
+}
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -149,19 +194,21 @@ rte_mp_action_unregister(const char *name)
}

static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
int msglen;
struct iovec iov;
struct msghdr msgh;
- char control[CMSG_SPACE(sizeof(msg->fds))];
+ char control[CMSG_SPACE(sizeof(m->msg.fds))];
struct cmsghdr *cmsg;
- int buflen = sizeof(*msg) - sizeof(msg->fds);
+ int buflen = sizeof(*m) - sizeof(m->msg.fds);

memset(&msgh, 0, sizeof(msgh));
- iov.iov_base = msg;
+ iov.iov_base = m;
iov.iov_len = buflen;

+ msgh.msg_name = s;
+ msgh.msg_namelen = sizeof(*s);
msgh.msg_iov = &iov;
msgh.msg_iovlen = 1;
msgh.msg_control = control;
@@ -183,7 +230,7 @@ read_msg(struct rte_mp_msg *msg)
cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
if ((cmsg->cmsg_level == SOL_SOCKET) &&
(cmsg->cmsg_type == SCM_RIGHTS)) {
- memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
break;
}
}
@@ -192,12 +239,28 @@ read_msg(struct rte_mp_msg *msg)
}

static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
+ struct sync_request *sync_req;
struct action_entry *entry;
+ struct rte_mp_msg *msg = &m->msg;
rte_mp_t action = NULL;

RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+ if (m->type == MP_REP) {
+ pthread_mutex_lock(&sync_requests.lock);
+ sync_req = find_sync_request(s->sun_path, msg->name);
+ if (sync_req) {
+ memcpy(sync_req->reply, msg, sizeof(*msg));
+ sync_req->reply_received = 1;
+ pthread_cond_signal(&sync_req->cond);
+ } else
+ RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+ pthread_mutex_unlock(&sync_requests.lock);
+ return;
+ }
+
pthread_mutex_lock(&mp_mutex_action);
entry = find_action_entry_by_name(msg->name);
if (entry != NULL)
@@ -206,18 +269,19 @@ process_msg(struct rte_mp_msg *msg)

if (!action)
RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
- else if (action(msg) < 0)
+ else if (action(msg, s->sun_path) < 0)
RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
}

static void *
mp_handle(void *arg __rte_unused)
{
- struct rte_mp_msg msg;
+ struct mp_msg_internal msg;
+ struct sockaddr_un sa;

while (1) {
- if (read_msg(&msg) == 0)
- process_msg(&msg);
+ if (read_msg(&msg, &sa) == 0)
+ process_msg(&msg, &sa);
}

return NULL;
@@ -336,16 +400,20 @@ rte_mp_channel_init(void)
*
*/
static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
{
int snd;
struct iovec iov;
struct msghdr msgh;
struct cmsghdr *cmsg;
struct sockaddr_un dst;
+ struct mp_msg_internal m;
int fd_size = msg->num_fds * sizeof(int);
char control[CMSG_SPACE(fd_size)];

+ m.type = type;
+ memcpy(&m.msg, msg, sizeof(*msg));
+
memset(&dst, 0, sizeof(dst));
dst.sun_family = AF_UNIX;
snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -353,8 +421,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
memset(&msgh, 0, sizeof(msgh));
memset(control, 0, sizeof(control));

- iov.iov_base = msg;
- iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+ iov.iov_base = &m;
+ iov.iov_len = sizeof(m) - sizeof(msg->fds);

msgh.msg_name = &dst;
msgh.msg_namelen = sizeof(dst);
@@ -396,14 +464,17 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
}

static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
{
int ret = 0;
DIR *mp_dir;
struct dirent *ent;

- if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
- if (send_msg(eal_mp_socket_path(), msg) < 0)
+ if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY))
+ peer = eal_mp_socket_path();
+
+ if (peer) {
+ if (send_msg(peer, msg, type) < 0)
return -1;
else
return 0;
@@ -421,11 +492,11 @@ mp_send(struct rte_mp_msg *msg)
if (fnmatch(mp_filter, ent->d_name, 0) != 0)
continue;

- if (send_msg(ent->d_name, msg) < 0)
+ if (send_msg(ent->d_name, msg, type) < 0)
ret = -1;
}
- closedir(mp_dir);

+ closedir(mp_dir);
return ret;
}

@@ -464,5 +535,150 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
return -1;

RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
- return mp_send(msg);
+ return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+ struct rte_mp_reply *reply, const struct timespec *ts)
+{
+ int ret;
+ struct timeval now;
+ struct rte_mp_msg msg, *tmp;
+ struct sync_request sync_req, *exist;
+
+ sync_req.reply_received = 0;
+ strcpy(sync_req.dst, dst);
+ sync_req.request = req;
+ sync_req.reply = &msg;
+ pthread_cond_init(&sync_req.cond, NULL);
+
+ pthread_mutex_lock(&sync_requests.lock);
+ exist = find_sync_request(dst, req->name);
+ if (!exist)
+ TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+ if (exist) {
+ RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+ rte_errno = -EEXIST;
+ return -1;
+ }
+
+ ret = send_msg(dst, req, MP_REQ);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+ dst, req->name);
+ return -1;
+ } else if (ret == 0)
+ return 0;
+
+ reply->nb_sent++;
+
+ pthread_mutex_lock(&sync_requests.lock);
+ do {
+ pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+ /* Check spurious wakeups */
+ if (sync_req.reply_received == 1)
+ break;
+ /* Check if time is out */
+ if (gettimeofday(&now, NULL) < 0)
+ break;
+ if (now.tv_sec < ts->tv_sec)
+ break;
+ else if (now.tv_sec == ts->tv_sec &&
+ now.tv_usec * 1000 < ts->tv_nsec)
+ break;
+ } while (1);
+ /* We got the lock now */
+ TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+
+ if (sync_req.reply_received == 0) {
+ RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+ dst, req->name);
+ rte_errno = -ETIMEDOUT;
+ return -1;
+ }
+
+ tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1));
+ if (!tmp) {
+ RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+ dst, req->name);
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+ memcpy(&tmp[reply->nb_received], &msg, sizeof(msg));
+ reply->msgs = tmp;
+ reply->nb_received++;
+ return 0;
+}
+
+int
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+ const struct timespec *ts)
+{
+ int ret = 0;
+ DIR *mp_dir;
+ struct dirent *ent;
+ struct timeval now;
+ struct timespec end;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+ if (check_input(req) == false)
+ return -1;
+ if (gettimeofday(&now, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Faile to get current time\n");
+ rte_errno = errno;
+ return -1;
+ }
+
+ end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+ end.tv_sec = now.tv_sec + ts->tv_sec +
+ (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+ reply->nb_sent = 0;
+ reply->nb_received = 0;
+ reply->msgs = NULL;
+
+ /* for secondary process, send request to the primary process only */
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+ /* for primary process, broadcast request, and collect reply 1 by 1 */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+ rte_errno = errno;
+ return -1;
+ }
+
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ if (mp_request_one(ent->d_name, req, reply, &end))
+ ret = -1;
+ }
+
+ closedir(mp_dir);
+ return ret;
+}
+
+int
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+ RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+ if (check_input(msg) == false)
+ return -1;
+
+ if (peer == NULL) {
+ RTE_LOG(ERR, EAL, "peer is not specified\n");
+ rte_errno = -EINVAL;
+ return -1;
+ }
+
+ return mp_send(msg, peer, MP_REP);
}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 1d42e9c..9207ad9 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@

#include <stdint.h>
#include <sched.h>
+#include <time.h>

#include <rte_config.h>
#include <rte_per_lcore.h>
@@ -197,13 +198,19 @@ struct rte_mp_msg {
int fds[RTE_MP_MAX_FD_NUM];
};

+struct rte_mp_reply {
+ int nb_sent;
+ int nb_received;
+ struct rte_mp_msg *msgs; /* caller to free */
+};
+
/**
* Action function typedef used by other components.
*
* As we create socket channel for primary/secondary communication, use
* this function typedef to register action for coming messages.
*/
-typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer);

/**
* @warning
@@ -262,6 +269,55 @@ void rte_mp_action_unregister(const char *name);
int rte_mp_sendmsg(struct rte_mp_msg *msg);

/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ * The req argument contains the customized request message.
+ *
+ * @param reply
+ * The reply argument is used to store all the reply messages;
+ * the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ * The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+ const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ * The msg argument contains the customized message.
+ *
+ * @param peer
+ * The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index adeadfb..2cb6b07 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,8 @@ EXPERIMENTAL {
rte_eal_mp_action_register;
rte_eal_mp_action_unregister;
rte_eal_mp_sendmsg;
+ rte_eal_mp_request;
+ rte_eal_mp_reply;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Jianfeng Tan

2018-01-25 19:14:44 UTC

Previously, vfio used its own private channel for the secondary
process to get the container fd and group fd from the primary process.

This patch switches vfio to the generic mp channel.

Test:
1. Bind two NICs to vfio-pci.

2. Start the primary and secondary process.
$ (symmetric_mp) -c 2 -- -p 3 --num-procs=2 --proc-id=0
$ (symmetric_mp) -c 4 --proc-type=auto -- -p 3 \
--num-procs=2 --proc-id=1

Signed-off-by: Jianfeng Tan <***@intel.com>
---
lib/librte_eal/linuxapp/eal/eal.c | 14 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 169 ++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
4 files changed, 133 insertions(+), 475 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 53e29e4..07b2a06 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -713,18 +713,8 @@ static int rte_eal_vfio_setup(void)
return -1;
vfio_enabled = rte_vfio_is_enabled("vfio");

- if (vfio_enabled) {
-
- /* if we are primary process, create a thread to communicate with
- * secondary processes. the thread will use a socket to wait for
- * requests from secondary process to send open file descriptors,
- * because VFIO does not allow multiple open descriptors on a group or
- * VFIO container.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_mp_sync_setup() < 0)
- return -1;
- }
+ if (vfio_enabled && vfio_mp_sync_setup() < 0)
+ return -1;

return 0;
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index e44ae4d..2dbb37e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#include <string.h>
@@ -42,6 +42,10 @@ vfio_get_group_fd(int iommu_group_no)
int vfio_group_fd;
char filename[PATH_MAX];
struct vfio_group *cur_grp;
+ struct rte_mp_msg req, *rep;
+ struct rte_mp_reply reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;

/* check if we already have the group descriptor open */
for (i = 0; i < VFIO_MAX_GROUPS; i++)
@@ -101,50 +105,30 @@ vfio_get_group_fd(int iommu_group_no)
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
- * process via our socket
+ * process via mp channel
*/
- else {
- int socket_fd, ret;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, iommu_group_no) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group number!\n");
- close(socket_fd);
- return -1;
- }
- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- close(socket_fd);
- return 0;
- case SOCKET_OK:
- vfio_group_fd = vfio_mp_sync_receive_fd(socket_fd);
- /* if we got the fd, store it and return it */
- if (vfio_group_fd > 0) {
- close(socket_fd);
- cur_grp->group_no = iommu_group_no;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg.vfio_active_groups++;
- return vfio_group_fd;
- }
- /* fall-through on error */
- default:
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ p->req = SOCKET_REQ_GROUP;
+ p->group_no = iommu_group_no;
+ strcpy(req.name, "vfio");
+ req.len_param = sizeof(*p);
+ req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request(&req, &reply, &ts) == 0 && reply.nb_received > 0) {
+ rep = &reply.msgs[0];
+ p = (struct vfio_mp_param *)rep->param;
+ if (p->result == SOCKET_OK && rep->num_fds == 1) {
+ cur_grp->group_no = iommu_group_no;
+ vfio_group_fd = rep->fds[0];
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
}
+ free(reply.msgs);
}
- return -1;
+
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;
}

@@ -200,7 +184,10 @@ int
rte_vfio_clear_group(int vfio_group_fd)
{
int i;
- int socket_fd, ret;
+ struct rte_mp_msg req, *rep;
+ struct rte_mp_reply reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;

if (internal_config.process_type == RTE_PROC_PRIMARY) {

@@ -214,43 +201,23 @@ rte_vfio_clear_group(int vfio_group_fd)
return 0;
}

- /* This is just for SECONDARY processes */
- socket_fd = vfio_mp_sync_connect_to_primary();
-
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
-
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
+ p->req = SOCKET_CLR_GROUP;
+ p->group_no = vfio_group_fd;
+ strcpy(req.name, "vfio");
+ req.len_param = sizeof(*p);
+ req.num_fds = 0;

- if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
- RTE_LOG(ERR, EAL, " cannot send group fd!\n");
- close(socket_fd);
- return -1;
+ if (rte_mp_request(&req, &reply, &ts) == 0 && reply.nb_received > 0) {
+ rep = &reply.msgs[0];
+ p = (struct vfio_mp_param *)rep->param;
+ if (p->result == SOCKET_OK) {
+ free(reply.msgs);
+ return 0;
+ }
+ free(reply.msgs);
}

- ret = vfio_mp_sync_receive_request(socket_fd);
- switch (ret) {
- case SOCKET_NO_FD:
- RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
- close(socket_fd);
- break;
- case SOCKET_OK:
- close(socket_fd);
- return 0;
- case SOCKET_ERR:
- RTE_LOG(ERR, EAL, " Socket error\n");
- close(socket_fd);
- break;
- default:
- RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
- close(socket_fd);
- }
+ RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
return -1;
}

@@ -561,6 +528,11 @@ int
vfio_get_container_fd(void)
{
int ret, vfio_container_fd;
+ struct rte_mp_msg req, *rep;
+ struct rte_mp_reply reply;
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
+

/* if we're in a primary process, try to open the container */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
@@ -591,33 +563,28 @@ vfio_get_container_fd(void)
}

return vfio_container_fd;
- } else {
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via our socket
- */
- int socket_fd;
-
- socket_fd = vfio_mp_sync_connect_to_primary();
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
- return -1;
- }
- if (vfio_mp_sync_send_request(socket_fd, SOCKET_REQ_CONTAINER) < 0) {
- RTE_LOG(ERR, EAL, " cannot request container fd!\n");
- close(socket_fd);
- return -1;
- }
- vfio_container_fd = vfio_mp_sync_receive_fd(socket_fd);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot get container fd!\n");
- close(socket_fd);
- return -1;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(req.name, "vfio");
+ req.len_param = sizeof(*p);
+ req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ if (rte_mp_request(&req, &reply, &ts) == 0 && reply.nb_received > 0) {
+ rep = &reply.msgs[0];
+ p = (struct vfio_mp_param *)rep->param;
+ if (p->result == SOCKET_OK && rep->num_fds == 1) {
+ free(reply.msgs);
+ return rep->fds[0];
}
- close(socket_fd);
- return vfio_container_fd;
+ free(reply.msgs);
}

+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
return -1;
}

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 8059577..6b48969 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -88,15 +88,6 @@ struct vfio_iommu_spapr_tce_info {
#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS

/*
- * Function prototypes for VFIO multiprocess sync functions
- */
-int vfio_mp_sync_send_request(int socket, int req);
-int vfio_mp_sync_receive_request(int socket);
-int vfio_mp_sync_send_fd(int socket, int fd);
-int vfio_mp_sync_receive_fd(int socket);
-int vfio_mp_sync_connect_to_primary(void);
-
-/*
* we don't need to store device fd's anywhere since they can be obtained from
* the group fd via an ioctl() call.
*/
@@ -157,6 +148,12 @@ int vfio_mp_sync_setup(void);
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF

+struct vfio_mp_param {
+ int req;
+ int result;
+ int group_no;
+};
+
#endif /* VFIO_PRESENT */

#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index 7cc3c15..126e3c2 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -1,32 +1,15 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

+#include <unistd.h>
#include <string.h>
-#include <fcntl.h>
-#include <sys/socket.h>
-#include <pthread.h>
-
-/* sys/un.h with __USE_MISC uses strlen, which is unsafe */
-#ifdef __USE_MISC
-#define REMOVED_USE_MISC
-#undef __USE_MISC
-#endif
-#include <sys/un.h>
-/* make sure we redefine __USE_MISC only if it was previously undefined */
-#ifdef REMOVED_USE_MISC
-#define __USE_MISC
-#undef REMOVED_USE_MISC
-#endif

#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_malloc.h>
#include <rte_vfio.h>
+#include <rte_eal.h>

-#include "eal_filesystem.h"
#include "eal_vfio.h"
-#include "eal_thread.h"

/**
* @file
@@ -37,360 +20,81 @@

#ifdef VFIO_PRESENT

-#define SOCKET_PATH_FMT "%s/.%s_mp_socket"
-#define CMSGLEN (CMSG_LEN(sizeof(int)))
-#define FD_TO_CMSGHDR(fd, chdr) \
- do {\
- (chdr).cmsg_len = CMSGLEN;\
- (chdr).cmsg_level = SOL_SOCKET;\
- (chdr).cmsg_type = SCM_RIGHTS;\
- memcpy((chdr).__cmsg_data, &(fd), sizeof(fd));\
- } while (0)
-#define CMSGHDR_TO_FD(chdr, fd) \
- memcpy(&(fd), (chdr).__cmsg_data, sizeof(fd))
-
-static pthread_t socket_thread;
-static int mp_socket_fd;
-
-
-/* get socket path (/var/run if root, $HOME otherwise) */
-static void
-get_socket_path(char *buffer, int bufsz)
-{
- const char *dir = "/var/run";
- const char *home_dir = getenv("HOME");
-
- if (getuid() != 0 && home_dir != NULL)
- dir = home_dir;
-
- /* use current prefix as file path */
- snprintf(buffer, bufsz, SOCKET_PATH_FMT, dir,
- internal_config.hugefile_prefix);
-}
-
-
-
-/*
- * data flow for socket comm protocol:
- * 1. client sends SOCKET_REQ_CONTAINER or SOCKET_REQ_GROUP
- * 1a. in case of SOCKET_REQ_GROUP, client also then sends group number
- * 2. server receives message
- * 2a. in case of invalid group, SOCKET_ERR is sent back to client
- * 2b. in case of unbound group, SOCKET_NO_FD is sent back to client
- * 2c. in case of valid group, SOCKET_OK is sent and immediately followed by fd
- *
- * in case of any error, socket is closed.
- */
-
-/* send a request, return -1 on error */
-int
-vfio_mp_sync_send_request(int socket, int req)
-{
- struct msghdr hdr;
- struct iovec iov;
- int buf;
- int ret;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = req;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive a request and return it */
-int
-vfio_mp_sync_receive_request(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct iovec iov;
- int ret, req;
-
- memset(&hdr, 0, sizeof(hdr));
-
- buf = SOCKET_ERR;
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- return req;
-}
-
-/* send OK in message, fd in control message */
-int
-vfio_mp_sync_send_fd(int socket, int fd)
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
+ int fd;
+ int num;
int ret;
+ struct rte_mp_msg reply;
+ struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+ const struct vfio_mp_param *m = (const struct vfio_mp_param *)msg->param;

- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- buf = SOCKET_OK;
- FD_TO_CMSGHDR(fd, *chdr);
-
- ret = sendmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
- return 0;
-}
-
-/* receive OK in message, fd in control message */
-int
-vfio_mp_sync_receive_fd(int socket)
-{
- int buf;
- struct msghdr hdr;
- struct cmsghdr *chdr;
- char chdr_buf[CMSGLEN];
- struct iovec iov;
- int ret, req, fd;
-
- buf = SOCKET_ERR;
-
- chdr = (struct cmsghdr *) chdr_buf;
- memset(chdr, 0, sizeof(chdr_buf));
- memset(&hdr, 0, sizeof(hdr));
-
- hdr.msg_iov = &iov;
- hdr.msg_iovlen = 1;
- iov.iov_base = (char *) &buf;
- iov.iov_len = sizeof(buf);
- hdr.msg_control = chdr;
- hdr.msg_controllen = CMSGLEN;
-
- ret = recvmsg(socket, &hdr, 0);
- if (ret < 0)
- return -1;
-
- req = buf;
-
- if (req != SOCKET_OK)
- return -1;
-
- CMSGHDR_TO_FD(*chdr, fd);
-
- return fd;
-}
-
-/* connect socket_fd in secondary process to the primary process's socket */
-int
-vfio_mp_sync_connect_to_primary(void)
-{
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
- int socket_fd;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ if (msg->len_param != sizeof(*m)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- if (connect(socket_fd, (struct sockaddr *) &addr, sockaddr_len) == 0)
- return socket_fd;
-
- /* if connect failed */
- close(socket_fd);
- return -1;
-}
-
+ memset(&reply, 0, sizeof(reply));

-
-/*
- * socket listening thread for primary process
- */
-static __attribute__((noreturn)) void *
-vfio_mp_sync_thread(void __rte_unused * arg)
-{
- int ret, fd, vfio_data;
-
- /* wait for requests on the socket */
- for (;;) {
- int conn_sock;
- struct sockaddr_un addr;
- socklen_t sockaddr_len = sizeof(addr);
-
- /* this is a blocking call */
- conn_sock = accept(mp_socket_fd, (struct sockaddr *) &addr,
- &sockaddr_len);
-
- /* just restart on error */
- if (conn_sock == -1)
- continue;
-
- /* set socket to linger after close */
- struct linger l;
- l.l_onoff = 1;
- l.l_linger = 60;
-
- if (setsockopt(conn_sock, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0)
- RTE_LOG(WARNING, EAL, "Cannot set SO_LINGER option "
- "on listen socket (%s)\n", strerror(errno));
-
- ret = vfio_mp_sync_receive_request(conn_sock);
-
- switch (ret) {
- case SOCKET_REQ_CONTAINER:
- fd = vfio_get_container_fd();
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- else
- vfio_mp_sync_send_fd(conn_sock, fd);
- if (fd >= 0)
- close(fd);
- break;
- case SOCKET_REQ_GROUP:
- /* wait for group number */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- fd = vfio_get_group_fd(vfio_data);
-
- if (fd < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
+ switch (m->req) {
+ case SOCKET_REQ_GROUP:
+ r->req = SOCKET_REQ_GROUP;
+ r->group_no = m->group_no;
+ fd = vfio_get_group_fd(m->group_no);
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else if (fd == 0)
/* if VFIO group exists but isn't bound to VFIO driver */
- else if (fd == 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ r->result = SOCKET_NO_FD;
+ else {
/* if group exists and is bound to VFIO driver */
- else {
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- vfio_mp_sync_send_fd(conn_sock, fd);
- }
- break;
- case SOCKET_CLR_GROUP:
- /* wait for group fd */
- vfio_data = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_data < 0) {
- close(conn_sock);
- continue;
- }
-
- ret = rte_vfio_clear_group(vfio_data);
-
- if (ret < 0)
- vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
- else
- vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
- break;
- default:
- vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
- break;
+ r->result = SOCKET_OK;
+ num = 1;
}
- close(conn_sock);
- }
-}
-
-static int
-vfio_mp_sync_socket_setup(void)
-{
- int ret, socket_fd;
- struct sockaddr_un addr;
- socklen_t sockaddr_len;
-
- /* set up a socket */
- socket_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0);
- if (socket_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to create socket!\n");
+ break;
+ case SOCKET_CLR_GROUP:
+ r->req = SOCKET_CLR_GROUP;
+ r->group_no = m->group_no;
+ if (rte_vfio_clear_group(m->group_no) < 0)
+ r->result = SOCKET_NO_FD;
+ else
+ r->result = SOCKET_OK;
+ break;
+ case SOCKET_REQ_CONTAINER:
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ num = 1;
+ }
+ break;
+ default:
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
return -1;
}

- get_socket_path(addr.sun_path, sizeof(addr.sun_path));
- addr.sun_family = AF_UNIX;
-
- sockaddr_len = sizeof(struct sockaddr_un);
-
- unlink(addr.sun_path);
-
- ret = bind(socket_fd, (struct sockaddr *) &addr, sockaddr_len);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to bind socket: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
+ if (num == 1) {
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
}
+ strcpy(reply.name, "vfio");
+ reply.len_param = sizeof(*r);

- ret = listen(socket_fd, 50);
- if (ret) {
- RTE_LOG(ERR, EAL, "Failed to listen: %s!\n", strerror(errno));
- close(socket_fd);
- return -1;
- }
-
- /* save the socket in local configuration */
- mp_socket_fd = socket_fd;
-
- return 0;
+ ret = rte_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && num == 1)
+ close(fd);
+ return ret;
}

-/*
- * set up a local socket and tell it to listen for incoming connections
- */
int
vfio_mp_sync_setup(void)
{
- int ret;
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- if (vfio_mp_sync_socket_setup() < 0) {
- RTE_LOG(ERR, EAL, "Failed to set up local socket!\n");
- return -1;
- }
-
- ret = pthread_create(&socket_thread, NULL,
- vfio_mp_sync_thread, NULL);
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Failed to create thread for communication with secondary processes!\n");
- close(mp_socket_fd);
- return -1;
- }
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "vfio-sync");
- ret = rte_thread_setname(socket_thread, thread_name);
- if (ret)
- RTE_LOG(DEBUG, EAL,
- "Failed to set thread name for secondary processes!\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return rte_mp_action_register("vfio", vfio_mp_primary);

return 0;
}
-
#endif

--
2.7.4

Tan, Jianfeng

2018-01-25 19:15:53 UTC

Apologies, please ignore this version, which is not correct. I will send out
a new version.

Post by Jianfeng Tan
- Drop the patch 3 on vfio communication (postponed).
- Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
- Add nb_sent and nb_received in struct rte_mp_reply.
- Standardize the return value of sendmsg, request, reply: 0 on success,
(-1) on failure.
- If we find a peer error when sending a message in the primary, we try to
remove the secondary's socket; as there is no sync mechanism there
(a socket file cannot be flock'ed like a regular file), we use a more
complex socket name (with the TSC in it).
- Some other small changes.
- Add a pre-check for each API.
- Remove the limit of 8 secondary processes by discarding the original
register/unregister mechanism for secondary processes; instead, the primary
discovers secondary processes by scanning the folder for pattern matches.
- The previous implementation used two sockets for msg and request; this
version uses just one socket and receives all kinds of messages in the mp thread.
v1->v2: (Address comments from Anatoly and Konstantin)
- Use datagram unix socket to supersede stream unix socket + epoll.
- Change the secondary add/del mechanism as now we use connection-less channel.
- Add mp_mutex_action to sync action register/unregister/reference.
- Limit max length of action name to 64B.
- New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
- Formalize the errno handle.
- Some other small issues.
This patchset adds a generic channel for multi-process (primary/secondary)
communication.
Patch 1: addresses the purpose and how-to;
Patch 2: adds a synchronous way for requests which need an immediate response.
eal: add synchronous multi-process communication
vfio: use the generic multi-process channel
doc/guides/rel_notes/release_18_02.rst | 2 +
lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++--
lib/librte_eal/common/include/rte_eal.h | 58 +++-
lib/librte_eal/linuxapp/eal/eal.c | 14 +-
lib/librte_eal/linuxapp/eal/eal_vfio.c | 169 ++++------
lib/librte_eal/linuxapp/eal/eal_vfio.h | 15 +-
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c | 410 ++++---------------------
lib/librte_eal/rte_eal_version.map | 2 +
8 files changed, 429 insertions(+), 495 deletions(-)

Jianfeng Tan

2018-01-25 19:21:08 UTC

v3->v5:
- Drop the patch 3 on vfio communication (postponed).
- Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
- Add nb_sent and nb_received in struct rte_mp_reply.
- Standardize the return value of sendmsg, request, reply: 0 on success,
(-1) on failure.
- If we find a peer error when sending a message in the primary, we try to
remove the secondary's socket; as there is no sync mechanism there
(a socket file cannot be flock'ed like a regular file), we use a more
complex socket name (with the TSC in it).
- Some other small changes.

v3->v4:
- Wrong patches are sent out.

v2->v3:
- Add a pre-check for each API.
- Remove the limit of 8 secondary processes by discarding the original
register/unregister mechanism for secondary processes; instead, the primary
discovers secondary processes by scanning the folder for pattern matches.
- The previous implementation used two sockets for msg and request; this
version uses just one socket and receives all kinds of messages in the mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
- Use datagram unix socket to supersede stream unix socket + epoll.
- Change the secondary add/del mechanism as now we use connection-less channel.
- Add mp_mutex_action to sync action register/unregister/reference.
- Limit max length of action name to 64B.
- New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
- Formalize the errno handle.
- Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: addresses the purpose and how-to;
Patch 2: adds a synchronous way for requests which need an immediate response.

Jianfeng Tan (2):
eal: add channel for multi-process communication
eal: add synchronous multi-process communication

doc/guides/rel_notes/release_18_02.rst | 11 +
lib/librte_eal/bsdapp/eal/eal.c | 10 +-
lib/librte_eal/common/eal_common_proc.c | 655 +++++++++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 19 +-
lib/librte_eal/common/eal_private.h | 12 +-
lib/librte_eal/common/include/rte_eal.h | 133 ++++++-
lib/librte_eal/linuxapp/eal/eal.c | 10 +-
lib/librte_eal/rte_eal_version.map | 5 +
8 files changed, 848 insertions(+), 7 deletions(-)

--
2.7.4

Jianfeng Tan

2018-01-25 19:21:09 UTC

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A sent request may need the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

1. rte_mp_action_register() is used to register an action,
indexed by a string, when a component at the receiver side would like
to respond to messages from the peer process.
2. rte_mp_action_unregister() is used to unregister the action
if the calling component no longer wants to respond to the messages.
3. rte_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.

Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
Reviewed-by: Anatoly Burakov <***@intel.com>
Acked-by: Konstantin Ananyev <***@intel.com>
---
doc/guides/rel_notes/release_18_02.rst | 9 +
lib/librte_eal/bsdapp/eal/eal.c | 10 +-
lib/librte_eal/common/eal_common_proc.c | 439 +++++++++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 19 +-
lib/librte_eal/common/eal_private.h | 12 +-
lib/librte_eal/common/include/rte_eal.h | 77 +++++-
lib/librte_eal/linuxapp/eal/eal.c | 10 +-
lib/librte_eal/rte_eal_version.map | 3 +
8 files changed, 572 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 00b3224..be6ac99 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -151,6 +151,15 @@ New Features
renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
to PMD agnostic ``eventdev_pipeline``.

+* **Added new multi-process communication channel**
+
+ Added a generic channel in EAL for multi-process (primary/secondary) communication.
+ Consumers of this channel need to register an action with an action name to respond
+ to a received message; the actions will be identified by the action name and executed
+ in the context of a new dedicated thread for this channel. The list of new APIs:
+
+ * ``rte_mp_action_register`` and ``rte_mp_action_unregister`` are for action (un)registration.
+ * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.

API Changes
-----------
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 04cbd81..fcc9828 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -1,7 +1,7 @@
/*-
* BSD LICENSE
*
- * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
* Copyright(c) 2014 6WIND S.A.
* All rights reserved.
*
@@ -603,6 +603,14 @@ rte_eal_init(int argc, char **argv)

rte_config_init();

+ if (rte_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..aea0829 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -1,15 +1,51 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016 Intel Corporation
+ * Copyright(c) 2016-2018 Intel Corporation
*/

-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
#include <fcntl.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>

+#include "eal_private.h"
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"

+static int mp_fd = -1;
+static char mp_filter[PATH_MAX]; /* Filter for secondary process sockets */
+static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+ TAILQ_ENTRY(action_entry) next;
+ char action_name[RTE_MP_MAX_NAME_LEN];
+ rte_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+ TAILQ_HEAD_INITIALIZER(action_entry_list);
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -31,3 +67,402 @@ rte_eal_primary_proc_alive(const char *config_file_path)

return !!ret;
}
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+ struct action_entry *entry;
+
+ TAILQ_FOREACH(entry, &action_entry_list, next) {
+ if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0)
+ break;
+ }
+
+ return entry;
+}
+
+static int
+validate_action_name(const char *name)
+{
+ if (name == NULL) {
+ RTE_LOG(ERR, EAL, "Action name cannot be NULL\n");
+ rte_errno = -EINVAL;
+ return -1;
+ }
+ if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) {
+ RTE_LOG(ERR, EAL, "Length of action name is zero\n");
+ rte_errno = -EINVAL;
+ return -1;
+ }
+ if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) {
+ rte_errno = -E2BIG;
+ return -1;
+ }
+ return 0;
+}
+
+int
+rte_mp_action_register(const char *name, rte_mp_t action)
+{
+ struct action_entry *entry;
+
+ if (validate_action_name(name))
+ return -1;
+
+ entry = malloc(sizeof(struct action_entry));
+ if (entry == NULL) {
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+ strcpy(entry->action_name, name);
+ entry->action = action;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ if (find_action_entry_by_name(name) != NULL) {
+ pthread_mutex_unlock(&mp_mutex_action);
+ rte_errno = -EEXIST;
+ free(entry);
+ return -1;
+ }
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return 0;
+}
+
+void
+rte_mp_action_unregister(const char *name)
+{
+ struct action_entry *entry;
+
+ if (validate_action_name(name))
+ return;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(name);
+ if (entry == NULL) {
+ pthread_mutex_unlock(&mp_mutex_action);
+ return;
+ }
+ TAILQ_REMOVE(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ free(entry);
+}
+
+static int
+read_msg(struct rte_mp_msg *msg)
+{
+ int msglen;
+ struct iovec iov;
+ struct msghdr msgh;
+ char control[CMSG_SPACE(sizeof(msg->fds))];
+ struct cmsghdr *cmsg;
+ int buflen = sizeof(*msg) - sizeof(msg->fds);
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = msg;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ msglen = recvmsg(mp_fd, &msgh, 0);
+ if (msglen < 0) {
+ RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+ return -1;
+ }
+
+ if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ RTE_LOG(ERR, EAL, "truncted msg\n");
+ return -1;
+ }
+
+ /* read auxiliary FDs if any */
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void
+process_msg(struct rte_mp_msg *msg)
+{
+ struct action_entry *entry;
+ rte_mp_t action = NULL;
+
+ RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(msg->name);
+ if (entry != NULL)
+ action = entry->action;
+ pthread_mutex_unlock(&mp_mutex_action);
+
+ if (!action)
+ RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
+ else if (action(msg) < 0)
+ RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+ struct rte_mp_msg msg;
+
+ while (1) {
+ if (read_msg(&msg) == 0)
+ process_msg(&msg);
+ }
+
+ return NULL;
+}
+
+static int
+open_socket_fd(void)
+{
+ struct sockaddr_un un;
+ const char *prefix = eal_mp_socket_path();
+
+ mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+ if (mp_fd < 0) {
+ RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix);
+ else {
+ snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d_%"PRIx64,
+ prefix, getpid(), rte_rdtsc());
+ }
+ unlink(un.sun_path); /* May still exist since last run */
+ if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+ RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+ un.sun_path, strerror(errno));
+ close(mp_fd);
+ return -1;
+ }
+
+ RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
+ return mp_fd;
+}
+
+static int
+unlink_sockets(const char *filter)
+{
+ int dir_fd;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+ return -1;
+ }
+ dir_fd = dirfd(mp_dir);
+
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(filter, ent->d_name, 0) == 0)
+ unlinkat(dir_fd, ent->d_name, 0);
+ }
+
+ closedir(mp_dir);
+ return 0;
+}
+
+static void
+unlink_socket_by_path(const char *path)
+{
+ char *filename;
+ char *fullpath = strdup(path);
+
+ if (!fullpath)
+ return;
+ filename = basename(fullpath);
+ unlink_sockets(filename);
+ free(fullpath);
+ RTE_LOG(INFO, EAL, "Remove socket %s\n", path);
+}
+
+int
+rte_mp_channel_init(void)
+{
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ char *path;
+ pthread_t tid;
+
+ snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+ internal_config.hugefile_prefix);
+
+ path = strdup(eal_mp_socket_path());
+ snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+ free(path);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ unlink_sockets(mp_filter)) {
+ RTE_LOG(ERR, EAL, "failed to unlink mp sockets\n");
+ return -1;
+ }
+
+ if (open_socket_fd() < 0)
+ return -1;
+
+ if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
+ strerror(errno));
+ close(mp_fd);
+ mp_fd = -1;
+ return -1;
+ }
+
+ /* try best to set thread name */
+ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+ rte_thread_setname(tid, thread_name);
+ return 0;
+}
+
+/**
+ * Return -1, as fail to send message and it's caused by the local side.
+ * Return 0, as fail to send message and it's caused by the remote side.
+ * Return 1, as succeed to send message.
+ *
+ */
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+ int snd;
+ struct iovec iov;
+ struct msghdr msgh;
+ struct cmsghdr *cmsg;
+ struct sockaddr_un dst;
+ int fd_size = msg->num_fds * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+
+ memset(&dst, 0, sizeof(dst));
+ dst.sun_family = AF_UNIX;
+ snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = msg;
+ iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+
+ msgh.msg_name = &dst;
+ msgh.msg_namelen = sizeof(dst);
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), msg->fds, fd_size);
+
+ do {
+ snd = sendmsg(mp_fd, &msgh, 0);
+ } while (snd < 0 && errno == EINTR);
+
+ if (snd < 0) {
+ rte_errno = errno;
+ /* Check if it caused by peer process exits */
+ if (errno == -ECONNREFUSED) {
+ /* We don't unlink the primary's socket here */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ unlink_socket_by_path(dst_path);
+ return 0;
+ }
+ if (errno == -ENOBUFS) {
+ RTE_LOG(ERR, EAL, "Peer cannot receive message %s\n",
+ dst_path);
+ return 0;
+ }
+ RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n",
+ dst_path, strerror(errno));
+ return -1;
+ }
+
+ return 1;
+}
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+ int ret = 0;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ if (send_msg(eal_mp_socket_path(), msg) < 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ /* broadcast to all secondary processes */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+ mp_dir_path);
+ rte_errno = errno;
+ return -1;
+ }
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ if (send_msg(ent->d_name, msg) < 0)
+ ret = -1;
+ }
+ closedir(mp_dir);
+
+ return ret;
+}
+
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+ if (msg == NULL) {
+ RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+ rte_errno = -EINVAL;
+ return false;
+ }
+
+ if (validate_action_name(msg->name))
+ return false;
+
+ if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+ RTE_LOG(ERR, EAL, "Message data is too long\n");
+ rte_errno = -E2BIG;
+ return false;
+ }
+
+ if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+ RTE_MP_MAX_FD_NUM);
+ rte_errno = -E2BIG;
+ return false;
+ }
+
+ return true;
+}
+
+int
+rte_mp_sendmsg(struct rte_mp_msg *msg)
+{
+ if (!check_input(msg))
+ return -1;
+
+ RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
+ return mp_send(msg);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..4708dd5 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

/**
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
return buffer;
}

+/**
+ * Path of primary/secondary communication unix socket file.
+ *
+ * The result lives in a function-local static buffer.
+ */
+#define MP_SOCKET_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_socket_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *home_dir = getenv("HOME");
+	const char *directory;
+
+	/* root, or no $HOME set: use the default config directory */
+	if (getuid() == 0 || home_dir == NULL)
+		directory = default_config_dir;
+	else
+		directory = home_dir;
+
+	snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..0b28770 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#ifndef _EAL_PRIVATE_H_
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
*/
struct rte_bus *rte_bus_find_by_device_name(const char *str);

+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * Called from rte_eal_init(); there, a failure is treated as fatal only
+ * when the calling process is the primary.
+ *
+ * @return
+ * 0 on success;
+ * (<0) on failure.
+ */
+
+int rte_mp_channel_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 2aba2c8..1d42e9c 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#ifndef _RTE_EAL_H_
@@ -186,6 +186,81 @@ int rte_eal_init(int argc, char **argv);
*/
int rte_eal_primary_proc_alive(const char *config_file_path);

+#define RTE_MP_MAX_FD_NUM 8 /* The max amount of fds */
+#define RTE_MP_MAX_NAME_LEN 64 /* Size of the action name buffer, terminating NUL included */
+#define RTE_MP_MAX_PARAM_LEN 256 /* The max length of param */
+struct rte_mp_msg { /* One message exchanged between primary and secondary */
+ char name[RTE_MP_MAX_NAME_LEN]; /* action name; selects the handler on the receiver */
+ int len_param; /* number of bytes used in param[]; at most RTE_MP_MAX_PARAM_LEN */
+ int num_fds; /* number of entries used in fds[]; at most RTE_MP_MAX_FD_NUM */
+ uint8_t param[RTE_MP_MAX_PARAM_LEN]; /* payload bytes */
+ int fds[RTE_MP_MAX_FD_NUM]; /* file descriptors passed along with the message */
+};
+
+/**
+ * Action function typedef used by other components.
+ *
+ * As we create socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action if the calling component wants
+ * to respond to messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param name
+ * The name argument is the unique key used to look up the action. It must
+ * be non-empty and shorter than RTE_MP_MAX_NAME_LEN.
+ *
+ * @param action
+ * The action argument is the function pointer to the action function.
+ *
+ * @return
+ * - 0 on success.
+ * - (<0) on failure.
+ */
+int rte_mp_action_register(const char *name, rte_mp_t action);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component no
+ * longer wants to respond to messages from the corresponding component in
+ * its primary process or secondary processes.
+ *
+ * @param name
+ * The name argument is the unique key used to look up the action.
+ *
+ */
+void rte_mp_action_unregister(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be handled by the action
+ * identified by name in the peer process. It does not wait for a response.
+ *
+ * @param msg
+ * The msg argument contains the customized message.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_sendmsg(struct rte_mp_msg *msg);
+
/**
* Usage function typedef used by the application usage function.
*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..53e29e4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -1,7 +1,7 @@
/*-
* BSD LICENSE
*
- * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
* Copyright(c) 2012-2014 6WIND S.A.
* All rights reserved.
*
@@ -852,6 +852,14 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ if (rte_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
#ifdef VFIO_PRESENT
if (rte_eal_vfio_setup() < 0) {
rte_eal_init_alert("Cannot init VFIO\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7088b72..adeadfb 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -217,6 +217,9 @@ EXPERIMENTAL {
rte_eal_devargs_remove;
rte_eal_hotplug_add;
rte_eal_hotplug_remove;
+ rte_eal_mp_action_register;
+ rte_eal_mp_action_unregister;
+ rte_eal_mp_sendmsg;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Jianfeng Tan

2018-01-25 19:21:10 UTC

We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.

We add two APIs, rte_mp_request() and rte_mp_reply(), for
such use cases. By invoking rte_mp_request(), a request message
is sent out, and the caller then waits for a reply message. The caller
can specify the timeout. The response messages are collected
and returned so that the caller can decide how to interpret them.

The API rte_mp_reply() is always called by an mp action handler.
Here we add another parameter to rte_mp_t so that the action
handler knows which peer address to reply to.

sender-process receiver-process
---------------------- ----------------

thread-n
|_rte_eal_mp_request() ----------> mp-thread
|_timedwait() |_process_msg()
|_action()
|_rte_eal_mp_reply()
mp_thread <---------------------|
|_process_msg()
|_signal(send_thread)
thread-m <----------|
|_collect-reply

* A secondary process is only allowed to talk to the primary process.
* If there are multiple secondary processes for the primary process,
it will send request to peer1, collect response from peer1; then
send request to peer2, collect response from peer2, and so on.
* When thread-n is sending request, thread-m of that process can send
request at the same time.
* For pair <action_name, peer>, we guarantee that only one such request
is on the fly.

Suggested-by: Anatoly Burakov <***@intel.com>
Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
Reviewed-by: Anatoly Burakov <***@intel.com>
Acked-by: Konstantin Ananyev <***@intel.com>
---
doc/guides/rel_notes/release_18_02.rst | 2 +
lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 58 +++++++-
lib/librte_eal/rte_eal_version.map | 2 +
4 files changed, 296 insertions(+), 20 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index be6ac99..39425a4 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -160,6 +160,8 @@ New Features

 * ``rte_mp_action_register`` and ``rte_mp_action_unregister`` are for action (un)registration.
* ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
+ * ``rte_mp_request`` is for sending a request message and will block until
+ it gets a reply message which is sent from the peer by ``rte_mp_reply``.

API Changes
-----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index aea0829..6ad73f5 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
@@ -46,6 +47,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
static struct action_entry_list action_entry_list =
TAILQ_HEAD_INITIALIZER(action_entry_list);

+enum mp_type { /* Kind of message carried by struct mp_msg_internal */
+ MP_MSG, /* Share message with peers, will not block */
+ MP_REQ, /* Request for information, will block for a reply */
+ MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal { /* What is actually sent on the socket: type tag + user message */
+ int type; /* one of enum mp_type */
+ struct rte_mp_msg msg; /* the user-visible message */
+};
+
+struct sync_request { /* One in-flight request waiting for its reply */
+ TAILQ_ENTRY(sync_request) next;
+ int reply_received; /* set to 1 by process_msg() once the reply is copied */
+ char dst[PATH_MAX]; /* socket path of the peer the request was sent to */
+ struct rte_mp_msg *request; /* the request; its name is part of the lookup key */
+ struct rte_mp_msg *reply; /* where process_msg() copies the reply */
+ pthread_cond_t cond; /* signalled when the reply arrives */
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct { /* All pending requests of this process */
+ struct sync_request_list requests;
+ pthread_mutex_t lock; /* taken around every access to requests in this file */
+} sync_requests = {
+ .requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+/*
+ * Look up the pending request identified by <dst, act_name>.
+ * Returns the matching entry, or NULL when there is none. The callers in
+ * this file hold sync_requests.lock around the call.
+ */
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+	struct sync_request *cur;
+
+	TAILQ_FOREACH(cur, &sync_requests.requests, next) {
+		if (strcmp(cur->dst, dst) != 0)
+			continue;
+		if (strcmp(cur->request->name, act_name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -149,19 +194,21 @@ rte_mp_action_unregister(const char *name)
}

static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
int msglen;
struct iovec iov;
struct msghdr msgh;
- char control[CMSG_SPACE(sizeof(msg->fds))];
+ char control[CMSG_SPACE(sizeof(m->msg.fds))];
struct cmsghdr *cmsg;
- int buflen = sizeof(*msg) - sizeof(msg->fds);
+ int buflen = sizeof(*m) - sizeof(m->msg.fds);

memset(&msgh, 0, sizeof(msgh));
- iov.iov_base = msg;
+ iov.iov_base = m;
iov.iov_len = buflen;

+ msgh.msg_name = s;
+ msgh.msg_namelen = sizeof(*s);
msgh.msg_iov = &iov;
msgh.msg_iovlen = 1;
msgh.msg_control = control;
@@ -183,7 +230,7 @@ read_msg(struct rte_mp_msg *msg)
cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
if ((cmsg->cmsg_level == SOL_SOCKET) &&
(cmsg->cmsg_type == SCM_RIGHTS)) {
- memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
break;
}
}
@@ -192,12 +239,28 @@ read_msg(struct rte_mp_msg *msg)
}

static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
+ struct sync_request *sync_req;
struct action_entry *entry;
+ struct rte_mp_msg *msg = &m->msg;
rte_mp_t action = NULL;

RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+ if (m->type == MP_REP) {
+ pthread_mutex_lock(&sync_requests.lock);
+ sync_req = find_sync_request(s->sun_path, msg->name);
+ if (sync_req) {
+ memcpy(sync_req->reply, msg, sizeof(*msg));
+ sync_req->reply_received = 1;
+ pthread_cond_signal(&sync_req->cond);
+ } else
+ RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+ pthread_mutex_unlock(&sync_requests.lock);
+ return;
+ }
+
pthread_mutex_lock(&mp_mutex_action);
entry = find_action_entry_by_name(msg->name);
if (entry != NULL)
@@ -206,18 +269,19 @@ process_msg(struct rte_mp_msg *msg)

if (!action)
RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
- else if (action(msg) < 0)
+ else if (action(msg, s->sun_path) < 0)
RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
}

static void *
mp_handle(void *arg __rte_unused)
{
- struct rte_mp_msg msg;
+ struct mp_msg_internal msg;
+ struct sockaddr_un sa;

while (1) {
- if (read_msg(&msg) == 0)
- process_msg(&msg);
+ if (read_msg(&msg, &sa) == 0)
+ process_msg(&msg, &sa);
}

return NULL;
@@ -336,16 +400,20 @@ rte_mp_channel_init(void)
*
*/
static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
{
int snd;
struct iovec iov;
struct msghdr msgh;
struct cmsghdr *cmsg;
struct sockaddr_un dst;
+ struct mp_msg_internal m;
int fd_size = msg->num_fds * sizeof(int);
char control[CMSG_SPACE(fd_size)];

+ m.type = type;
+ memcpy(&m.msg, msg, sizeof(*msg));
+
memset(&dst, 0, sizeof(dst));
dst.sun_family = AF_UNIX;
snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -353,8 +421,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
memset(&msgh, 0, sizeof(msgh));
memset(control, 0, sizeof(control));

- iov.iov_base = msg;
- iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+ iov.iov_base = &m;
+ iov.iov_len = sizeof(m) - sizeof(msg->fds);

msgh.msg_name = &dst;
msgh.msg_namelen = sizeof(dst);
@@ -396,14 +464,17 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
}

static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
{
int ret = 0;
DIR *mp_dir;
struct dirent *ent;

- if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
- if (send_msg(eal_mp_socket_path(), msg) < 0)
+ if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY))
+ peer = eal_mp_socket_path();
+
+ if (peer) {
+ if (send_msg(peer, msg, type) < 0)
return -1;
else
return 0;
@@ -421,11 +492,11 @@ mp_send(struct rte_mp_msg *msg)
if (fnmatch(mp_filter, ent->d_name, 0) != 0)
continue;

- if (send_msg(ent->d_name, msg) < 0)
+ if (send_msg(ent->d_name, msg, type) < 0)
ret = -1;
}
- closedir(mp_dir);

+ closedir(mp_dir);
return ret;
}

@@ -464,5 +535,150 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
return -1;

RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
- return mp_send(msg);
+ return mp_send(msg, NULL, MP_MSG);
+}
+
+/**
+ * Send one request to a single peer and wait for its reply.
+ *
+ * The request is tracked in sync_requests while in flight so that the
+ * receiving thread (process_msg()) can hand the reply back and wake us up.
+ * Only one request per <dst, action name> pair may be pending at a time.
+ *
+ * @param dst
+ *  Socket path of the peer.
+ * @param req
+ *  The request message.
+ * @param reply
+ *  Accumulator: nb_sent is incremented once the request is sent; on a
+ *  reply, msgs is grown with realloc() and nb_received is incremented.
+ * @param ts
+ *  Absolute deadline passed to pthread_cond_timedwait().
+ *
+ * @return
+ *  0 if a reply was stored, or if send_msg() reported that the peer could
+ *  not be reached (returned 0); -1 on error, including timeout.
+ */
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+	       struct rte_mp_reply *reply, const struct timespec *ts)
+{
+	int ret;
+	struct rte_mp_msg msg, *tmp;
+	struct sync_request sync_req, *exist;
+
+	sync_req.reply_received = 0;
+	snprintf(sync_req.dst, sizeof(sync_req.dst), "%s", dst);
+	sync_req.request = req;
+	sync_req.reply = &msg;
+	pthread_cond_init(&sync_req.cond, NULL);
+
+	pthread_mutex_lock(&sync_requests.lock);
+	exist = find_sync_request(dst, req->name);
+	if (!exist)
+		TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+	if (exist) {
+		RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+		rte_errno = -EEXIST;
+		ret = -1;
+		goto out;
+	}
+
+	ret = send_msg(dst, req, MP_REQ);
+	if (ret <= 0) {
+		/*
+		 * Nothing was sent, so no reply will come. sync_req lives on
+		 * this stack frame and must not stay linked in the global
+		 * list after we return.
+		 */
+		pthread_mutex_lock(&sync_requests.lock);
+		TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+		pthread_mutex_unlock(&sync_requests.lock);
+		if (ret < 0) {
+			RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+				dst, req->name);
+			ret = -1;
+		}
+		goto out;
+	}
+
+	reply->nb_sent++;
+
+	pthread_mutex_lock(&sync_requests.lock);
+	/*
+	 * Test the predicate before waiting: the reply may already have been
+	 * delivered between send_msg() and taking the lock. Keep waiting on
+	 * spurious wakeups; stop on ETIMEDOUT or any other error.
+	 */
+	while (sync_req.reply_received == 0) {
+		if (pthread_cond_timedwait(&sync_req.cond,
+					   &sync_requests.lock, ts) != 0)
+			break;
+	}
+	/* We got the lock now */
+	TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+	pthread_mutex_unlock(&sync_requests.lock);
+
+	if (sync_req.reply_received == 0) {
+		RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ETIMEDOUT;
+		ret = -1;
+		goto out;
+	}
+
+	tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1));
+	if (!tmp) {
+		RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+			dst, req->name);
+		rte_errno = -ENOMEM;
+		ret = -1;
+		goto out;
+	}
+	memcpy(&tmp[reply->nb_received], &msg, sizeof(msg));
+	reply->msgs = tmp;
+	reply->nb_received++;
+	ret = 0;
+
+out:
+	/* sync_req is no longer reachable from the list at this point */
+	pthread_cond_destroy(&sync_req.cond);
+	return ret;
+}
+
+/*
+ * Send a request and collect the replies.
+ *
+ * ts is a relative timeout; it is converted once into an absolute deadline
+ * that is shared by all peers. A secondary process asks the primary only;
+ * the primary asks every matching socket in mp_dir_path, one after another.
+ * Returns 0 on success, -1 if any step failed.
+ */
+int
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+		const struct timespec *ts)
+{
+	int ret = 0;
+	int n;
+	int saved_errno;
+	DIR *mp_dir;
+	struct dirent *ent;
+	struct timeval now;
+	struct timespec end;
+	char path[PATH_MAX];
+
+	/* validate before touching req->name */
+	if (check_input(req) == false)
+		return -1;
+	if (reply == NULL || ts == NULL) {
+		RTE_LOG(ERR, EAL, "reply and timeout must be specified\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+
+	RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+	if (gettimeofday(&now, NULL) < 0) {
+		/* capture errno before logging can overwrite it */
+		saved_errno = errno;
+		RTE_LOG(ERR, EAL, "Faile to get current time\n");
+		rte_errno = saved_errno;
+		return -1;
+	}
+
+	end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+	end.tv_sec = now.tv_sec + ts->tv_sec +
+			(now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+	reply->nb_sent = 0;
+	reply->nb_received = 0;
+	reply->msgs = NULL;
+
+	/* for secondary process, send request to the primary process only */
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+		return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+	/* for primary process, broadcast request, and collect reply 1 by 1 */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		saved_errno = errno;
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+		rte_errno = saved_errno;
+		return -1;
+	}
+
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		/*
+		 * d_name is only the file name inside mp_dir_path. The full
+		 * path is needed both to reach the socket regardless of the
+		 * cwd and to match the sender address of the reply.
+		 */
+		n = snprintf(path, sizeof(path), "%s/%s", mp_dir_path,
+			     ent->d_name);
+		if (n < 0 || (size_t)n >= sizeof(path)) {
+			RTE_LOG(ERR, EAL, "Socket path too long: %s/%s\n",
+				mp_dir_path, ent->d_name);
+			ret = -1;
+			continue;
+		}
+
+		if (mp_request_one(path, req, reply, &end))
+			ret = -1;
+	}
+
+	closedir(mp_dir);
+	return ret;
+}
+
+/*
+ * Send a reply (MP_REP) to the peer whose socket path is given.
+ * Returns 0 on success, -1 on failure with rte_errno set.
+ */
+int
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+	/* validate first: msg may be NULL and must not be dereferenced */
+	if (check_input(msg) == false)
+		return -1;
+
+	RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+	if (peer == NULL) {
+		RTE_LOG(ERR, EAL, "peer is not specified\n");
+		rte_errno = -EINVAL;
+		return -1;
+	}
+
+	return mp_send(msg, peer, MP_REP);
 }
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 1d42e9c..9207ad9 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@

#include <stdint.h>
#include <sched.h>
+#include <time.h>

#include <rte_config.h>
#include <rte_per_lcore.h>
@@ -197,13 +198,19 @@ struct rte_mp_msg {
int fds[RTE_MP_MAX_FD_NUM];
};

+struct rte_mp_reply { /* Result of rte_mp_request() */
+ int nb_sent; /* number of peers the request was successfully sent to */
+ int nb_received; /* number of replies stored in msgs */
+ struct rte_mp_msg *msgs; /* realloc()'d array of replies; caller to free */
+};
+
/**
* Action function typedef used by other components.
*
* As we create socket channel for primary/secondary communication, use
* this function typedef to register action for coming messages.
*/
-typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer);

/**
* @warning
@@ -262,6 +269,55 @@ void rte_mp_action_unregister(const char *name);
int rte_mp_sendmsg(struct rte_mp_msg *msg);

 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until it receives a reply message from the peer process or the
+ * timeout expires.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ * The req argument contains the customized request message.
+ *
+ * @param reply
+ * The reply argument will be for storing all the replied messages;
+ * the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ * The ts argument specifies how long we can wait for the peer(s) to reply,
+ * relative to the time of the call.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+ const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously. It is meant to be called from an action handler.
+ *
+ * @param msg
+ * The msg argument contains the customized message.
+ *
+ * @param peer
+ * The peer argument is the pointer to the peer socket path, i.e. the peer
+ * value that was passed to the action handler. It must not be NULL.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index adeadfb..2cb6b07 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,8 @@ EXPERIMENTAL {
rte_eal_mp_action_register;
rte_eal_mp_action_unregister;
rte_eal_mp_sendmsg;
+ rte_eal_mp_request;
+ rte_eal_mp_reply;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Thomas Monjalon

2018-01-25 21:23:03 UTC

Post by Jianfeng Tan
- Drop the patch 3 on vfio communication (postponed).
- Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.

You did not update the .map file for this change.

Jianfeng Tan

2018-01-26 03:41:20 UTC

v5->v6:
- Correct the API name issue in rte_eal_version.map.

v3->v5:
- Drop the patch 3 on vfio communication (postponed).
- Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
- Add nb_sent and nb_received in struct rte_mp_reply.
- Standardize the return val of sendmsg, request, reply: 0 on success,
(-1) on failure.
- If we find a peer error when we send msg in primary, we try to
remove the secondary socket; as there is no sync mechanism there
(cannot do flock like regular file for socket file), we use a more
complex socket name (with tsc in it).
- Some other small changes.

v3->v4:
- Wrong patches are sent out.

v2->v3:
- Add pre-check for each APIs.
- Remove the limitation of 8 secondary processes by: discard original
register/unregister mechanism of secondary process, instead, primary
discovers secondary processes by looking up the folder for regex match.
- Previous implementation use two sockets for msg and request, this version
just uses one socket. And receive all kinds of messages in mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
- Use datagram unix socket to supersede stream unix socket + epoll.
- Change the secondary add/del mechanism as now we use connection-less channel.
- Add mp_mutex_action to sync action register/unregister/reference.
- Limit max length of action name to 64B.
- New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
- Formalize the errno handle.
- Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and howto;
Patch 2: add a synchronous way for the requests which need an immediate response.

Jianfeng Tan (2):
eal: add channel for multi-process communication
eal: add synchronous multi-process communication

doc/guides/rel_notes/release_18_02.rst | 11 +
lib/librte_eal/bsdapp/eal/eal.c | 10 +-
lib/librte_eal/common/eal_common_proc.c | 655 +++++++++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 19 +-
lib/librte_eal/common/eal_private.h | 12 +-
lib/librte_eal/common/include/rte_eal.h | 133 ++++++-
lib/librte_eal/linuxapp/eal/eal.c | 10 +-
lib/librte_eal/rte_eal_version.map | 5 +
8 files changed, 848 insertions(+), 7 deletions(-)

--
2.7.4

Jianfeng Tan

2018-01-26 03:41:21 UTC

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

1. rte_mp_action_register() is used to register an action,
indexed by a string, when a component at receiver side would like
to respond to the messages from the peer process.
2. rte_mp_action_unregister() is used to unregister the action
if the calling component does not want to respond to the messages.
3. rte_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.

Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
Reviewed-by: Anatoly Burakov <***@intel.com>
Acked-by: Konstantin Ananyev <***@intel.com>
---
doc/guides/rel_notes/release_18_02.rst | 9 +
lib/librte_eal/bsdapp/eal/eal.c | 10 +-
lib/librte_eal/common/eal_common_proc.c | 439 +++++++++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 19 +-
lib/librte_eal/common/eal_private.h | 12 +-
lib/librte_eal/common/include/rte_eal.h | 77 +++++-
lib/librte_eal/linuxapp/eal/eal.c | 10 +-
lib/librte_eal/rte_eal_version.map | 3 +
8 files changed, 572 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 00b3224..be6ac99 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -151,6 +151,15 @@ New Features
renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
to PMD agnostic ``eventdev_pipeline``.

+* **Added new multi-process communication channel**
+
+ Added a generic channel in EAL for multi-process (primary/secondary) communication.
+ Consumers of this channel need to register an action with an action name to respond to
+ a received message; the actions will be identified by the action name and executed
+ in the context of a new dedicated thread for this channel. The list of new APIs:
+
+ * ``rte_mp_action_register`` and ``rte_mp_action_unregister`` are for action (un)registration.
+ * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.

API Changes
-----------
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 04cbd81..fcc9828 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -1,7 +1,7 @@
/*-
* BSD LICENSE
*
- * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
* Copyright(c) 2014 6WIND S.A.
* All rights reserved.
*
@@ -603,6 +603,14 @@ rte_eal_init(int argc, char **argv)

rte_config_init();

+ if (rte_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..aea0829 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -1,15 +1,51 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016 Intel Corporation
+ * Copyright(c) 2016-2018 Intel Corporation
*/

-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
#include <fcntl.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>

+#include "eal_private.h"
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"

+static int mp_fd = -1;
+static char mp_filter[PATH_MAX]; /* Filter for secondary process sockets */
+static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER; /* guards action_entry_list */
+
+struct action_entry { /* One registered <name, handler> pair */
+ TAILQ_ENTRY(action_entry) next;
+ char action_name[RTE_MP_MAX_NAME_LEN]; /* lookup key, NUL-terminated */
+ rte_mp_t action; /* handler registered for action_name */
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+ TAILQ_HEAD_INITIALIZER(action_entry_list);
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -31,3 +67,402 @@ rte_eal_primary_proc_alive(const char *config_file_path)

return !!ret;
}
+
+/*
+ * Look up a registered action by name.
+ *
+ * Names are compared over at most RTE_MP_MAX_NAME_LEN bytes. Returns the
+ * matching entry, or NULL when the list holds no entry with that name.
+ * The list itself is not locked here; the callers in this file take
+ * mp_mutex_action around the call.
+ */
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+	struct action_entry *cur;
+
+	TAILQ_FOREACH(cur, &action_entry_list, next) {
+		if (!strncmp(cur->action_name, name, RTE_MP_MAX_NAME_LEN))
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Check that an action name can be used as a lookup key: it must be
+ * non-NULL, non-empty, and short enough to fit, terminator included, in
+ * RTE_MP_MAX_NAME_LEN bytes.
+ *
+ * Returns 0 when the name is valid. Otherwise returns -1 and sets
+ * rte_errno to a positive errno value (EINVAL or E2BIG), matching how
+ * rte_errno is assigned elsewhere in EAL.
+ */
+static int
+validate_action_name(const char *name)
+{
+	size_t len;
+
+	if (name == NULL) {
+		RTE_LOG(ERR, EAL, "Action name cannot be NULL\n");
+		rte_errno = EINVAL;
+		return -1;
+	}
+
+	len = strnlen(name, RTE_MP_MAX_NAME_LEN);
+	if (len == 0) {
+		RTE_LOG(ERR, EAL, "Length of action name is zero\n");
+		rte_errno = EINVAL;
+		return -1;
+	}
+	if (len == RTE_MP_MAX_NAME_LEN) {
+		/* no terminator found within the limit: the name cannot fit */
+		RTE_LOG(ERR, EAL, "Action name is too long\n");
+		rte_errno = E2BIG;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Register an action callback under a unique name.
+ *
+ * The entry is allocated before the lock is taken so that the critical
+ * section only covers the duplicate check and the list insertion.
+ *
+ * Returns 0 on success. Returns -1 with rte_errno set to a positive errno
+ * value on failure: the code chosen by validate_action_name() for a bad
+ * name, ENOMEM when allocation fails, EEXIST when the name is already
+ * registered.
+ */
+int
+rte_mp_action_register(const char *name, rte_mp_t action)
+{
+	struct action_entry *entry;
+
+	if (validate_action_name(name))
+		return -1;
+
+	entry = malloc(sizeof(*entry));
+	if (entry == NULL) {
+		rte_errno = ENOMEM;
+		return -1;
+	}
+	/* bounded copy; the name was validated to fit, terminator included */
+	snprintf(entry->action_name, sizeof(entry->action_name), "%s", name);
+	entry->action = action;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	if (find_action_entry_by_name(name) != NULL) {
+		pthread_mutex_unlock(&mp_mutex_action);
+		rte_errno = EEXIST;
+		free(entry);
+		return -1;
+	}
+	TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+	return 0;
+}
+
+/*
+ * Remove the action registered under the given name, if any.
+ *
+ * An invalid name or a name that is not registered is silently ignored.
+ * The entry is unlinked under mp_mutex_action and freed after the lock
+ * has been released.
+ */
+void
+rte_mp_action_unregister(const char *name)
+{
+	struct action_entry *victim;
+
+	if (validate_action_name(name) != 0)
+		return;
+
+	pthread_mutex_lock(&mp_mutex_action);
+	victim = find_action_entry_by_name(name);
+	if (victim != NULL)
+		TAILQ_REMOVE(&action_entry_list, victim, next);
+	pthread_mutex_unlock(&mp_mutex_action);
+
+	/* free(NULL) is a no-op, so the not-found case needs no branch */
+	free(victim);
+}
+
+/*
+ * Receive one datagram from mp_fd into msg.
+ *
+ * The payload read from the socket is every field of struct rte_mp_msg
+ * except the trailing fds array; descriptors arrive separately as
+ * SCM_RIGHTS ancillary data and are copied into msg->fds afterwards.
+ *
+ * Returns 0 on success, -1 when recvmsg() fails or when the datagram does
+ * not have exactly the expected size or was truncated.
+ */
+static int
+read_msg(struct rte_mp_msg *msg)
+{
+ int msglen;
+ struct iovec iov;
+ struct msghdr msgh;
+ /* room for the largest possible set of descriptors */
+ char control[CMSG_SPACE(sizeof(msg->fds))];
+ struct cmsghdr *cmsg;
+ /* fds is the last member, so this is the size of everything before it */
+ int buflen = sizeof(*msg) - sizeof(msg->fds);
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = msg;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ msglen = recvmsg(mp_fd, &msgh, 0);
+ if (msglen < 0) {
+ RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+ return -1;
+ }
+
+ /* reject short/long payloads and truncated payload or control data */
+ if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ RTE_LOG(ERR, EAL, "truncted msg\n");
+ return -1;
+ }
+
+ /* read auxiliary FDs if any */
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ /*
+  * Copies the full fds array size regardless of how many
+  * descriptors were sent; only the first SCM_RIGHTS header
+  * is used.
+  */
+ memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Dispatch a received message to the action registered under msg->name.
+ *
+ * The callback pointer is fetched while holding mp_mutex_action and then
+ * invoked after the lock has been dropped. A missing action or a negative
+ * return from the callback is only logged.
+ */
+static void
+process_msg(struct rte_mp_msg *msg)
+{
+	struct action_entry *found;
+	rte_mp_t handler;
+
+	RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+	pthread_mutex_lock(&mp_mutex_action);
+	found = find_action_entry_by_name(msg->name);
+	handler = (found != NULL) ? found->action : NULL;
+	pthread_mutex_unlock(&mp_mutex_action);
+
+	if (handler == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
+		return;
+	}
+
+	if (handler(msg) < 0)
+		RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
+}
+
+/*
+ * Thread body of the mp channel: receive messages forever and dispatch
+ * each one that was read successfully. Failed reads are skipped.
+ */
+static void *
+mp_handle(void *arg __rte_unused)
+{
+	struct rte_mp_msg incoming;
+
+	for (;;) {
+		if (read_msg(&incoming) != 0)
+			continue;
+		process_msg(&incoming);
+	}
+
+	return NULL;
+}
+
+/*
+ * Create the datagram unix socket of this process and bind it.
+ *
+ * The primary binds to the path returned by eal_mp_socket_path(); any
+ * other process binds to that path suffixed with its pid and the current
+ * TSC value. The descriptor is stored in the global mp_fd.
+ *
+ * Returns the descriptor on success. On failure returns -1 and leaves
+ * mp_fd at -1, so later code never uses a closed descriptor.
+ */
+static int
+open_socket_fd(void)
+{
+	struct sockaddr_un un;
+	const char *prefix = eal_mp_socket_path();
+
+	mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (mp_fd < 0) {
+		RTE_LOG(ERR, EAL, "failed to create unix socket: %s\n",
+			strerror(errno));
+		mp_fd = -1;
+		return -1;
+	}
+
+	memset(&un, 0, sizeof(un));
+	un.sun_family = AF_UNIX;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix);
+	else {
+		snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d_%"PRIx64,
+			 prefix, getpid(), rte_rdtsc());
+	}
+	unlink(un.sun_path); /* May still exist since last run */
+	if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+		/* log before close() so errno still describes bind() */
+		RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+			un.sun_path, strerror(errno));
+		close(mp_fd);
+		mp_fd = -1;
+		return -1;
+	}
+
+	RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
+	return mp_fd;
+}
+
+/*
+ * Remove every entry of the directory mp_dir_path whose file name matches
+ * the given fnmatch() pattern.
+ *
+ * Returns 0 once the directory has been scanned, -1 if it cannot be
+ * opened. Failures of individual unlinkat() calls are ignored.
+ */
+static int
+unlink_sockets(const char *filter)
+{
+ int dir_fd;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+ return -1;
+ }
+ /* unlinkat() below resolves names relative to this directory */
+ dir_fd = dirfd(mp_dir);
+
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(filter, ent->d_name, 0) == 0)
+ unlinkat(dir_fd, ent->d_name, 0);
+ }
+
+ closedir(mp_dir);
+ return 0;
+}
+
+/*
+ * Remove the socket file named by the last component of path.
+ *
+ * Only the base name is used: it is handed to unlink_sockets() as the
+ * match pattern, so the file is looked up inside mp_dir_path. The
+ * "Remove socket" message is logged whether or not a file was removed.
+ */
+static void
+unlink_socket_by_path(const char *path)
+{
+ char *filename;
+ /* basename() is applied to a private copy rather than to path itself */
+ char *fullpath = strdup(path);
+
+ if (!fullpath)
+ return;
+ filename = basename(fullpath);
+ unlink_sockets(filename);
+ free(fullpath);
+ RTE_LOG(INFO, EAL, "Remove socket %s\n", path);
+}
+
+/*
+ * Set up the multi-process channel of this process.
+ *
+ * Computes the socket name filter and the socket directory, removes
+ * leftover sockets matching the filter when running as primary, opens and
+ * binds the local socket, and starts the thread that serves incoming
+ * messages.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+rte_mp_channel_init(void)
+{
+	char thread_name[RTE_MAX_THREAD_NAME_LEN];
+	char *path;
+	pthread_t tid;
+	int ret;
+
+	snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+		 internal_config.hugefile_prefix);
+
+	/* dirname() is applied to a private copy of the socket path */
+	path = strdup(eal_mp_socket_path());
+	if (path == NULL) {
+		RTE_LOG(ERR, EAL, "failed to duplicate mp socket path\n");
+		return -1;
+	}
+	snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+	free(path);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+	    unlink_sockets(mp_filter)) {
+		RTE_LOG(ERR, EAL, "failed to unlink mp sockets\n");
+		return -1;
+	}
+
+	if (open_socket_fd() < 0)
+		return -1;
+
+	/*
+	 * pthread_create() reports failure through a non-zero return value
+	 * (an error number) and does not set errno.
+	 */
+	ret = pthread_create(&tid, NULL, mp_handle, NULL);
+	if (ret != 0) {
+		RTE_LOG(ERR, EAL, "failed to create mp thread: %s\n",
+			strerror(ret));
+		close(mp_fd);
+		mp_fd = -1;
+		return -1;
+	}
+
+	/* try best to set thread name */
+	snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+	rte_thread_setname(tid, thread_name);
+	return 0;
+}
+
+/**
+ * Send one message to the socket bound at dst_path.
+ *
+ * The payload is every field of struct rte_mp_msg except the trailing fds
+ * array; when msg->num_fds is positive the descriptors are attached as
+ * SCM_RIGHTS ancillary data.
+ *
+ * Return -1, as fail to send message and it's caused by the local side.
+ * Return 0, as fail to send message and it's caused by the remote side.
+ * Return 1, as succeed to send message.
+ *
+ * On any failure rte_errno holds the (positive) errno value.
+ */
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+	int snd;
+	int err;
+	struct iovec iov;
+	struct msghdr msgh;
+	struct cmsghdr *cmsg;
+	struct sockaddr_un dst;
+	size_t fd_size;
+	char control[CMSG_SPACE(sizeof(msg->fds))];
+
+	/* never size the ancillary data from an out-of-range count */
+	if (msg->num_fds < 0 || msg->num_fds > RTE_MP_MAX_FD_NUM) {
+		RTE_LOG(ERR, EAL, "Invalid number of FDs: %d\n", msg->num_fds);
+		rte_errno = EINVAL;
+		return -1;
+	}
+	fd_size = msg->num_fds * sizeof(int);
+
+	memset(&dst, 0, sizeof(dst));
+	dst.sun_family = AF_UNIX;
+	snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
+
+	memset(&msgh, 0, sizeof(msgh));
+	memset(control, 0, sizeof(control));
+
+	iov.iov_base = msg;
+	iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+
+	msgh.msg_name = &dst;
+	msgh.msg_namelen = sizeof(dst);
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+
+	/* attach ancillary data only when there are descriptors to pass */
+	if (fd_size > 0) {
+		msgh.msg_control = control;
+		msgh.msg_controllen = CMSG_SPACE(fd_size);
+
+		cmsg = CMSG_FIRSTHDR(&msgh);
+		cmsg->cmsg_len = CMSG_LEN(fd_size);
+		cmsg->cmsg_level = SOL_SOCKET;
+		cmsg->cmsg_type = SCM_RIGHTS;
+		memcpy(CMSG_DATA(cmsg), msg->fds, fd_size);
+	}
+
+	do {
+		snd = sendmsg(mp_fd, &msgh, 0);
+	} while (snd < 0 && errno == EINTR);
+
+	if (snd < 0) {
+		/* keep the value: the calls below may overwrite errno */
+		err = errno;
+		rte_errno = err;
+		/* Check if it caused by peer process exits */
+		if (err == ECONNREFUSED) {
+			/* We don't unlink the primary's socket here */
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				unlink_socket_by_path(dst_path);
+			return 0;
+		}
+		if (err == ENOBUFS) {
+			RTE_LOG(ERR, EAL, "Peer cannot receive message %s\n",
+				dst_path);
+			return 0;
+		}
+		RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n",
+			dst_path, strerror(err));
+		return -1;
+	}
+
+	return 1;
+}
+
+/*
+ * Send msg to the peer process(es).
+ *
+ * A secondary process sends to the primary's socket only. The primary
+ * broadcasts: it scans mp_dir_path and sends to every socket whose file
+ * name matches mp_filter.
+ *
+ * Returns 0 when no send failed for a local reason, -1 otherwise (the
+ * broadcast still visits every matching socket before returning).
+ */
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+	int ret = 0;
+	DIR *mp_dir;
+	struct dirent *ent;
+	char path[PATH_MAX];
+
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		if (send_msg(eal_mp_socket_path(), msg) < 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* broadcast to all secondary processes */
+	mp_dir = opendir(mp_dir_path);
+	if (!mp_dir) {
+		/* save errno first: logging may overwrite it */
+		rte_errno = errno;
+		RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+			mp_dir_path);
+		return -1;
+	}
+	while ((ent = readdir(mp_dir))) {
+		if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+			continue;
+
+		/*
+		 * d_name is only the file name; the peer socket is bound by
+		 * its full path, so rebuild that path before sending.
+		 */
+		snprintf(path, sizeof(path), "%s/%s", mp_dir_path,
+			 ent->d_name);
+		if (send_msg(path, msg) < 0)
+			ret = -1;
+	}
+	closedir(mp_dir);
+
+	return ret;
+}
+
+/*
+ * Validate a message before it is sent.
+ *
+ * Checks that msg is non-NULL, that its name is a valid action name, and
+ * that len_param and num_fds lie within [0, RTE_MP_MAX_PARAM_LEN] and
+ * [0, RTE_MP_MAX_FD_NUM] respectively.
+ *
+ * Returns true when the message is acceptable; otherwise returns false
+ * with rte_errno set to a positive errno value.
+ */
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+	if (msg == NULL) {
+		RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+		rte_errno = EINVAL;
+		return false;
+	}
+
+	if (validate_action_name(msg->name))
+		return false;
+
+	/* both fields are signed; a negative value would mis-size the send */
+	if (msg->len_param < 0 || msg->num_fds < 0) {
+		RTE_LOG(ERR, EAL,
+			"Message length and number of FDs cannot be negative\n");
+		rte_errno = EINVAL;
+		return false;
+	}
+
+	if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+		RTE_LOG(ERR, EAL, "Message data is too long\n");
+		rte_errno = E2BIG;
+		return false;
+	}
+
+	if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+		RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+			RTE_MP_MAX_FD_NUM);
+		rte_errno = E2BIG;
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Validate msg with check_input() and, when it is acceptable, pass it to
+ * mp_send(). Returns -1 when validation fails, otherwise whatever
+ * mp_send() returns.
+ */
+int
+rte_mp_sendmsg(struct rte_mp_msg *msg)
+{
+	if (check_input(msg)) {
+		RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
+		return mp_send(msg);
+	}
+
+	return -1;
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..4708dd5 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

/**
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
return buffer;
}

+/**
+ * Path of primary/secondary communication unix socket file.
+ *
+ * The path is "<directory>/.<hugefile_prefix>_unix". The directory is
+ * default_config_dir, except for a non-root user with HOME set, in which
+ * case HOME is used. The result lives in a static buffer that is
+ * rewritten on every call.
+ */
+#define MP_SOCKET_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_socket_path(void)
+{
+	static char buffer[PATH_MAX]; /* static so auto-zeroed */
+	const char *home_dir = getenv("HOME");
+	const char *directory;
+
+	if (getuid() == 0 || home_dir == NULL)
+		directory = default_config_dir;
+	else
+		directory = home_dir;
+
+	snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT,
+		 directory, internal_config.hugefile_prefix);
+
+	return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..0b28770 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#ifndef _EAL_PRIVATE_H_
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
*/
struct rte_bus *rte_bus_find_by_device_name(const char *str);

+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ * 0 on success;
+ * (<0) on failure.
+ */
+
+int rte_mp_channel_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 2aba2c8..1d42e9c 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#ifndef _RTE_EAL_H_
@@ -186,6 +186,81 @@ int rte_eal_init(int argc, char **argv);
*/
int rte_eal_primary_proc_alive(const char *config_file_path);

+/* Limits of a single multi-process message. */
+#define RTE_MP_MAX_FD_NUM 8 /* The max amount of fds */
+#define RTE_MP_MAX_NAME_LEN 64 /* The max length of action name */
+#define RTE_MP_MAX_PARAM_LEN 256 /* The max length of param */
+/*
+ * Message exchanged between primary and secondary processes.
+ *
+ * name      - action name used by the receiver to find the handler.
+ * len_param - presumably the number of meaningful bytes in param; it is
+ *             validated against RTE_MP_MAX_PARAM_LEN before sending.
+ * num_fds   - number of descriptors in fds to pass to the peer.
+ * param     - opaque payload.
+ * fds       - descriptors to pass; kept as the last member because the
+ *             send/receive code transfers everything before it as the
+ *             payload and the descriptors as ancillary data.
+ */
+struct rte_mp_msg {
+ char name[RTE_MP_MAX_NAME_LEN];
+ int len_param;
+ int num_fds;
+ uint8_t param[RTE_MP_MAX_PARAM_LEN];
+ int fds[RTE_MP_MAX_FD_NUM];
+};
+
+/**
+ * Action function typedef used by other components.
+ *
+ * As we create socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action if the calling component wants
+ * to respond to messages sent by the corresponding component in the
+ * primary process or in secondary processes.
+ *
+ * @param name
+ * The unique key used to find the action. It must be a non-empty string
+ * shorter than RTE_MP_MAX_NAME_LEN.
+ *
+ * @param action
+ * Pointer to the function invoked for each received message carrying
+ * this name.
+ *
+ * @return
+ * - 0 on success.
+ * - (<0) on failure, with the reason stored in rte_errno.
+ */
+int rte_mp_action_register(const char *name, rte_mp_t action);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component no
+ * longer wants to respond to messages sent by the corresponding component
+ * in the primary process or in secondary processes. Unregistering a name
+ * that is not registered has no effect.
+ *
+ * @param name
+ * The unique key that was used to register the action.
+ *
+ */
+void rte_mp_action_unregister(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a message to the peer process.
+ *
+ * The message is handled in the peer process by the action registered
+ * there under the name carried in the message. A secondary process sends
+ * to the primary; the primary sends to every secondary process.
+ *
+ * @param msg
+ * The message to send; its name selects the action in the peer.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_sendmsg(struct rte_mp_msg *msg);
+
/**
* Usage function typedef used by the application usage function.
*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 229eec9..53e29e4 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -1,7 +1,7 @@
/*-
* BSD LICENSE
*
- * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
* Copyright(c) 2012-2014 6WIND S.A.
* All rights reserved.
*
@@ -852,6 +852,14 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ if (rte_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
#ifdef VFIO_PRESENT
if (rte_eal_vfio_setup() < 0) {
rte_eal_init_alert("Cannot init VFIO\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 7088b72..8fd60de 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -217,6 +217,9 @@ EXPERIMENTAL {
rte_eal_devargs_remove;
rte_eal_hotplug_add;
rte_eal_hotplug_remove;
+ rte_mp_action_register;
+ rte_mp_action_unregister;
+ rte_mp_sendmsg;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Burakov, Anatoly

2018-01-26 10:25:45 UTC

Post by Jianfeng Tan
Previously, there are three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.
It'd be good to have a generic communication channel for multi-process
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to response immediately.
This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.
1. rte_eal_mp_action_register() is used to register an action,
indexed by a string, when a component at receiver side would like
to respond to the messages from the peer process.
2. rte_eal_mp_action_unregister() is used to unregister the action
if the calling component does not want to response the messages.
3. rte_eal_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.
---

<snip>

Post by Jianfeng Tan
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+ int ret = 0;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ if (send_msg(eal_mp_socket_path(), msg) < 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ /* broadcast to all secondary processes */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+ mp_dir_path);
+ rte_errno = errno;
+ return -1;
+ }
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ if (send_msg(ent->d_name, msg) < 0)
+ ret = -1;
+ }
+ closedir(mp_dir);
+
+ return ret;

Nitpick: you probably don't need ret here, just return 0 as in other places.

Post by Jianfeng Tan
+}
+
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+ if (msg == NULL) {
+ RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+ rte_errno = -EINVAL;
+ return false;
+ }
+
+ if (validate_action_name(msg->name))
+ return false;
+
+ if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+ RTE_LOG(ERR, EAL, "Message data is too long\n");
+ rte_errno = -E2BIG;
+ return false;
+ }
+
+ if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+ RTE_MP_MAX_FD_NUM);
+ rte_errno = -E2BIG;
+ return false;

Otherwise, I'm happy with this patch.

--
Thanks,
Anatoly

Tan, Jianfeng

2018-01-29 06:37:37 UTC

Hi Anatoly,

-----Original Message-----
From: Burakov, Anatoly
Sent: Friday, January 26, 2018 6:26 PM
Subject: Re: [PATCH v6 1/2] eal: add channel for multi-process
communication

Post by Jianfeng Tan
Previously, there are three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.
It'd be good to have a generic communication channel for multi-process
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to response immediately.
This patch proposes to create a communication channel, based on

datagram

Post by Jianfeng Tan
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.
1. rte_eal_mp_action_register() is used to register an action,
indexed by a string, when a component at receiver side would like
to respond to the messages from the peer process.
2. rte_eal_mp_action_unregister() is used to unregister the action
if the calling component does not want to response the messages.
3. rte_eal_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.
---

<snip>

Post by Jianfeng Tan
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+ int ret = 0;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ if (send_msg(eal_mp_socket_path(), msg) < 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ /* broadcast to all secondary processes */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+ mp_dir_path);
+ rte_errno = errno;
+ return -1;
+ }
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ if (send_msg(ent->d_name, msg) < 0)
+ ret = -1;

Here ret is assigned to -1.

Post by Jianfeng Tan
+ }
+ closedir(mp_dir);
+
+ return ret;

Nitpick: you probably don't need ret here, just return 0 as in other places.

We cannot just return 0 as it could be -1 as above comment shows.
The ret variable was introduced to avoid two "closedir()".

Burakov, Anatoly

2018-01-29 09:37:47 UTC

Post by Tan, Jianfeng
Hi Anatoly,

-----Original Message-----
From: Burakov, Anatoly
Sent: Friday, January 26, 2018 6:26 PM
Subject: Re: [PATCH v6 1/2] eal: add channel for multi-process
communication

Post by Jianfeng Tan
Previously, there are three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.
It'd be good to have a generic communication channel for multi-process
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send message request needs the other side to response immediately.
This patch proposes to create a communication channel, based on

datagram

Post by Jianfeng Tan
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.
1. rte_eal_mp_action_register() is used to register an action,
indexed by a string, when a component at receiver side would like
to respond to the messages from the peer process.
2. rte_eal_mp_action_unregister() is used to unregister the action
if the calling component does not want to response the messages.
3. rte_eal_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.
---
+ }
+ closedir(mp_dir);
+
+ return ret;

Nitpick: you probably don't need ret here, just return 0 as in other places.

We cannot just return 0 as it could be -1 as above comment shows.
The ret variable was introduced to avoid two "closedir()".
Thanks,
Jianfeng

Yep you're right, apologies.

--
Thanks,
Anatoly

Jianfeng Tan

2018-01-26 03:41:22 UTC

We need a synchronous mode for multi-process communication,
i.e., one in which the sender blocks waiting for the reply message
after it sends a request to the peer process.

We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.

The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.

sender-process receiver-process
---------------------- ----------------

thread-n
|_rte_eal_mp_request() ----------> mp-thread
|_timedwait() |_process_msg()
|_action()
|_rte_eal_mp_reply()
mp_thread <---------------------|
|_process_msg()
|_signal(send_thread)
thread-m <----------|
|_collect-reply

* A secondary process is only allowed to talk to the primary process.
* If there are multiple secondary processes for the primary process,
it will send request to peer1, collect response from peer1; then
send request to peer2, collect response from peer2, and so on.
* When thread-n is sending request, thread-m of that process can send
request at the same time.
* For pair <action_name, peer>, we guarantee that only one such request
is on the fly.

Suggested-by: Anatoly Burakov <***@intel.com>
Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
Reviewed-by: Anatoly Burakov <***@intel.com>
Acked-by: Konstantin Ananyev <***@intel.com>
---
doc/guides/rel_notes/release_18_02.rst | 2 +
lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 58 +++++++-
lib/librte_eal/rte_eal_version.map | 2 +
4 files changed, 296 insertions(+), 20 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index be6ac99..39425a4 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -160,6 +160,8 @@ New Features

* ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
* ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
+ * ``rte_mp_request`` is for sending a request message and will block until
+ it gets a reply message which is sent from the peer by ``rte_mp_reply``.

API Changes
-----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index aea0829..6ad73f5 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
@@ -46,6 +47,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
static struct action_entry_list action_entry_list =
TAILQ_HEAD_INITIALIZER(action_entry_list);

+enum mp_type {
+ MP_MSG, /* Share message with peers, will not block */
+ MP_REQ, /* Request for information, Will block for a reply */
+ MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+ int type;
+ struct rte_mp_msg msg;
+};
+
+/*
+ * One outstanding synchronous request, linked into sync_requests.requests
+ * while its sender waits for the reply.
+ */
+struct sync_request {
+ TAILQ_ENTRY(sync_request) next;
+ /* set to 1 by the receive path once a reply has been copied to reply */
+ int reply_received;
+ /* peer socket path; matched against the sender address of a reply */
+ char dst[PATH_MAX];
+ /* the request that was sent; its name is matched against the reply */
+ struct rte_mp_msg *request;
+ /* buffer that receives the reply message */
+ struct rte_mp_msg *reply;
+ /* signalled when the reply arrives; the requester waits on it */
+ pthread_cond_t cond;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+ struct sync_request_list requests;
+ pthread_mutex_t lock;
+} sync_requests = {
+ .requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+/*
+ * Find the pending synchronous request addressed to dst whose request
+ * message carries the action name act_name.
+ *
+ * Returns the matching request, or NULL when there is none. The list is
+ * not locked here; the callers in this file hold sync_requests.lock.
+ */
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+	struct sync_request *req;
+
+	TAILQ_FOREACH(req, &sync_requests.requests, next) {
+		if (strcmp(req->dst, dst) != 0)
+			continue;
+		if (strcmp(req->request->name, act_name) == 0)
+			return req;
+	}
+
+	return NULL;
+}
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -149,19 +194,21 @@ rte_mp_action_unregister(const char *name)
}

static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
int msglen;
struct iovec iov;
struct msghdr msgh;
- char control[CMSG_SPACE(sizeof(msg->fds))];
+ char control[CMSG_SPACE(sizeof(m->msg.fds))];
struct cmsghdr *cmsg;
- int buflen = sizeof(*msg) - sizeof(msg->fds);
+ int buflen = sizeof(*m) - sizeof(m->msg.fds);

memset(&msgh, 0, sizeof(msgh));
- iov.iov_base = msg;
+ iov.iov_base = m;
iov.iov_len = buflen;

+ msgh.msg_name = s;
+ msgh.msg_namelen = sizeof(*s);
msgh.msg_iov = &iov;
msgh.msg_iovlen = 1;
msgh.msg_control = control;
@@ -183,7 +230,7 @@ read_msg(struct rte_mp_msg *msg)
cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
if ((cmsg->cmsg_level == SOL_SOCKET) &&
(cmsg->cmsg_type == SCM_RIGHTS)) {
- memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
break;
}
}
@@ -192,12 +239,28 @@ read_msg(struct rte_mp_msg *msg)
}

static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
+ struct sync_request *sync_req;
struct action_entry *entry;
+ struct rte_mp_msg *msg = &m->msg;
rte_mp_t action = NULL;

RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+ if (m->type == MP_REP) {
+ pthread_mutex_lock(&sync_requests.lock);
+ sync_req = find_sync_request(s->sun_path, msg->name);
+ if (sync_req) {
+ memcpy(sync_req->reply, msg, sizeof(*msg));
+ sync_req->reply_received = 1;
+ pthread_cond_signal(&sync_req->cond);
+ } else
+ RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+ pthread_mutex_unlock(&sync_requests.lock);
+ return;
+ }
+
pthread_mutex_lock(&mp_mutex_action);
entry = find_action_entry_by_name(msg->name);
if (entry != NULL)
@@ -206,18 +269,19 @@ process_msg(struct rte_mp_msg *msg)

if (!action)
RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
- else if (action(msg) < 0)
+ else if (action(msg, s->sun_path) < 0)
RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
}

static void *
mp_handle(void *arg __rte_unused)
{
- struct rte_mp_msg msg;
+ struct mp_msg_internal msg;
+ struct sockaddr_un sa;

while (1) {
- if (read_msg(&msg) == 0)
- process_msg(&msg);
+ if (read_msg(&msg, &sa) == 0)
+ process_msg(&msg, &sa);
}

return NULL;
@@ -336,16 +400,20 @@ rte_mp_channel_init(void)
*
*/
static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
{
int snd;
struct iovec iov;
struct msghdr msgh;
struct cmsghdr *cmsg;
struct sockaddr_un dst;
+ struct mp_msg_internal m;
int fd_size = msg->num_fds * sizeof(int);
char control[CMSG_SPACE(fd_size)];

+ m.type = type;
+ memcpy(&m.msg, msg, sizeof(*msg));
+
memset(&dst, 0, sizeof(dst));
dst.sun_family = AF_UNIX;
snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -353,8 +421,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
memset(&msgh, 0, sizeof(msgh));
memset(control, 0, sizeof(control));

- iov.iov_base = msg;
- iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+ iov.iov_base = &m;
+ iov.iov_len = sizeof(m) - sizeof(msg->fds);

msgh.msg_name = &dst;
msgh.msg_namelen = sizeof(dst);
@@ -396,14 +464,17 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
}

static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
{
int ret = 0;
DIR *mp_dir;
struct dirent *ent;

- if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
- if (send_msg(eal_mp_socket_path(), msg) < 0)
+ if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY))
+ peer = eal_mp_socket_path();
+
+ if (peer) {
+ if (send_msg(peer, msg, type) < 0)
return -1;
else
return 0;
@@ -421,11 +492,11 @@ mp_send(struct rte_mp_msg *msg)
if (fnmatch(mp_filter, ent->d_name, 0) != 0)
continue;

- if (send_msg(ent->d_name, msg) < 0)
+ if (send_msg(ent->d_name, msg, type) < 0)
ret = -1;
}
- closedir(mp_dir);

+ closedir(mp_dir);
return ret;
}

@@ -464,5 +535,150 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
return -1;

RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
- return mp_send(msg);
+ return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+ struct rte_mp_reply *reply, const struct timespec *ts)
+{
+ int ret;
+ struct timeval now;
+ struct rte_mp_msg msg, *tmp;
+ struct sync_request sync_req, *exist;
+
+ sync_req.reply_received = 0;
+ strcpy(sync_req.dst, dst);
+ sync_req.request = req;
+ sync_req.reply = &msg;
+ pthread_cond_init(&sync_req.cond, NULL);
+
+ pthread_mutex_lock(&sync_requests.lock);
+ exist = find_sync_request(dst, req->name);
+ if (!exist)
+ TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+ if (exist) {
+ RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+ rte_errno = -EEXIST;
+ return -1;
+ }
+
+ ret = send_msg(dst, req, MP_REQ);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+ dst, req->name);
+ return -1;
+ } else if (ret == 0)
+ return 0;
+
+ reply->nb_sent++;
+
+ pthread_mutex_lock(&sync_requests.lock);
+ do {
+ pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+ /* Check spurious wakeups */
+ if (sync_req.reply_received == 1)
+ break;
+ /* Check if time is out */
+ if (gettimeofday(&now, NULL) < 0)
+ break;
+ if (now.tv_sec < ts->tv_sec)
+ break;
+ else if (now.tv_sec == ts->tv_sec &&
+ now.tv_usec * 1000 < ts->tv_nsec)
+ break;
+ } while (1);
+ /* We got the lock now */
+ TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+
+ if (sync_req.reply_received == 0) {
+ RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+ dst, req->name);
+ rte_errno = -ETIMEDOUT;
+ return -1;
+ }
+
+ tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1));
+ if (!tmp) {
+ RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+ dst, req->name);
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+ memcpy(&tmp[reply->nb_received], &msg, sizeof(msg));
+ reply->msgs = tmp;
+ reply->nb_received++;
+ return 0;
+}
+
+int
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+ const struct timespec *ts)
+{
+ int ret = 0;
+ DIR *mp_dir;
+ struct dirent *ent;
+ struct timeval now;
+ struct timespec end;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+ if (check_input(req) == false)
+ return -1;
+ if (gettimeofday(&now, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Faile to get current time\n");
+ rte_errno = errno;
+ return -1;
+ }
+
+ end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+ end.tv_sec = now.tv_sec + ts->tv_sec +
+ (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+ reply->nb_sent = 0;
+ reply->nb_received = 0;
+ reply->msgs = NULL;
+
+ /* for secondary process, send request to the primary process only */
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+ /* for primary process, broadcast request, and collect reply 1 by 1 */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+ rte_errno = errno;
+ return -1;
+ }
+
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ if (mp_request_one(ent->d_name, req, reply, &end))
+ ret = -1;
+ }
+
+ closedir(mp_dir);
+ return ret;
+}
+
+int
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+ RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+ if (check_input(msg) == false)
+ return -1;
+
+ if (peer == NULL) {
+ RTE_LOG(ERR, EAL, "peer is not specified\n");
+ rte_errno = -EINVAL;
+ return -1;
+ }
+
+ return mp_send(msg, peer, MP_REP);
}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 1d42e9c..9207ad9 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@

#include <stdint.h>
#include <sched.h>
+#include <time.h>

#include <rte_config.h>
#include <rte_per_lcore.h>
@@ -197,13 +198,19 @@ struct rte_mp_msg {
int fds[RTE_MP_MAX_FD_NUM];
};

+struct rte_mp_reply {
+ int nb_sent;
+ int nb_received;
+ struct rte_mp_msg *msgs; /* caller to free */
+};
+
/**
* Action function typedef used by other components.
*
* As we create socket channel for primary/secondary communication, use
* this function typedef to register action for coming messages.
*/
-typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer);

/**
* @warning
@@ -262,6 +269,55 @@ void rte_mp_action_unregister(const char *name);
int rte_mp_sendmsg(struct rte_mp_msg *msg);

/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ * The req argument contains the customized request message.
+ *
+ * @param reply
+ * The reply argument is used to store all the reply messages;
+ * the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ * The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+ const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ * The msg argument contains the customized message.
+ *
+ * @param peer
+ * The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 8fd60de..673e5e5 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,8 @@ EXPERIMENTAL {
rte_mp_action_register;
rte_mp_action_unregister;
rte_mp_sendmsg;
+ rte_mp_request;
+ rte_mp_reply;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Burakov, Anatoly

2018-01-26 10:31:40 UTC

Post by Jianfeng Tan
We need the synchronous way for multi-process communication,
i.e., blockingly waiting for reply message when we send a request
to the peer process.
We add two APIs rte_eal_mp_request() and rte_eal_mp_reply() for
such use case. By invoking rte_eal_mp_request(), a request message
is sent out, and then it waits there for a reply message. The caller
can specify the timeout. And the response messages will be collected
and returned so that the caller can decide how to translate them.
The API rte_eal_mp_reply() is always called by an mp action handler.
Here we add another parameter for rte_eal_mp_t so that the action
handler knows which peer address to reply.
sender-process receiver-process
---------------------- ----------------
thread-n
|_rte_eal_mp_request() ----------> mp-thread
|_timedwait() |_process_msg()
|_action()
|_rte_eal_mp_reply()
mp_thread <---------------------|
|_process_msg()
|_signal(send_thread)
thread-m <----------|
|_collect-reply
* A secondary process is only allowed to talk to the primary process.
* If there are multiple secondary processes for the primary process,
it will send request to peer1, collect response from peer1; then
send request to peer2, collect response from peer2, and so on.
* When thread-n is sending request, thread-m of that process can send
request at the same time.
* For pair <action_name, peer>, we guarantee that only one such request
is on the fly.
---

No further comments from me :)

--
Thanks,
Anatoly

Thomas Monjalon

2018-01-29 23:52:51 UTC

Post by Jianfeng Tan
- Correct the API name issue in rte_eal_version.map.
- Drop the patch 3 on vfio communication (postponed).
- Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
- Add nb_sent and nb_received in struct rte_mp_reply.
- Standardize the return val of sendmsg, request, reply: 0 on sucess,
(-1) on failure.
- If we found an peer error when we send msg in primary, we try to
remove the secondary socket; as there is no sync mechanism there
(cannot do flock like regular file for socket file), we use a more
complex socket name (with tsc in it).
- Some other small changes.

Please, may I ask a last rebase?
The __rte_experimental tag is now required to be added as in this commit:
http://dpdk.org/commit/77b7b81e32e
Thanks

Jianfeng Tan

2018-01-30 06:58:07 UTC

v6->v7:
- Add __rte_experimental tag for new APIs.
- Rebased on master.

v5->v6:
- Correct the API name issue in rte_eal_version.map.

v3->v5:
- Drop the patch 3 on vfio communication (postponed).
- Change names from rte_eal_mp_* -> rte_mp_* as suggested by Thomas.
- Add nb_sent and nb_received in struct rte_mp_reply.
- Standardize the return value of sendmsg, request, reply: 0 on success,
(-1) on failure.
- If we find a peer error when we send a msg in the primary, we try to
remove the secondary socket; as there is no sync mechanism there
(we cannot flock a socket file like a regular file), we use a more
complex socket name (with the TSC in it).
- Some other small changes.

v3->v4:
- Wrong patches are sent out.

v2->v3:
- Add a pre-check for each API.
- Remove the limitation of 8 secondary processes: discard the original
register/unregister mechanism for secondary processes; instead, the primary
discovers secondary processes by scanning the folder for pattern matches.
- The previous implementation used two sockets for msg and request; this version
uses just one socket, and receives all kinds of messages in the mp thread.

v1->v2: (Address comments from Anatoly and Konstantin)
- Use datagram unix socket to supersede stream unix socket + epoll.
- Change the secondary add/del mechanism as now we use connection-less channel.
- Add mp_mutex_action to sync action register/unregister/reference.
- Limit max length of action name to 64B.
- New APIs for synchronous communication: rte_eal_mp_request/rte_eal_mp_reply.
- Formalize the errno handling.
- Some other small issues.

This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Patch 1: address the purpose and how-to;
Patch 2: add a synchronous way for the requests which need an immediate response.

Jianfeng Tan (2):
eal: add channel for multi-process communication
eal: add synchronous multi-process communication

doc/guides/rel_notes/release_18_02.rst | 11 +
lib/librte_eal/bsdapp/eal/eal.c | 10 +-
lib/librte_eal/common/eal_common_proc.c | 655 +++++++++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 19 +-
lib/librte_eal/common/eal_private.h | 12 +-
lib/librte_eal/common/include/rte_eal.h | 138 ++++++-
lib/librte_eal/linuxapp/eal/eal.c | 10 +-
lib/librte_eal/rte_eal_version.map | 5 +
8 files changed, 853 insertions(+), 7 deletions(-)

--
2.7.4

Jianfeng Tan

2018-01-30 06:58:08 UTC

Previously, there were three channels for multi-process
(i.e., primary/secondary) communication.
1. Config-file based channel, in which, the primary process writes
info into a pre-defined config file, and the secondary process
reads the info out.
2. vfio submodule has its own channel based on unix socket for the
secondary process to get container fd and group fd from the
primary process.
3. pdump submodule also has its own channel based on unix socket for
packet dump.

It'd be good to have a generic communication channel for multi-process
communication to accommodate the requirements including:
a. Secondary wants to send info to primary, for example, secondary
would like to send request (about some specific vdev to primary).
b. Sending info at any time, instead of just initialization time.
c. Share FDs with the other side, for vdev like vhost, related FDs
(memory region, kick) should be shared.
d. A send-message request may need the other side to respond immediately.

This patch proposes to create a communication channel, based on datagram
unix socket, for above requirements. Each process will block on a unix
socket waiting for messages from the peers.

Three new APIs are added:

1. rte_mp_action_register() is used to register an action,
indexed by a string, when a component at the receiver side would like
to respond to messages from the peer process.
2. rte_mp_action_unregister() is used to unregister the action
if the calling component no longer wants to respond to the messages.
3. rte_mp_sendmsg() is used to send a message, and returns
immediately. If there are n secondary processes, the primary
process will send n messages.

Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
Reviewed-by: Anatoly Burakov <***@intel.com>
Acked-by: Konstantin Ananyev <***@intel.com>
---
doc/guides/rel_notes/release_18_02.rst | 9 +
lib/librte_eal/bsdapp/eal/eal.c | 10 +-
lib/librte_eal/common/eal_common_proc.c | 439 +++++++++++++++++++++++++++++++-
lib/librte_eal/common/eal_filesystem.h | 19 +-
lib/librte_eal/common/eal_private.h | 12 +-
lib/librte_eal/common/include/rte_eal.h | 80 +++++-
lib/librte_eal/linuxapp/eal/eal.c | 10 +-
lib/librte_eal/rte_eal_version.map | 3 +
8 files changed, 575 insertions(+), 7 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 8c3968e..0531f59 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -160,6 +160,15 @@ New Features
renamed the application from SW PMD specific ``eventdev_pipeline_sw_pmd``
to PMD agnostic ``eventdev_pipeline``.

+* **Added new multi-process communication channel**
+
+ Added a generic channel in EAL for multi-process (primary/secondary) communication.
+ Consumers of this channel need to register an action with an action name to respond to
+ a received message; the actions will be identified by the action name and executed
+ in the context of a new dedicated thread for this channel. The list of new APIs:
+
+ * ``rte_mp_action_register`` and ``rte_mp_action_unregister`` are for action (un)registration.
+ * ``rte_mp_sendmsg`` is for sending a message without blocking for a response.

API Changes
-----------
diff --git a/lib/librte_eal/bsdapp/eal/eal.c b/lib/librte_eal/bsdapp/eal/eal.c
index 0bac6cf..ba1811a 100644
--- a/lib/librte_eal/bsdapp/eal/eal.c
+++ b/lib/librte_eal/bsdapp/eal/eal.c
@@ -1,7 +1,7 @@
/*-
* BSD LICENSE
*
- * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
* Copyright(c) 2014 6WIND S.A.
* All rights reserved.
*
@@ -604,6 +604,14 @@ rte_eal_init(int argc, char **argv)

rte_config_init();

+ if (rte_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
if (rte_eal_memory_init() < 0) {
rte_eal_init_alert("Cannot init memory\n");
rte_errno = ENOMEM;
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index 40fa982..f63c9c2 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -1,15 +1,51 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2016 Intel Corporation
+ * Copyright(c) 2016-2018 Intel Corporation
*/

-#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
#include <fcntl.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_log.h>

+#include "eal_private.h"
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"

+static int mp_fd = -1;
+static char mp_filter[PATH_MAX]; /* Filter for secondary process sockets */
+static char mp_dir_path[PATH_MAX]; /* The directory path for all mp sockets */
+static pthread_mutex_t mp_mutex_action = PTHREAD_MUTEX_INITIALIZER;
+
+struct action_entry {
+ TAILQ_ENTRY(action_entry) next;
+ char action_name[RTE_MP_MAX_NAME_LEN];
+ rte_mp_t action;
+};
+
+/** Double linked list of actions. */
+TAILQ_HEAD(action_entry_list, action_entry);
+
+static struct action_entry_list action_entry_list =
+ TAILQ_HEAD_INITIALIZER(action_entry_list);
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -31,3 +67,402 @@ rte_eal_primary_proc_alive(const char *config_file_path)

return !!ret;
}
+
+static struct action_entry *
+find_action_entry_by_name(const char *name)
+{
+ struct action_entry *entry;
+
+ TAILQ_FOREACH(entry, &action_entry_list, next) {
+ if (strncmp(entry->action_name, name, RTE_MP_MAX_NAME_LEN) == 0)
+ break;
+ }
+
+ return entry;
+}
+
+static int
+validate_action_name(const char *name)
+{
+ if (name == NULL) {
+ RTE_LOG(ERR, EAL, "Action name cannot be NULL\n");
+ rte_errno = -EINVAL;
+ return -1;
+ }
+ if (strnlen(name, RTE_MP_MAX_NAME_LEN) == 0) {
+ RTE_LOG(ERR, EAL, "Length of action name is zero\n");
+ rte_errno = -EINVAL;
+ return -1;
+ }
+ if (strnlen(name, RTE_MP_MAX_NAME_LEN) == RTE_MP_MAX_NAME_LEN) {
+ rte_errno = -E2BIG;
+ return -1;
+ }
+ return 0;
+}
+
+int __rte_experimental
+rte_mp_action_register(const char *name, rte_mp_t action)
+{
+ struct action_entry *entry;
+
+ if (validate_action_name(name))
+ return -1;
+
+ entry = malloc(sizeof(struct action_entry));
+ if (entry == NULL) {
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+ strcpy(entry->action_name, name);
+ entry->action = action;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ if (find_action_entry_by_name(name) != NULL) {
+ pthread_mutex_unlock(&mp_mutex_action);
+ rte_errno = -EEXIST;
+ free(entry);
+ return -1;
+ }
+ TAILQ_INSERT_TAIL(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ return 0;
+}
+
+void __rte_experimental
+rte_mp_action_unregister(const char *name)
+{
+ struct action_entry *entry;
+
+ if (validate_action_name(name))
+ return;
+
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(name);
+ if (entry == NULL) {
+ pthread_mutex_unlock(&mp_mutex_action);
+ return;
+ }
+ TAILQ_REMOVE(&action_entry_list, entry, next);
+ pthread_mutex_unlock(&mp_mutex_action);
+ free(entry);
+}
+
+static int
+read_msg(struct rte_mp_msg *msg)
+{
+ int msglen;
+ struct iovec iov;
+ struct msghdr msgh;
+ char control[CMSG_SPACE(sizeof(msg->fds))];
+ struct cmsghdr *cmsg;
+ int buflen = sizeof(*msg) - sizeof(msg->fds);
+
+ memset(&msgh, 0, sizeof(msgh));
+ iov.iov_base = msg;
+ iov.iov_len = buflen;
+
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ msglen = recvmsg(mp_fd, &msgh, 0);
+ if (msglen < 0) {
+ RTE_LOG(ERR, EAL, "recvmsg failed, %s\n", strerror(errno));
+ return -1;
+ }
+
+ if (msglen != buflen || (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+ RTE_LOG(ERR, EAL, "truncted msg\n");
+ return -1;
+ }
+
+ /* read auxiliary FDs if any */
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if ((cmsg->cmsg_level == SOL_SOCKET) &&
+ (cmsg->cmsg_type == SCM_RIGHTS)) {
+ memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void
+process_msg(struct rte_mp_msg *msg)
+{
+ struct action_entry *entry;
+ rte_mp_t action = NULL;
+
+ RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+ pthread_mutex_lock(&mp_mutex_action);
+ entry = find_action_entry_by_name(msg->name);
+ if (entry != NULL)
+ action = entry->action;
+ pthread_mutex_unlock(&mp_mutex_action);
+
+ if (!action)
+ RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
+ else if (action(msg) < 0)
+ RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
+}
+
+static void *
+mp_handle(void *arg __rte_unused)
+{
+ struct rte_mp_msg msg;
+
+ while (1) {
+ if (read_msg(&msg) == 0)
+ process_msg(&msg);
+ }
+
+ return NULL;
+}
+
+static int
+open_socket_fd(void)
+{
+ struct sockaddr_un un;
+ const char *prefix = eal_mp_socket_path();
+
+ mp_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+ if (mp_fd < 0) {
+ RTE_LOG(ERR, EAL, "failed to create unix socket\n");
+ return -1;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ snprintf(un.sun_path, sizeof(un.sun_path), "%s", prefix);
+ else {
+ snprintf(un.sun_path, sizeof(un.sun_path), "%s_%d_%"PRIx64,
+ prefix, getpid(), rte_rdtsc());
+ }
+ unlink(un.sun_path); /* May still exist since last run */
+ if (bind(mp_fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+ RTE_LOG(ERR, EAL, "failed to bind %s: %s\n",
+ un.sun_path, strerror(errno));
+ close(mp_fd);
+ return -1;
+ }
+
+ RTE_LOG(INFO, EAL, "Multi-process socket %s\n", un.sun_path);
+ return mp_fd;
+}
+
+static int
+unlink_sockets(const char *filter)
+{
+ int dir_fd;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+ return -1;
+ }
+ dir_fd = dirfd(mp_dir);
+
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(filter, ent->d_name, 0) == 0)
+ unlinkat(dir_fd, ent->d_name, 0);
+ }
+
+ closedir(mp_dir);
+ return 0;
+}
+
+static void
+unlink_socket_by_path(const char *path)
+{
+ char *filename;
+ char *fullpath = strdup(path);
+
+ if (!fullpath)
+ return;
+ filename = basename(fullpath);
+ unlink_sockets(filename);
+ free(fullpath);
+ RTE_LOG(INFO, EAL, "Remove socket %s\n", path);
+}
+
+int
+rte_mp_channel_init(void)
+{
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ char *path;
+ pthread_t tid;
+
+ snprintf(mp_filter, PATH_MAX, ".%s_unix_*",
+ internal_config.hugefile_prefix);
+
+ path = strdup(eal_mp_socket_path());
+ snprintf(mp_dir_path, PATH_MAX, "%s", dirname(path));
+ free(path);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ unlink_sockets(mp_filter)) {
+ RTE_LOG(ERR, EAL, "failed to unlink mp sockets\n");
+ return -1;
+ }
+
+ if (open_socket_fd() < 0)
+ return -1;
+
+ if (pthread_create(&tid, NULL, mp_handle, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "failed to create mp thead: %s\n",
+ strerror(errno));
+ close(mp_fd);
+ mp_fd = -1;
+ return -1;
+ }
+
+ /* try best to set thread name */
+ snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "rte_mp_handle");
+ rte_thread_setname(tid, thread_name);
+ return 0;
+}
+
+/**
+ * Return -1, as fail to send message and it's caused by the local side.
+ * Return 0, as fail to send message and it's caused by the remote side.
+ * Return 1, as succeed to send message.
+ *
+ */
+static int
+send_msg(const char *dst_path, struct rte_mp_msg *msg)
+{
+ int snd;
+ struct iovec iov;
+ struct msghdr msgh;
+ struct cmsghdr *cmsg;
+ struct sockaddr_un dst;
+ int fd_size = msg->num_fds * sizeof(int);
+ char control[CMSG_SPACE(fd_size)];
+
+ memset(&dst, 0, sizeof(dst));
+ dst.sun_family = AF_UNIX;
+ snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
+
+ memset(&msgh, 0, sizeof(msgh));
+ memset(control, 0, sizeof(control));
+
+ iov.iov_base = msg;
+ iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+
+ msgh.msg_name = &dst;
+ msgh.msg_namelen = sizeof(dst);
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
+
+ cmsg = CMSG_FIRSTHDR(&msgh);
+ cmsg->cmsg_len = CMSG_LEN(fd_size);
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(cmsg), msg->fds, fd_size);
+
+ do {
+ snd = sendmsg(mp_fd, &msgh, 0);
+ } while (snd < 0 && errno == EINTR);
+
+ if (snd < 0) {
+ rte_errno = errno;
+ /* Check if it caused by peer process exits */
+ if (errno == -ECONNREFUSED) {
+ /* We don't unlink the primary's socket here */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ unlink_socket_by_path(dst_path);
+ return 0;
+ }
+ if (errno == -ENOBUFS) {
+ RTE_LOG(ERR, EAL, "Peer cannot receive message %s\n",
+ dst_path);
+ return 0;
+ }
+ RTE_LOG(ERR, EAL, "failed to send to (%s) due to %s\n",
+ dst_path, strerror(errno));
+ return -1;
+ }
+
+ return 1;
+}
+
+static int
+mp_send(struct rte_mp_msg *msg)
+{
+ int ret = 0;
+ DIR *mp_dir;
+ struct dirent *ent;
+
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ if (send_msg(eal_mp_socket_path(), msg) < 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ /* broadcast to all secondary processes */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n",
+ mp_dir_path);
+ rte_errno = errno;
+ return -1;
+ }
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ if (send_msg(ent->d_name, msg) < 0)
+ ret = -1;
+ }
+ closedir(mp_dir);
+
+ return ret;
+}
+
+static bool
+check_input(const struct rte_mp_msg *msg)
+{
+ if (msg == NULL) {
+ RTE_LOG(ERR, EAL, "Msg cannot be NULL\n");
+ rte_errno = -EINVAL;
+ return false;
+ }
+
+ if (validate_action_name(msg->name))
+ return false;
+
+ if (msg->len_param > RTE_MP_MAX_PARAM_LEN) {
+ RTE_LOG(ERR, EAL, "Message data is too long\n");
+ rte_errno = -E2BIG;
+ return false;
+ }
+
+ if (msg->num_fds > RTE_MP_MAX_FD_NUM) {
+ RTE_LOG(ERR, EAL, "Cannot send more than %d FDs\n",
+ RTE_MP_MAX_FD_NUM);
+ rte_errno = -E2BIG;
+ return false;
+ }
+
+ return true;
+}
+
+int __rte_experimental
+rte_mp_sendmsg(struct rte_mp_msg *msg)
+{
+ if (!check_input(msg))
+ return -1;
+
+ RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
+ return mp_send(msg);
+}
diff --git a/lib/librte_eal/common/eal_filesystem.h b/lib/librte_eal/common/eal_filesystem.h
index e8959eb..4708dd5 100644
--- a/lib/librte_eal/common/eal_filesystem.h
+++ b/lib/librte_eal/common/eal_filesystem.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

/**
@@ -38,6 +38,23 @@ eal_runtime_config_path(void)
return buffer;
}

+/** Path of primary/secondary communication unix socket file. */
+#define MP_SOCKET_PATH_FMT "%s/.%s_unix"
+static inline const char *
+eal_mp_socket_path(void)
+{
+ static char buffer[PATH_MAX]; /* static so auto-zeroed */
+ const char *directory = default_config_dir;
+ const char *home_dir = getenv("HOME");
+
+ if (getuid() != 0 && home_dir != NULL)
+ directory = home_dir;
+ snprintf(buffer, sizeof(buffer) - 1, MP_SOCKET_PATH_FMT,
+ directory, internal_config.hugefile_prefix);
+
+ return buffer;
+}
+
/** Path of hugepage info file. */
#define HUGEPAGE_INFO_FMT "%s/.%s_hugepage_info"

diff --git a/lib/librte_eal/common/eal_private.h b/lib/librte_eal/common/eal_private.h
index c46dd8f..0b28770 100644
--- a/lib/librte_eal/common/eal_private.h
+++ b/lib/librte_eal/common/eal_private.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#ifndef _EAL_PRIVATE_H_
@@ -195,4 +195,14 @@ int rte_eal_hugepage_attach(void);
*/
struct rte_bus *rte_bus_find_by_device_name(const char *str);

+/**
+ * Create the unix channel for primary/secondary communication.
+ *
+ * @return
+ * 0 on success;
+ * (<0) on failure.
+ */
+
+int rte_mp_channel_init(void);
+
#endif /* _EAL_PRIVATE_H_ */
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 1f37c7a..2d022c0 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2016 Intel Corporation
+ * Copyright(c) 2010-2018 Intel Corporation
*/

#ifndef _RTE_EAL_H_
@@ -203,6 +203,84 @@ int __rte_experimental rte_eal_cleanup(void);
*/
int rte_eal_primary_proc_alive(const char *config_file_path);

+#define RTE_MP_MAX_FD_NUM 8 /* The max amount of fds */
+#define RTE_MP_MAX_NAME_LEN 64 /* The max length of action name */
+#define RTE_MP_MAX_PARAM_LEN 256 /* The max length of param */
+struct rte_mp_msg {
+ char name[RTE_MP_MAX_NAME_LEN];
+ int len_param;
+ int num_fds;
+ uint8_t param[RTE_MP_MAX_PARAM_LEN];
+ int fds[RTE_MP_MAX_FD_NUM];
+};
+
+/**
+ * Action function typedef used by other components.
+ *
+ * As we create socket channel for primary/secondary communication, use
+ * this function typedef to register action for coming messages.
+ */
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Register an action function for primary/secondary communication.
+ *
+ * Call this function to register an action, if the calling component wants
+ * to response the messages from the corresponding component in its primary
+ * process or secondary processes.
+ *
+ * @param name
+ * The name argument plays as the nonredundant key to find the action.
+ *
+ * @param action
+ * The action argument is the function pointer to the action function.
+ *
+ * @return
+ * - 0 on success.
+ * - (<0) on failure.
+ */
+int __rte_experimental
+rte_mp_action_register(const char *name, rte_mp_t action);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Unregister an action function for primary/secondary communication.
+ *
+ * Call this function to unregister an action if the calling component does
+ * not want to response the messages from the corresponding component in its
+ * primary process or secondary processes.
+ *
+ * @param name
+ * The name argument plays as the nonredundant key to find the action.
+ *
+ */
+void __rte_experimental
+rte_mp_action_unregister(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a message to the peer process.
+ *
+ * This function will send a message which will be responsed by the action
+ * identified by name in the peer process.
+ *
+ * @param msg
+ * The msg argument contains the customized message.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int __rte_experimental
+rte_mp_sendmsg(struct rte_mp_msg *msg);
+
/**
* Usage function typedef used by the application usage function.
*
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 828baac..66f7585 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -1,7 +1,7 @@
/*-
* BSD LICENSE
*
- * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ * Copyright(c) 2010-2018 Intel Corporation. All rights reserved.
* Copyright(c) 2012-2014 6WIND S.A.
* All rights reserved.
*
@@ -853,6 +853,14 @@ rte_eal_init(int argc, char **argv)
return -1;
}

+ if (rte_mp_channel_init() < 0) {
+ rte_eal_init_alert("failed to init mp channel\n");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
#ifdef VFIO_PRESENT
if (rte_eal_vfio_setup() < 0) {
rte_eal_init_alert("Cannot init VFIO\n");
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 93f6c13..24deaef 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -220,6 +220,9 @@ EXPERIMENTAL {
rte_eal_devargs_remove;
rte_eal_hotplug_add;
rte_eal_hotplug_remove;
+ rte_mp_action_register;
+ rte_mp_action_unregister;
+ rte_mp_sendmsg;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Jianfeng Tan

2018-01-30 06:58:09 UTC

We need a synchronous way of doing multi-process communication,
i.e., blocking while waiting for a reply message after we send a
request to the peer process.

We add two APIs, rte_mp_request() and rte_mp_reply(), for
this use case. By invoking rte_mp_request(), a request message
is sent out, and the caller then waits for a reply message. The caller
can specify the timeout. The reply messages are collected
and returned so that the caller can decide how to interpret them.

The API rte_mp_reply() is always called by an mp action handler.
Here we add another parameter to rte_mp_t so that the action
handler knows which peer address to reply to.

    sender-process                    receiver-process
    ----------------------            ----------------

    thread-n
     |_rte_mp_request() ----------> mp-thread
        |_timedwait()                |_process_msg()
                                       |_action()
                                           |_rte_mp_reply()
            mp_thread  <---------------------|
              |_process_msg()
                 |_signal(send_thread)
    thread-m <----------|
     |_collect-reply

* A secondary process is only allowed to talk to the primary process.
* If there are multiple secondary processes for the primary process,
the primary will send the request to peer1 and collect the response from
peer1; then send the request to peer2 and collect the response from peer2,
and so on.
* While thread-n is sending a request, thread-m of the same process can
send a request at the same time.
* For each pair <action_name, peer>, we guarantee that only one such
request is in flight.

Suggested-by: Anatoly Burakov <***@intel.com>
Suggested-by: Konstantin Ananyev <***@intel.com>
Signed-off-by: Jianfeng Tan <***@intel.com>
Reviewed-by: Anatoly Burakov <***@intel.com>
Acked-by: Konstantin Ananyev <***@intel.com>
---
doc/guides/rel_notes/release_18_02.rst | 2 +
lib/librte_eal/common/eal_common_proc.c | 254 +++++++++++++++++++++++++++++---
lib/librte_eal/common/include/rte_eal.h | 60 +++++++-
lib/librte_eal/rte_eal_version.map | 2 +
4 files changed, 298 insertions(+), 20 deletions(-)

diff --git a/doc/guides/rel_notes/release_18_02.rst b/doc/guides/rel_notes/release_18_02.rst
index 0531f59..bb8559b 100644
--- a/doc/guides/rel_notes/release_18_02.rst
+++ b/doc/guides/rel_notes/release_18_02.rst
@@ -169,6 +169,8 @@ New Features

* ``rte_mp_register`` and ``rte_mp_unregister`` are for action (un)registration.
* ``rte_mp_sendmsg`` is for sending a message without blocking for a response.
+ * ``rte_mp_request`` is for sending a request message and will block until
+ it gets a reply message which is sent from the peer by ``rte_mp_reply``.

API Changes
-----------
diff --git a/lib/librte_eal/common/eal_common_proc.c b/lib/librte_eal/common/eal_common_proc.c
index f63c9c2..b974837 100644
--- a/lib/librte_eal/common/eal_common_proc.c
+++ b/lib/librte_eal/common/eal_common_proc.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
@@ -46,6 +47,50 @@ TAILQ_HEAD(action_entry_list, action_entry);
static struct action_entry_list action_entry_list =
TAILQ_HEAD_INITIALIZER(action_entry_list);

+enum mp_type {
+ MP_MSG, /* Share message with peers, will not block */
+ MP_REQ, /* Request for information, Will block for a reply */
+ MP_REP, /* Response to previously-received request */
+};
+
+struct mp_msg_internal {
+ int type;
+ struct rte_mp_msg msg;
+};
+
+struct sync_request {
+ TAILQ_ENTRY(sync_request) next;
+ int reply_received;
+ char dst[PATH_MAX];
+ struct rte_mp_msg *request;
+ struct rte_mp_msg *reply;
+ pthread_cond_t cond;
+};
+
+TAILQ_HEAD(sync_request_list, sync_request);
+
+static struct {
+ struct sync_request_list requests;
+ pthread_mutex_t lock;
+} sync_requests = {
+ .requests = TAILQ_HEAD_INITIALIZER(sync_requests.requests),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+static struct sync_request *
+find_sync_request(const char *dst, const char *act_name)
+{
+ struct sync_request *r;
+
+ TAILQ_FOREACH(r, &sync_requests.requests, next) {
+ if (!strcmp(r->dst, dst) &&
+ !strcmp(r->request->name, act_name))
+ break;
+ }
+
+ return r;
+}
+
int
rte_eal_primary_proc_alive(const char *config_file_path)
{
@@ -149,19 +194,21 @@ rte_mp_action_unregister(const char *name)
}

static int
-read_msg(struct rte_mp_msg *msg)
+read_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
int msglen;
struct iovec iov;
struct msghdr msgh;
- char control[CMSG_SPACE(sizeof(msg->fds))];
+ char control[CMSG_SPACE(sizeof(m->msg.fds))];
struct cmsghdr *cmsg;
- int buflen = sizeof(*msg) - sizeof(msg->fds);
+ int buflen = sizeof(*m) - sizeof(m->msg.fds);

memset(&msgh, 0, sizeof(msgh));
- iov.iov_base = msg;
+ iov.iov_base = m;
iov.iov_len = buflen;

+ msgh.msg_name = s;
+ msgh.msg_namelen = sizeof(*s);
msgh.msg_iov = &iov;
msgh.msg_iovlen = 1;
msgh.msg_control = control;
@@ -183,7 +230,7 @@ read_msg(struct rte_mp_msg *msg)
cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
if ((cmsg->cmsg_level == SOL_SOCKET) &&
(cmsg->cmsg_type == SCM_RIGHTS)) {
- memcpy(msg->fds, CMSG_DATA(cmsg), sizeof(msg->fds));
+ memcpy(m->msg.fds, CMSG_DATA(cmsg), sizeof(m->msg.fds));
break;
}
}
@@ -192,12 +239,28 @@ read_msg(struct rte_mp_msg *msg)
}

static void
-process_msg(struct rte_mp_msg *msg)
+process_msg(struct mp_msg_internal *m, struct sockaddr_un *s)
{
+ struct sync_request *sync_req;
struct action_entry *entry;
+ struct rte_mp_msg *msg = &m->msg;
rte_mp_t action = NULL;

RTE_LOG(DEBUG, EAL, "msg: %s\n", msg->name);
+
+ if (m->type == MP_REP) {
+ pthread_mutex_lock(&sync_requests.lock);
+ sync_req = find_sync_request(s->sun_path, msg->name);
+ if (sync_req) {
+ memcpy(sync_req->reply, msg, sizeof(*msg));
+ sync_req->reply_received = 1;
+ pthread_cond_signal(&sync_req->cond);
+ } else
+ RTE_LOG(ERR, EAL, "Drop mp reply: %s\n", msg->name);
+ pthread_mutex_unlock(&sync_requests.lock);
+ return;
+ }
+
pthread_mutex_lock(&mp_mutex_action);
entry = find_action_entry_by_name(msg->name);
if (entry != NULL)
@@ -206,18 +269,19 @@ process_msg(struct rte_mp_msg *msg)

if (!action)
RTE_LOG(ERR, EAL, "Cannot find action: %s\n", msg->name);
- else if (action(msg) < 0)
+ else if (action(msg, s->sun_path) < 0)
RTE_LOG(ERR, EAL, "Fail to handle message: %s\n", msg->name);
}

static void *
mp_handle(void *arg __rte_unused)
{
- struct rte_mp_msg msg;
+ struct mp_msg_internal msg;
+ struct sockaddr_un sa;

while (1) {
- if (read_msg(&msg) == 0)
- process_msg(&msg);
+ if (read_msg(&msg, &sa) == 0)
+ process_msg(&msg, &sa);
}

return NULL;
@@ -336,16 +400,20 @@ rte_mp_channel_init(void)
*
*/
static int
-send_msg(const char *dst_path, struct rte_mp_msg *msg)
+send_msg(const char *dst_path, struct rte_mp_msg *msg, int type)
{
int snd;
struct iovec iov;
struct msghdr msgh;
struct cmsghdr *cmsg;
struct sockaddr_un dst;
+ struct mp_msg_internal m;
int fd_size = msg->num_fds * sizeof(int);
char control[CMSG_SPACE(fd_size)];

+ m.type = type;
+ memcpy(&m.msg, msg, sizeof(*msg));
+
memset(&dst, 0, sizeof(dst));
dst.sun_family = AF_UNIX;
snprintf(dst.sun_path, sizeof(dst.sun_path), "%s", dst_path);
@@ -353,8 +421,8 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
memset(&msgh, 0, sizeof(msgh));
memset(control, 0, sizeof(control));

- iov.iov_base = msg;
- iov.iov_len = sizeof(*msg) - sizeof(msg->fds);
+ iov.iov_base = &m;
+ iov.iov_len = sizeof(m) - sizeof(msg->fds);

msgh.msg_name = &dst;
msgh.msg_namelen = sizeof(dst);
@@ -396,14 +464,17 @@ send_msg(const char *dst_path, struct rte_mp_msg *msg)
}

static int
-mp_send(struct rte_mp_msg *msg)
+mp_send(struct rte_mp_msg *msg, const char *peer, int type)
{
int ret = 0;
DIR *mp_dir;
struct dirent *ent;

- if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
- if (send_msg(eal_mp_socket_path(), msg) < 0)
+ if (!peer && (rte_eal_process_type() == RTE_PROC_SECONDARY))
+ peer = eal_mp_socket_path();
+
+ if (peer) {
+ if (send_msg(peer, msg, type) < 0)
return -1;
else
return 0;
@@ -421,11 +492,11 @@ mp_send(struct rte_mp_msg *msg)
if (fnmatch(mp_filter, ent->d_name, 0) != 0)
continue;

- if (send_msg(ent->d_name, msg) < 0)
+ if (send_msg(ent->d_name, msg, type) < 0)
ret = -1;
}
- closedir(mp_dir);

+ closedir(mp_dir);
return ret;
}

@@ -464,5 +535,150 @@ rte_mp_sendmsg(struct rte_mp_msg *msg)
return -1;

RTE_LOG(DEBUG, EAL, "sendmsg: %s\n", msg->name);
- return mp_send(msg);
+ return mp_send(msg, NULL, MP_MSG);
+}
+
+static int
+mp_request_one(const char *dst, struct rte_mp_msg *req,
+ struct rte_mp_reply *reply, const struct timespec *ts)
+{
+ int ret;
+ struct timeval now;
+ struct rte_mp_msg msg, *tmp;
+ struct sync_request sync_req, *exist;
+
+ sync_req.reply_received = 0;
+ strcpy(sync_req.dst, dst);
+ sync_req.request = req;
+ sync_req.reply = &msg;
+ pthread_cond_init(&sync_req.cond, NULL);
+
+ pthread_mutex_lock(&sync_requests.lock);
+ exist = find_sync_request(dst, req->name);
+ if (!exist)
+ TAILQ_INSERT_TAIL(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+ if (exist) {
+ RTE_LOG(ERR, EAL, "A pending request %s:%s\n", dst, req->name);
+ rte_errno = -EEXIST;
+ return -1;
+ }
+
+ ret = send_msg(dst, req, MP_REQ);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Fail to send request %s:%s\n",
+ dst, req->name);
+ return -1;
+ } else if (ret == 0)
+ return 0;
+
+ reply->nb_sent++;
+
+ pthread_mutex_lock(&sync_requests.lock);
+ do {
+ pthread_cond_timedwait(&sync_req.cond, &sync_requests.lock, ts);
+ /* Check spurious wakeups */
+ if (sync_req.reply_received == 1)
+ break;
+ /* Check if time is out */
+ if (gettimeofday(&now, NULL) < 0)
+ break;
+ if (now.tv_sec < ts->tv_sec)
+ break;
+ else if (now.tv_sec == ts->tv_sec &&
+ now.tv_usec * 1000 < ts->tv_nsec)
+ break;
+ } while (1);
+ /* We got the lock now */
+ TAILQ_REMOVE(&sync_requests.requests, &sync_req, next);
+ pthread_mutex_unlock(&sync_requests.lock);
+
+ if (sync_req.reply_received == 0) {
+ RTE_LOG(ERR, EAL, "Fail to recv reply for request %s:%s\n",
+ dst, req->name);
+ rte_errno = -ETIMEDOUT;
+ return -1;
+ }
+
+ tmp = realloc(reply->msgs, sizeof(msg) * (reply->nb_received + 1));
+ if (!tmp) {
+ RTE_LOG(ERR, EAL, "Fail to alloc reply for request %s:%s\n",
+ dst, req->name);
+ rte_errno = -ENOMEM;
+ return -1;
+ }
+ memcpy(&tmp[reply->nb_received], &msg, sizeof(msg));
+ reply->msgs = tmp;
+ reply->nb_received++;
+ return 0;
+}
+
+int __rte_experimental
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+ const struct timespec *ts)
+{
+ int ret = 0;
+ DIR *mp_dir;
+ struct dirent *ent;
+ struct timeval now;
+ struct timespec end;
+
+ RTE_LOG(DEBUG, EAL, "request: %s\n", req->name);
+
+ if (check_input(req) == false)
+ return -1;
+ if (gettimeofday(&now, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Faile to get current time\n");
+ rte_errno = errno;
+ return -1;
+ }
+
+ end.tv_nsec = (now.tv_usec * 1000 + ts->tv_nsec) % 1000000000;
+ end.tv_sec = now.tv_sec + ts->tv_sec +
+ (now.tv_usec * 1000 + ts->tv_nsec) / 1000000000;
+
+ reply->nb_sent = 0;
+ reply->nb_received = 0;
+ reply->msgs = NULL;
+
+ /* for secondary process, send request to the primary process only */
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ return mp_request_one(eal_mp_socket_path(), req, reply, &end);
+
+ /* for primary process, broadcast request, and collect reply 1 by 1 */
+ mp_dir = opendir(mp_dir_path);
+ if (!mp_dir) {
+ RTE_LOG(ERR, EAL, "Unable to open directory %s\n", mp_dir_path);
+ rte_errno = errno;
+ return -1;
+ }
+
+ while ((ent = readdir(mp_dir))) {
+ if (fnmatch(mp_filter, ent->d_name, 0) != 0)
+ continue;
+
+ if (mp_request_one(ent->d_name, req, reply, &end))
+ ret = -1;
+ }
+
+ closedir(mp_dir);
+ return ret;
+}
+
+int __rte_experimental
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+
+ RTE_LOG(DEBUG, EAL, "reply: %s\n", msg->name);
+
+ if (check_input(msg) == false)
+ return -1;
+
+ if (peer == NULL) {
+ RTE_LOG(ERR, EAL, "peer is not specified\n");
+ rte_errno = -EINVAL;
+ return -1;
+ }
+
+ return mp_send(msg, peer, MP_REP);
}
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index 2d022c0..08c6637 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -13,6 +13,7 @@

#include <stdint.h>
#include <sched.h>
+#include <time.h>

#include <rte_config.h>
#include <rte_compat.h>
@@ -214,13 +215,19 @@ struct rte_mp_msg {
int fds[RTE_MP_MAX_FD_NUM];
};

+struct rte_mp_reply {
+ int nb_sent;
+ int nb_received;
+ struct rte_mp_msg *msgs; /* caller to free */
+};
+
/**
* Action function typedef used by other components.
*
* As we create socket channel for primary/secondary communication, use
* this function typedef to register action for coming messages.
*/
-typedef int (*rte_mp_t)(const struct rte_mp_msg *msg);
+typedef int (*rte_mp_t)(const struct rte_mp_msg *msg, const void *peer);

/**
* @warning
@@ -282,6 +289,57 @@ int __rte_experimental
rte_mp_sendmsg(struct rte_mp_msg *msg);

/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a request to the peer process and expect a reply.
+ *
+ * This function sends a request message to the peer process, and will
+ * block until receiving reply message from the peer process.
+ *
+ * @note The caller is responsible for freeing reply->msgs.
+ *
+ * @param req
+ * The req argument contains the customized request message.
+ *
+ * @param reply
+ * The reply argument is used to store all the reply messages;
+ * the caller is responsible for freeing reply->msgs.
+ *
+ * @param ts
+ * The ts argument specifies how long we can wait for the peer(s) to reply.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int __rte_experimental
+rte_mp_request(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+ const struct timespec *ts);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Send a reply to the peer process.
+ *
+ * This function will send a reply message in response to a request message
+ * received previously.
+ *
+ * @param msg
+ * The msg argument contains the customized message.
+ *
+ * @param peer
+ * The peer argument is the pointer to the peer socket path.
+ *
+ * @return
+ * - On success, return 0.
+ * - On failure, return -1, and the reason will be stored in rte_errno.
+ */
+int __rte_experimental
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer);
+
+/**
* Usage function typedef used by the application usage function.
*
* Use this function typedef to define and call rte_set_application_usage_hook()
diff --git a/lib/librte_eal/rte_eal_version.map b/lib/librte_eal/rte_eal_version.map
index 24deaef..4146907 100644
--- a/lib/librte_eal/rte_eal_version.map
+++ b/lib/librte_eal/rte_eal_version.map
@@ -223,6 +223,8 @@ EXPERIMENTAL {
rte_mp_action_register;
rte_mp_action_unregister;
rte_mp_sendmsg;
+ rte_mp_request;
+ rte_mp_reply;
rte_service_attr_get;
rte_service_attr_reset_all;
rte_service_component_register;

--
2.7.4

Thomas Monjalon

2018-01-30 14:46:28 UTC

Post by Jianfeng Tan
This patchset adds a generic channel for multi-process (primary/secondary)
communication.

Applied, thanks

87 Replies
65 Views
Permalink to this page
Disable enhanced parsing

Thread Navigation

Jianfeng Tan 2017-11-30 18:44:07 UTC

Jianfeng Tan 2017-11-30 18:44:08 UTC

Burakov, Anatoly 2017-12-11 11:04:33 UTC

Ananyev, Konstantin 2017-12-11 16:43:08 UTC

Jianfeng Tan 2017-11-30 18:44:09 UTC

Burakov, Anatoly 2017-12-11 11:39:22 UTC

Ananyev, Konstantin 2017-12-11 16:49:18 UTC

Jianfeng Tan 2017-11-30 18:44:10 UTC

Burakov, Anatoly 2017-12-11 12:01:08 UTC

Burakov, Anatoly 2017-12-11 09:59:46 UTC

Tan, Jianfeng 2017-12-12 07:34:03 UTC

Burakov, Anatoly 2017-12-12 16:18:15 UTC

Jianfeng Tan 2018-01-11 04:07:30 UTC

Jianfeng Tan 2018-01-11 04:07:31 UTC

Burakov, Anatoly 2018-01-13 12:57:29 UTC

Ananyev, Konstantin 2018-01-15 19:52:07 UTC

Jianfeng Tan 2018-01-11 04:07:32 UTC

Burakov, Anatoly 2018-01-13 13:11:59 UTC

Ananyev, Konstantin 2018-01-15 21:45:53 UTC

Jianfeng Tan 2018-01-11 04:07:33 UTC

Burakov, Anatoly 2018-01-13 13:41:50 UTC

Ananyev, Konstantin 2018-01-16 00:00:43 UTC

Tan, Jianfeng 2018-01-16 08:10:31 UTC

Ananyev, Konstantin 2018-01-16 11:12:47 UTC

Tan, Jianfeng 2018-01-16 16:47:46 UTC

Ananyev, Konstantin 2018-01-17 10:50:22 UTC

Tan, Jianfeng 2018-01-17 13:09:22 UTC

Tan, Jianfeng 2018-01-17 13:15:53 UTC

Ananyev, Konstantin 2018-01-17 17:20:38 UTC

Jianfeng Tan 2018-01-11 04:07:34 UTC

Burakov, Anatoly 2018-01-13 14:03:07 UTC

Jianfeng Tan 2018-03-04 14:57:36 UTC

Burakov, Anatoly 2018-03-14 13:27:17 UTC

Tan, Jianfeng 2018-03-19 06:53:33 UTC

Burakov, Anatoly 2018-03-20 10:33:00 UTC

Burakov, Anatoly 2018-03-20 10:56:28 UTC

Jianfeng Tan 2018-03-20 08:50:09 UTC

Tan, Jianfeng 2018-04-05 14:26:24 UTC

Burakov, Anatoly 2018-04-05 14:39:29 UTC

Thomas Monjalon 2018-04-12 23:27:34 UTC

Burakov, Anatoly 2018-04-12 15:26:08 UTC

Jianfeng Tan 2018-04-15 15:06:19 UTC

Tan, Jianfeng 2018-04-15 15:10:44 UTC

Thomas Monjalon 2018-04-17 23:04:45 UTC

Jianfeng Tan 2018-01-25 04:16:20 UTC

Jianfeng Tan 2018-01-25 04:16:21 UTC

Thomas Monjalon 2018-01-25 10:41:24 UTC

Burakov, Anatoly 2018-01-25 11:27:35 UTC

Thomas Monjalon 2018-01-25 11:34:46 UTC

Ananyev, Konstantin 2018-01-25 12:21:45 UTC

Jianfeng Tan 2018-01-25 04:16:22 UTC

Burakov, Anatoly 2018-01-25 12:00:12 UTC

Burakov, Anatoly 2018-01-25 12:19:23 UTC

Ananyev, Konstantin 2018-01-25 12:19:23 UTC

Burakov, Anatoly 2018-01-25 12:25:58 UTC

Ananyev, Konstantin 2018-01-25 13:00:25 UTC

Burakov, Anatoly 2018-01-25 13:05:57 UTC

Burakov, Anatoly 2018-01-25 13:10:07 UTC

Ananyev, Konstantin 2018-01-25 15:03:58 UTC

Burakov, Anatoly 2018-01-25 16:22:03 UTC

Tan, Jianfeng 2018-01-25 17:10:34 UTC

Burakov, Anatoly 2018-01-25 18:02:57 UTC

Ananyev, Konstantin 2018-01-25 12:22:32 UTC

Jianfeng Tan 2018-01-25 04:16:23 UTC

Thomas Monjalon 2018-01-25 10:47:20 UTC

Burakov, Anatoly 2018-01-25 10:52:41 UTC

Thomas Monjalon 2018-01-25 10:57:06 UTC

Burakov, Anatoly 2018-01-25 12:15:35 UTC

Jianfeng Tan 2018-01-25 19:14:42 UTC

Jianfeng Tan 2018-01-25 19:14:43 UTC

Jianfeng Tan 2018-01-25 19:14:44 UTC

Tan, Jianfeng 2018-01-25 19:15:53 UTC

Jianfeng Tan 2018-01-25 19:21:08 UTC

Jianfeng Tan 2018-01-25 19:21:09 UTC

Jianfeng Tan 2018-01-25 19:21:10 UTC

Thomas Monjalon 2018-01-25 21:23:03 UTC

Jianfeng Tan 2018-01-26 03:41:20 UTC

Jianfeng Tan 2018-01-26 03:41:21 UTC

Burakov, Anatoly 2018-01-26 10:25:45 UTC

Tan, Jianfeng 2018-01-29 06:37:37 UTC

Burakov, Anatoly 2018-01-29 09:37:47 UTC

Jianfeng Tan 2018-01-26 03:41:22 UTC

Burakov, Anatoly 2018-01-26 10:31:40 UTC

Thomas Monjalon 2018-01-29 23:52:51 UTC

Jianfeng Tan 2018-01-30 06:58:07 UTC

Jianfeng Tan 2018-01-30 06:58:08 UTC

Jianfeng Tan 2018-01-30 06:58:09 UTC

Thomas Monjalon 2018-01-30 14:46:28 UTC

about - legalese

Loading...