Discussion:
[PATCH][RFC] net/bridge: add basic VEPA support
Fischer, Anna
2009-06-15 17:33:10 UTC
This patch adds basic Virtual Ethernet Port Aggregator (VEPA)
capabilities to the Linux kernel Ethernet bridging code.

A Virtual Ethernet Port Aggregator (VEPA) is a capability within
a physical end station that collaborates with an adjacent, external
bridge to provide distributed bridging support between multiple
virtual end stations and external networks. The VEPA collaborates
by forwarding all station-originated frames to the adjacent bridge
for frame processing and frame relay (including so-called 'hairpin'
forwarding) and by steering and replicating frames received from
the VEPA uplink to the appropriate destinations. A VEPA may be
implemented in software or in conjunction with embedded hardware.

In particular, the patch extends the Linux Ethernet bridge to act as
(1) a VEPA - for this we have added VEPA forwarding functionality and
added a configuration option for a VEPA uplink port, or as
(2) a bridge supporting 'hairpin' forwarding - for this we have added a
bridge port 'hairpin' mode which allows sending frames back out
through the port the frame was received on.

Configuration of VEPA capabilities through Linux userspace bridge
utilities is provided by an additional patch 'bridge-utils: add
basic VEPA support'.
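For illustration only (not part of this patch or of the bridge-utils patch), a minimal
userspace sketch of driving the new attributes directly through sysfs. It assumes the
usual bridge sysfs layout (/sys/class/net/<bridge>/bridge/ for bridge attributes and
/sys/class/net/<port>/brport/ for per-port attributes) and uses the placeholder names
br0, eth0 and eth1; in a real deployment hairpin_mode is set on the adjacent bridge,
not on the VEPA end station itself.

/*
 * Illustrative only: write the new sysfs attributes from userspace.
 * Paths and interface names (br0, eth0, eth1) are examples.
 */
#include <stdio.h>
#include <stdlib.h>

static void sysfs_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	/* on the end station: turn br0 into a VEPA with eth0 as its uplink */
	sysfs_write("/sys/class/net/br0/bridge/vepa_mode", "1");
	sysfs_write("/sys/class/net/br0/bridge/uplink_port", "eth0");

	/* on the adjacent bridge: let port eth1 reflect frames back out */
	sysfs_write("/sys/class/net/eth1/brport/hairpin_mode", "1");
	return 0;
}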

You can find additional information on VEPA here:
http://tech.groups.yahoo.com/group/evb/
http://www.ieee802.org/1/files/public/docs2009/new-hudson-vepa_seminar-20090514d.pdf

Signed-off-by: Paul Congdon <***@hp.com>
Signed-off-by: Anna Fischer <***@hp.com>

---

net/bridge/br_fdb.c | 22 ++++++++++++++
net/bridge/br_forward.c | 24 ++++++++++++++-
net/bridge/br_if.c | 3 ++
net/bridge/br_input.c | 9 ++++++
net/bridge/br_private.h | 12 ++++++++
net/bridge/br_sysfs_br.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++
net/bridge/br_sysfs_if.c | 17 +++++++++++
7 files changed, 154 insertions(+), 2 deletions(-)

diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a48f5ef..7d0f6ed 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -394,6 +394,15 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 
 	fdb = fdb_find(head, addr);
 	if (likely(fdb)) {
+		/*
+		 * If we are a VEPA and the source port is the uplink,
+		 * this could be a reflected packet, so don't learn any
+		 * addresses that already are in the fdb but on other ports
+		 */
+		if ((br->flags & BR_VEPA_MODE) && br->uplink == source &&
+		    fdb->dst != br->uplink)
+			return;
+
 		/* attempt to update an entry for a local interface */
 		if (unlikely(fdb->is_local)) {
 			if (net_ratelimit())
@@ -415,3 +424,16 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 		spin_unlock(&br->hash_lock);
 	}
 }
+
+struct net_bridge_port *br_vepa_find_src(struct net_bridge *br,
+					 const unsigned char *addr)
+{
+	struct hlist_head *head = &br->hash[br_mac_hash(addr)];
+	struct net_bridge_fdb_entry *fdb;
+
+	fdb = fdb_find(head, addr);
+	if (fdb)
+		return fdb->dst;
+	else
+		return NULL;
+}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index d2c27c8..ff1135e 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -22,7 +22,8 @@
 static inline int should_deliver(const struct net_bridge_port *p,
 				 const struct sk_buff *skb)
 {
-	return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING);
+	return (((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
+		p->state == BR_STATE_FORWARDING);
 }
 
 static inline unsigned packet_length(const struct sk_buff *skb)
@@ -92,6 +93,17 @@ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 }
 
 /* called with rcu_read_lock */
+void br_vepa_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
+{
+	if (!skb_warn_if_lro(skb) && (to != NULL)) {
+		__br_forward(to, skb);
+		return;
+	}
+
+	kfree_skb(skb);
+}
+
+/* called with rcu_read_lock */
 void br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
 {
 	if (should_deliver(to, skb)) {
@@ -109,11 +121,19 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
 {
 	struct net_bridge_port *p;
 	struct net_bridge_port *prev;
+	struct net_bridge_port *sp = NULL;
+
+	/*
+	 * If we are a VEPA, then we do not want to send the frame
+	 * to the port it came from originally.
+	 */
+	if (br->flags & BR_VEPA_MODE)
+		sp = br_vepa_find_src(br, eth_hdr(skb)->h_source);
 
 	prev = NULL;
 
 	list_for_each_entry_rcu(p, &br->port_list, list) {
-		if (should_deliver(p, skb)) {
+		if (should_deliver(p, skb) && p != sp) {
 			if (prev != NULL) {
 				struct sk_buff *skb2;
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 8a96672..22239ef 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -146,6 +146,8 @@ static void del_nbp(struct net_bridge_port *p)
 	list_del_rcu(&p->list);
 
 	rcu_assign_pointer(dev->br_port, NULL);
+	if (br->uplink == p)
+		br->uplink = NULL;
 
 	kobject_uevent(&p->kobj, KOBJ_REMOVE);
 	kobject_del(&p->kobj);
@@ -203,6 +205,7 @@ static struct net_device *new_bridge_dev(struct net *net, const char *name)
 	br->topology_change = 0;
 	br->topology_change_detected = 0;
 	br->ageing_time = 300 * HZ;
+	br->uplink = NULL;
 
 	br_netfilter_rtable_init(br);
 
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 5ee1a36..8027156 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -50,6 +50,15 @@ int br_handle_frame_finish(struct sk_buff *skb)
 	br = p->br;
 	br_fdb_update(br, p, eth_hdr(skb)->h_source);
 
+	/*
+	 * If we are a VEPA, and the receiving port is not the uplink we
+	 * simply want to send this frame to the uplink (after learning)
+	 */
+	if ((br->flags & BR_VEPA_MODE) && p != br->uplink) {
+		br_vepa_deliver(br->uplink, skb);
+		goto out;
+	}
+
 	if (p->state == BR_STATE_LEARNING)
 		goto drop;
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b6c3b71..0c7ee4c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -82,6 +82,9 @@ struct net_bridge_port
 	struct timer_list		message_age_timer;
 	struct kobject			kobj;
 	struct rcu_head			rcu;
+
+	unsigned long			flags;
+#define BR_HAIRPIN_MODE		0x00000001
 };
 
 struct net_bridge
@@ -98,6 +101,7 @@ struct net_bridge
 #endif
 	unsigned long			flags;
 #define BR_SET_MAC_ADDR		0x00000001
+#define BR_VEPA_MODE		0x00000010
 
 	/* STP */
 	bridge_id			designated_root;
@@ -128,6 +132,9 @@ struct net_bridge
 	struct timer_list		topology_change_timer;
 	struct timer_list		gc_timer;
 	struct kobject			*ifobj;
+
+	/* VEPA */
+	struct net_bridge_port		*uplink;
 };
 
 extern struct notifier_block br_device_notifier;
@@ -165,6 +172,9 @@ extern int br_fdb_insert(struct net_bridge *br,
 extern void br_fdb_update(struct net_bridge *br,
 			  struct net_bridge_port *source,
 			  const unsigned char *addr);
+extern struct net_bridge_port *br_vepa_find_src(struct net_bridge *br,
+						const unsigned char *addr);
+
 
 /* br_forward.c */
 extern void br_deliver(const struct net_bridge_port *to,
@@ -175,6 +185,8 @@ extern void br_forward(const struct net_bridge_port *to,
 extern int br_forward_finish(struct sk_buff *skb);
 extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb);
 extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb);
+extern void br_vepa_deliver(const struct net_bridge_port *to,
+			    struct sk_buff *skb);
 
 /* br_if.c */
 extern void br_port_carrier_check(struct net_bridge_port *p);
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 603d892..557d7c3 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -344,6 +344,73 @@ static ssize_t store_flush(struct device *d,
 }
 static DEVICE_ATTR(flush, S_IWUSR, NULL, store_flush);
 
+static ssize_t show_vepa_mode(struct device *d,
+			      struct device_attribute *attr, char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	int vepa_mode = (br->flags & BR_VEPA_MODE) ? 1 : 0;
+	return sprintf(buf, "%d\n", vepa_mode);
+}
+
+static ssize_t store_vepa_mode(struct device *d,
+			       struct device_attribute *attr, const char *buf,
+			       size_t len)
+{
+	struct net_bridge *br = to_bridge(d);
+	int vepa_mode = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (sscanf(buf, "%d", &vepa_mode) != 1)
+		return -EINVAL;
+
+	rtnl_lock();
+	if (vepa_mode)
+		br->flags |= BR_VEPA_MODE;
+	else
+		br->flags &= ~BR_VEPA_MODE;
+	rtnl_unlock();
+
+	return len;
+}
+static DEVICE_ATTR(vepa_mode, S_IRUGO | S_IWUSR, show_vepa_mode,
+		   store_vepa_mode);
+
+static ssize_t show_uplink_port(struct device *d, struct device_attribute *attr,
+				char *buf)
+{
+	struct net_bridge *br = to_bridge(d);
+	if (br->uplink && br->uplink->dev)
+		return sprintf(buf, "%s\n", br->uplink->dev->name);
+	else
+		return sprintf(buf, "\n");
+}
+
+static ssize_t store_uplink_port(struct device *d,
+				 struct device_attribute *attr, const char *buf,
+				 size_t len)
+{
+	struct net_bridge *br = to_bridge(d);
+	struct net_device *dev;
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	dev = dev_get_by_name(&init_net, buf);
+	if (!dev || !dev->br_port || (dev->br_port->br != br)) {
+		br->uplink = NULL;
+		return -EINVAL;
+	}
+
+	rtnl_lock();
+	br->uplink = dev->br_port;
+	rtnl_unlock();
+
+	return len;
+}
+static DEVICE_ATTR(uplink_port, S_IRUGO | S_IWUSR, show_uplink_port,
+		   store_uplink_port);
+
 static struct attribute *bridge_attrs[] = {
 	&dev_attr_forward_delay.attr,
 	&dev_attr_hello_time.attr,
@@ -363,6 +430,8 @@ static struct attribute *bridge_attrs[] = {
 	&dev_attr_gc_timer.attr,
 	&dev_attr_group_addr.attr,
 	&dev_attr_flush.attr,
+	&dev_attr_vepa_mode.attr,
+	&dev_attr_uplink_port.attr,
 	NULL
 };
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 02b2d50..0e79531 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -143,6 +143,22 @@ static ssize_t store_flush(struct net_bridge_port *p, unsigned long v)
 }
 static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush);
 
+static ssize_t show_hairpin_mode(struct net_bridge_port *p, char *buf)
+{
+	int hairpin_mode = (p->flags & BR_HAIRPIN_MODE) ? 1 : 0;
+	return sprintf(buf, "%d\n", hairpin_mode);
+}
+static ssize_t store_hairpin_mode(struct net_bridge_port *p, unsigned long v)
+{
+	if (v)
+		p->flags |= BR_HAIRPIN_MODE;
+	else
+		p->flags &= ~BR_HAIRPIN_MODE;
+	return 0;
+}
+static BRPORT_ATTR(hairpin_mode, S_IRUGO | S_IWUSR,
+		   show_hairpin_mode, store_hairpin_mode);
+
 static struct brport_attribute *brport_attrs[] = {
 	&brport_attr_path_cost,
 	&brport_attr_priority,
@@ -159,6 +175,7 @@ static struct brport_attribute *brport_attrs[] = {
 	&brport_attr_forward_delay_timer,
 	&brport_attr_hold_timer,
 	&brport_attr_flush,
+	&brport_attr_hairpin_mode,
 	NULL
 };
--
Stephen Hemminger
2009-08-07 04:00:02 UTC
On Mon, 15 Jun 2009 17:33:10 +0000
Post by Fischer, Anna
This patch adds basic Virtual Ethernet Port Aggregator (VEPA)
capabilities to the Linux kernel Ethernet bridging code.
A Virtual Ethernet Port Aggregator (VEPA) is a capability within
a physical end station that collaborates with an adjacent, external
bridge to provide distributed bridging support between multiple
virtual end stations and external networks. The VEPA collaborates
by forwarding all station-originated frames to the adjacent bridge
for frame processing and frame relay (including so-called 'hairpin'
forwarding) and by steering and replicating frames received from
the VEPA uplink to the appropriate destinations. A VEPA may be
implemented in software or in conjunction with embedded hardware.
In particular, the patch extends the Linux Ethernet bridge to act as
(1) a VEPA - for this we have added VEPA forwarding functionality and
added a configuration option for a VEPA uplink port, or as
(2) a bridge supporting 'hairpin' forwarding - for this we have added a
bridge port 'hairpin' mode which allows sending frames back out
through the port the frame was received on.
Configuration of VEPA capabilities through Linux userspace bridge
utilities is provided by an additional patch 'bridge-utils: add
basic VEPA support'.
After reading more about this, I am not convinced this should be part
of the bridge code. The bridge code really consists of two parts:
forwarding table and optional spanning tree. Well the VEPA code
short circuits both of these; I can't imagine it working
with STP turned on. The only part of bridge code that really gets
used by this are the receive packet hooks and the crufty old
API.

So instead of adding more stuff to existing bridge code, why not
have a new driver for just VEPA. You could
do it with a simple version of macvlan type driver.
Arnd Bergmann
2009-08-07 11:29:44 UTC
Post by Stephen Hemminger
So instead of adding more stuff to existing bridge code, why not
have a new driver for just VEPA. You could
do it with a simple version of macvlan type driver.
The current macvlan driver already does the downstream side of
VEPA and only needs a connection to KVM et al, either using
Or's qemu packet socket interface, or using the macvtap driver
I posted.

Now Anna's patch also addresses the upstream side of VEPA, by
making it possible for the bridge code to send frames back out
of the port that they were received from, if that port
is marked as a hairpin mode port.

Is your suggestion to do that part also with a macvlan type driver?
I've thought about this before, and I guess that would mean
basically the same as the macvlan driver, except hashing the
source MAC address instead of the destination MAC address for
inbound frames. That way you should be able to do something
like:

 Host A                                Host B

      /- nalvcam0 -\                   /- macvlan0 - 192.168.1.1
 br0 -|             |- ethA === ethB -|
      \- nalvcam1 -/                   \- macvlan1 - 192.168.1.2

Now assuming that macvlan0 and macvlan1 are in different
network namespaces or belong to different KVM guests, these
guests would be able to communicate with each other through
the bridge on host A, which can set the policy (using ebtables)
for this communication and get interface statistics on its
nalvcam interfaces. Also, instead of having the br0, Host A could
assign an IP addresses to the two nalvcam interfaces that host
B has, and use IP forwarding between the guests of host B.
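To make the source-MAC idea a bit more concrete, here is a rough sketch of that
inbound steering (hypothetical structures and helper names, not actual macvlan
code): frames coming back from the hairpin turn are matched against the
downstream ports by their source MAC, and the originating port is skipped when
replicating broadcasts.

/*
 * Conceptual sketch only -- not actual macvlan code. Downstream ports
 * are looked up by *source* MAC so that a frame reflected by the
 * adjacent bridge is never delivered back to its originator.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define NPORTS 8

struct vport {				/* hypothetical per-guest port */
	uint8_t mac[6];
	bool    in_use;
};

static struct vport ports[NPORTS];

static struct vport *find_port_by_mac(const uint8_t *mac)
{
	for (int i = 0; i < NPORTS; i++)
		if (ports[i].in_use && !memcmp(ports[i].mac, mac, 6))
			return &ports[i];
	return NULL;
}

static void deliver(struct vport *port, const uint8_t *frame, size_t len)
{
	/* hand the frame to the guest behind this port (omitted) */
	(void)port; (void)frame; (void)len;
}

/* frame received from the uplink, possibly reflected by the hairpin turn */
void rx_from_uplink(const uint8_t *frame, size_t len)
{
	const uint8_t *dst = frame;		/* destination MAC */
	const uint8_t *src = frame + 6;		/* source MAC */
	struct vport *origin = find_port_by_mac(src);

	if (dst[0] & 1) {
		/* broadcast/multicast: replicate, but never to the origin */
		for (int i = 0; i < NPORTS; i++)
			if (ports[i].in_use && &ports[i] != origin)
				deliver(&ports[i], frame, len);
	} else {
		struct vport *target = find_port_by_mac(dst);

		if (target && target != origin)
			deliver(target, frame, len);
	}
}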

Arnd <><
Paul Congdon (UC Davis)
2009-08-07 19:44:06 UTC
Arnd,



I don't think your scheme works too well because broadcast packets coming
from other interfaces on br0 would get replicated and sent across the wire
to ethB multiple times.



Paul

That way you should be able to do something
like:

 Host A                                Host B

      /- nalvcam0 -\                   /- macvlan0 - 192.168.1.1
 br0 -|             |- ethA === ethB -|
      \- nalvcam1 -/                   \- macvlan1 - 192.168.1.2

Now assuming that macvlan0 and macvlan1 are in different
network namespaces or belong to different KVM guests, these
guests would be able to communicate with each other through
the bridge on host A, which can set the policy (using ebtables)
for this communication and get interface statistics on its
nalvcam interfaces. Also, instead of having the br0, Host A could
assign an IP addresses to the two nalvcam interfaces that host
B has, and use IP forwarding between the guests of host B.
Arnd Bergmann
2009-08-10 15:23:10 UTC
Post by Paul Congdon (UC Davis)
I don't think your scheme works too well because broadcast packets coming
from other interfaces on br0 would get replicated and sent across the wire
to ethB multiple times.
Right, that won't work. So the bridge patch for the hairpin turn
is still the best solution. Btw, how will that interact with
the bridge-netfilter (ebtables) setup? Can you apply any filters
that work on current bridges also between two VEPA ports while
doing the hairpin turn?

Arnd <><
Fischer, Anna
2009-08-10 15:59:04 UTC
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Paul Congdon (UC Davis)
I don't think your scheme works too well because broadcast packets coming
from other interfaces on br0 would get replicated and sent across the wire
to ethB multiple times.
Right, that won't work. So the bridge patch for the hairpin turn
is still the best solution.
Yes, I think that we should separate the discussions between hairpin
mode on the adjacent bridge and the VEPA filtering service residing
within the end-station. The hairpin feature really has to be
implemented in the bridging code.
Btw, how will that interact with
the bridge-netfilter (ebtables) setup? Can you apply any filters
that work on current bridges also between two VEPA ports while
doing the hairpin turn?
The hairpin mode is implemented on the adjacent bridge. The only
difference for a hairpin mode port vs. a normal bridge port is that
it can pass frames back out to the same port it came from. All the
netfilter hooks are still in place.

On the VEPA filtering service side, the only change we have implemented
in the bridging code is that in VEPA mode all frames are passed to the
uplink on TX. However, frames are still passed through the netfilter
hooks before they go out on the wire. On the inbound path, there are
no changes to the way frames are processed (except the filtering for
the original source port), so netfilter hooks work in the same way
as for a normal bridge.

If a frame is reflected back because of a hairpin turn, then of course
the incoming port is the VEPA uplink port and not the port that
originally sent the frame. So if you are trying to enforce some
packet filtering on that inbound path, then you would have to do that
based on MAC addresses and not on bridge ports. But I would assume that
you would enforce the filtering already before you send out the frame
to the adjacent bridge. Apart from that, if you enable your bridge to
behave in VEPA mode, then you would typically do packet filtering etc
on the adjacent bridge and not use the netfilter hook. You can still use
both though, if you like.

Anna
Arnd Bergmann
2009-08-10 16:16:08 UTC
Post by Fischer, Anna
On the VEPA filtering service side, the only change we have implemented
in the bridging code is that in VEPA mode all frames are passed to the
uplink on TX. However, frames are still passed through the netfilter
hooks before they go out on the wire. On the inbound path, there are
no changes to the way frames are processed (except the filtering for
the original source port), so netfilter hooks work in the same way
as for a normal bridge.
Ah, interesting. I did not realize that the hooks were still active,
although that obviously makes sense. So that would be another
important difference between our implementations.
Post by Fischer, Anna
If a frame is reflected back because of a hairpin turn, then of course
the incoming port is the VEPA uplink port and not the port that
originally sent the frame. So if you are trying to enforce some
packet filtering on that inbound path, then you would have to do that
based on MAC addresses and not on bridge ports. But I would assume that
you would enforce the filtering already before you send out the frame
to the adjacent bridge. Apart from that, if you enable your bridge to
behave in VEPA mode, then you would typically do packet filtering etc
on the adjacent bridge and not use the netfilter hook. You can still use
both though, if you like.
Right, that was my point. The bridge in VEPA mode would likely be
configured to be completely ignorant of the data going through it
and not do any filtering, and you do all filtering on the adjacent
bridge.

I just wasn't sure that this is possible with ebtables if the
adjacent bridge is a Linux system with the bridge in hairpin turn
mode.

Arnd <><
Paul Congdon (UC Davis)
2009-08-07 18:58:00 UTC
Post by Stephen Hemminger
After reading more about this, I am not convinced this should be part
of the bridge code. The bridge code really consists of two parts:
forwarding table and optional spanning tree. Well the VEPA code short
circuits both of these; I can't imagine it working with STP turned
on. The only part of bridge code that really gets used by this are the
receive packet hooks and the crufty old API.
So instead of adding more stuff to existing bridge code, why not have
a new driver for just VEPA. You could do it with a simple version of
macvlan type driver.
Stephen,

Thanks for your comments and questions. We do believe the bridge code is
the right place for this, so I'd like to embellish on that a bit more to
help persuade you. Sorry for the long winded response, but here are some
thoughts:

- First and foremost, VEPA is going to be a standard addition to the IEEE
802.1Q specification. The working group agreed at the last meeting to
pursue a project to augment the bridge standard with hairpin mode (aka
reflective relay) and a remote filtering service (VEPA). See for details:
http://www.ieee802.org/1/files/public/docs2009/new-evb-congdon-evbPar5C-0709-v01.pdf

- The VEPA functionality was really a pretty small change to the code with
low risk and wouldn't seem to warrant an entire new driver or module.

- There are good use cases where VMs will want to have some of their
interfaces attached to bridges and others to bridges operating in VEPA mode.
In other words, we see simultaneous operation of the bridge code and VEPA
occurring, so having as much of the underlying code as common as possible
would seem to be beneficial.

- By augmenting the bridge code with VEPA there is a great amount of re-use
achieved. It works wherever the bridge code works and doesn't need anything
special to support KVM, XEN, and all the hooks, etc...

- The hardware vendors building SR-IOV NICs with embedded switches will be
adding VEPA mode, so keeping the bridge module in sync would be
consistent with this trend and direction. It will be possible to extend the
hardware implementations by cascading a software bridge and/or VEPA, so
being in sync with the architecture would make this more consistent.

- The forwarding table is still needed and used on inbound traffic to
deliver frames to the correct virtual interfaces and to filter any reflected
frames. A new driver would have to basically implement an equivalent
forwarding table anyway. As I understand the current macvlan type driver,
it wouldn't filter multicast frames properly without such a table.

- It seems the hairpin mode would be needed in the bridge module whether
VEPA was added to the bridge module or a new driver. Having the associated
changes together in the same code could aid in understanding and deployment.

As I understand the macvlan code, it currently doesn't allow two VMs on the
same machine to communicate with one another. I could imagine a hairpin
mode on the adjacent bridge making this possible, but the macvlan code would
need to be updated to filter reflected frames so a source did not receive
his own packet. I could imagine this being done as well, but to also
support selective multicast usage, something similar to the bridge
forwarding table would be needed. I think putting VEPA into a new driver
would cause you to implement many things the bridge code already supports.
Given that we expect the bridge standard to ultimately include VEPA, and the
new functions are basic forwarding operations, it seems to make most sense
to keep this consistent with the bridge module.

Paul
Arnd Bergmann
2009-08-08 09:49:27 UTC
Post by Paul Congdon (UC Davis)
As I understand the macvlan code, it currently doesn't allow two VMs on the
same machine to communicate with one another.
There are patches to do that. I think if we add that, there should be
a way to choose the behavior between either bridging between the
guests or VEPA.
Post by Paul Congdon (UC Davis)
I could imagine a hairpin mode on the adjacent bridge making this
possible, but the macvlan code would need to be updated to filter
reflected frames so a source did not receive his own packet.
Right, I missed this point so far. I'll follow up with a patch
to do that.
Post by Paul Congdon (UC Davis)
I could imagine this being done as well, but to also
support selective multicast usage, something similar to the bridge
forwarding table would be needed. I think putting VEPA into a new driver
would cause you to implement many things the bridge code already supports.
Given that we expect the bridge standard to ultimately include VEPA, and the
new functions are basic forwarding operations, it seems to make most sense
to keep this consistent with the bridge module.
This is the interesting part of the discussion. The bridge and macvlan
drivers certainly have an overlap in functionality and you can argue
that you only need one. Then again, the bridge code is a little crufty
and we might not want to add much more to it for functionality that can
be implemented in a much simpler way elsewhere. My preferred way would
be to use bridge when you really need 802.1d MAC learning, netfilter-bridge
and STP, while we put the optimizations for stuff like VMDq, zero-copy
and multiqueue guest adapters only into the macvlan code.

Arnd <><
Fischer, Anna
2009-08-10 13:16:00 UTC
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Paul Congdon (UC Davis)
As I understand the macvlan code, it currently doesn't allow two VMs
on the same machine to communicate with one another.
There are patches to do that. I think if we add that, there should be
a way to choose the behavior between either bridging between the
guests or VEPA.
If you implement this direct bridging capability between local VMs for
macvlan, then would this not break existing applications that currently
use it? It would be quite a significant change to how macvlan works
today. I guess, ideally, you would want to have macvlan work in
separate modes, e.g. traditional macvlan, bridging, and VEPA.
Post by Paul Congdon (UC Davis)
I could imagine a hairpin mode on the adjacent bridge making this
possible, but the macvlan code would need to be updated to filter
reflected frames so a source did not receive his own packet.
Right, I missed this point so far. I'll follow up with a patch
to do that.
Can you maybe point me to the missing patches for macvlan that you
have mentioned in other emails, and the one you mention above?
E.g. enabling multicast distribution and allowing local bridging etc.
I could not find any of those in the archives. Thanks.
Post by Paul Congdon (UC Davis)
I could imagine this being done as well, but to also
support selective multicast usage, something similar to the bridge
forwarding table would be needed. I think putting VEPA into a new driver
would cause you to implement many things the bridge code already supports.
Given that we expect the bridge standard to ultimately include VEPA, and the
new functions are basic forwarding operations, it seems to make most sense
to keep this consistent with the bridge module.
This is the interesting part of the discussion. The bridge and macvlan
drivers certainly have an overlap in functionality and you can argue
that you only need one. Then again, the bridge code is a little crufty
and we might not want to add much more to it for functionality that can
be implemented in a much simpler way elsewhere. My preferred way would
be to use bridge when you really need 802.1d MAC learning, netfilter-bridge
and STP, while we put the optimizations for stuff like VMDq, zero-copy
and multiqueue guest adapters only into the macvlan code.
I can see this being a possible solution.

My concern with putting VEPA into macvlan instead of the bridging code
is that there will be more work required to make it usable for other
virtualization solutions as macvtap will only work for KVM type setups.
Basically, VEPA capabilities would rely on someone developing further
drivers to connect macvlan to different backend interfaces, e.g. one for
KVM (macvtap), one for Xen PV drivers, one for virtio, and whatever else
is out there, or will be there in the future. The bridging code is
already very generic in that respect, and all virtualization layers
can deal with connecting interfaces to a bridge.

Our extensions to the bridging code to enable VEPA for the Linux kernel
are only very minimal code changes and would make VEPA available
to most virtualization solutions today.

Anna
Arnd Bergmann
2009-08-10 15:07:32 UTC
Post by Fischer, Anna
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Paul Congdon (UC Davis)
As I understand the macvlan code, it currently doesn't allow two VMs
on the same machine to communicate with one another.
There are patches to do that. I think if we add that, there should be
a way to choose the behavior between either bridging between the
guests or VEPA.
If you implement this direct bridging capability between local VMs for
macvlan, then would this not break existing applications that currently
use it? It would be quite a significant change to how macvlan works
today. I guess, ideally, you would want to have macvlan work in
separate modes, e.g. traditional macvlan, bridging, and VEPA.
Right, that's what I meant with my sentence above. I'm not sure
if we need to differentiate traditional macvlan and VEPA though.
AFAICT, the only difference should be the handling of broadcast
and multicast frames returning from the hairpin turn. Since this
does not happen with a traditional macvlan, we can always send them
to all macvlan ports except the source port.
Post by Fischer, Anna
Post by Paul Congdon (UC Davis)
I could imagine a hairpin mode on the adjacent bridge making this
possible, but the macvlan code would need to be updated to filter
reflected frames so a source did not receive his own packet.
Right, I missed this point so far. I'll follow up with a patch
to do that.
Can you maybe point me to the missing patches for macvlan that you
have mentioned in other emails, and the one you mention above?
E.g. enabling multicast distribution and allowing local bridging etc.
I could not find any of those in the archives. Thanks.
The patch from Eric Biederman to allow macvlan to bridge between
its slave ports is at

http://kerneltrap.org/mailarchive/linux-netdev/2009/3/9/5125774

I could not find any patches for the other features (or bugs).
Post by Fischer, Anna
This is the interesting part of the discussion. The bridge and macvlan
drivers certainly have an overlap in functionality and you can argue
that you only need one. Then again, the bridge code is a little crufty
and we might not want to add much more to it for functionality that can
be implemented in a much simpler way elsewhere. My preferred way would
be to use bridge when you really need 802.1d MAC learning, netfilter-bridge
and STP, while we put the optimizations for stuff like VMDq, zero-copy
and multiqueue guest adapters only into the macvlan code.
I can see this being a possible solution.
My concern with putting VEPA into macvlan instead of the bridging code
is that there will be more work required to make it usable for other
virtualization solutions as macvtap will only work for KVM type setups.
Right, I understand.
Post by Fischer, Anna
Basically, VEPA capabilities would rely on someone developing further
drivers to connect macvlan to different backend interfaces, e.g. one for
KVM (macvtap), one for Xen PV drivers, one for virtio, and whatever else
is out there, or will be there in the future. The bridging code is
already very generic in that respect, and all virtualization layers
can deal with connecting interfaces to a bridge.
Our extensions to the bridging code to enable VEPA for the Linux kernel
are only very minimal code changes and would make VEPA available
to most virtualization solutions today.
I don't object to having VEPA supported in the bridge code at all.
I think your patch is simple enough so it won't hurt in the bridge
code. If Stephen prefers to do VEPA only in one component, we should
probably make it possible for that component to act as a bridge between
1+n existing interfaces as well. You can almost do that with the regular
macvlan and the bridge driver, like

      / macvlan0 - br0 - tap0
eth0 -- macvlan1 - br1 - tap1
      \ macvlan2 - br2 - tap2

Here, you can have two guests attached to tap devices (or xen net ...)
and the macvlan driver doing the VEPA. Of course this is not how bridge
works -- you would have the same mac addresses on two sides of
the bridge.

So we could have another macvlan backend (let's call it macvbridge)
so you can do this:

      / macvlan0 - 'qemu -net raw'
eth0 -- macvtap0 - 'qemu -net tap,fd=3 3<>/dev/net/macvtap0'
      \ macvbr0 -- tap0 - 'qemu -net tap'

The macvbr driver could this way be used to associate an existing
network device to a slave of a macvlan port. Not sure if this
has any significant advantage over your bridge patches, it does
have the obvious disadvantage that someone needs to implement
it first, while your patch is there ;-)

Arnd <><
Fischer, Anna
2009-08-11 14:30:23 UTC
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Fischer, Anna
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Paul Congdon (UC Davis)
As I understand the macvlan code, it currently doesn't allow two VMs
on the same machine to communicate with one another.
There are patches to do that. I think if we add that, there should be
a way to choose the behavior between either bridging between the
guests or VEPA.
If you implement this direct bridging capability between local VMs for
macvlan, then would this not break existing applications that currently
use it? It would be quite a significant change to how macvlan works
today. I guess, ideally, you would want to have macvlan work in
separate modes, e.g. traditional macvlan, bridging, and VEPA.
Right, that's what I meant with my sentence above. I'm not sure
if we need to differentiate traditional macvlan and VEPA though.
AFAICT, the only difference should be the handling of broadcast
and multicast frames returning from the hairpin turn. Since this
does not happen with a traditional macvlan, we can always send them
to all macvlan ports except the source port.
Yes, if you add a check for the original source port on broadcast/
multicast delivery, then macvlan would be able to function in
basic VEPA mode.

Maybe it might still be worth preserving the old behaviour, and
just add an explicit VEPA mode.
Post by Fischer, Anna
Post by Paul Congdon (UC Davis)
I could imagine a hairpin mode on the adjacent bridge making this
possible, but the macvlan code would need to be updated to filter
reflected frames so a source did not receive his own packet.
Right, I missed this point so far. I'll follow up with a patch
to do that.
Can you maybe point me to the missing patches for macvlan that you
have mentioned in other emails, and the one you mention above?
E.g. enabling multicast distribution and allowing local bridging etc.
I could not find any of those in the archives. Thanks.
The patch from Eric Biederman to allow macvlan to bridge between
its slave ports is at
http://kerneltrap.org/mailarchive/linux-netdev/2009/3/9/5125774
Looking through the discussions here, it does not seem as if a decision
was made to integrate those patches, because they would make the macvlan
interface behave too much like a bridge. Also, it seems as if there was
still a problem with doing multicast/broadcast delivery when enabling
local VM-to-VM communication. Is that solved by now?

Thanks,
Anna
Paul Congdon (UC Davis)
2009-08-11 14:55:04 UTC
Post by Fischer, Anna
Post by Arnd Bergmann
The patch from Eric Biederman to allow macvlan to bridge between
its slave ports is at
http://kerneltrap.org/mailarchive/linux-netdev/2009/3/9/5125774
Looking through the discussions here, it does not seem as if a decision
was made to integrate those patches, because they would make the macvlan
interface behave too much like a bridge. Also, it seems as if there was
still a problem with doing multicast/broadcast delivery when enabling
local VM-to-VM communication. Is that solved by now?
Also, is there a solution, or plans for a solution, to address macvtap
interfaces that are set to 'promiscuous' mode? It would seem fairly easy to
support this for interfaces that are simply trying to listen to the port
(e.g. Wireshark). If the port was being used by something like a firewall
then the VEPA filtering doesn't work too well.

Paul
Arnd Bergmann
2009-08-12 13:19:36 UTC
Post by Paul Congdon (UC Davis)
Post by Fischer, Anna
Post by Arnd Bergmann
The patch from Eric Biederman to allow macvlan to bridge between
its slave ports is at
http://kerneltrap.org/mailarchive/linux-netdev/2009/3/9/5125774
Looking through the discussions here, it does not seem as if a decision
was made to integrate those patches, because they would make the
macvlan interface behave too much like a bridge.
Right, that question is still open, and I don't see it as very important
right now, as long as we can still use it for VEPA.
Post by Paul Congdon (UC Davis)
Post by Fischer, Anna
Also, it seems as if there was still a problem with doing
multicast/broadcast delivery when enabling local VM-to-VM
communication. Is that solved by now?
Not yet, but I guess it comes as a natural extension when I fix
multicast/broadcast delivery from the reflective relay for VEPA.

The logic that I would use there is:

broadcast from a downstream port:

	if (bridge_mode(source_port)) {
		forward_to_upstream(frame);
		for_each_downstream(port) {
			/* deliver to all bridge ports except self, do
			   not deliver to any VEPA port. */
			if (bridge_mode(port) && port != source_port) {
				forward_to_downstream(frame, port);
			}
		}
	} else {
		forward_to_upstream(frame);
	}

broadcast from the upstream port:

	if (bridge_mode(frame.source)) {
		/* comes from a port in bridge mode, so has already been
		   delivered to all other bridge ports */
		for_each_downstream(port) {
			if (!bridge_mode(port)) {
				forward_to_downstream(frame, port);
			}
		}
	} else if (vepa_mode(frame.source)) {
		/* comes from VEPA port, so need to deliver to all
		   bridge and all vepa ports except self */
		for_each_downstream(port) {
			if (port != frame.source)
				forward_to_downstream(frame, port);
		}
	} else {
		/* external source, so flood to everyone */
		for_each_downstream(port) {
			forward_to_downstream(frame, port);
		}
	}

For multicast, we can do the same, or optionally add a per-port filter
as you mentioned, if it becomes a bottleneck.

Do you think this addresses the problem, or did I miss something important?
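If that per-port filter is ever needed, I would picture something as simple as
the following sketch (made-up structures, nothing that exists in macvlan
today): each downstream port keeps a small list of subscribed group addresses,
and a multicast frame is only delivered where the list matches.

/* Hypothetical per-port multicast filter -- illustration only. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define MC_MAX 16

struct mc_filter {
	uint8_t addr[MC_MAX][6];	/* subscribed group MAC addresses */
	int     count;
};

bool mc_filter_match(const struct mc_filter *f, const uint8_t *group)
{
	for (int i = 0; i < f->count; i++)
		if (!memcmp(f->addr[i], group, 6))
			return true;
	return false;
}

bool mc_filter_add(struct mc_filter *f, const uint8_t *group)
{
	if (f->count >= MC_MAX)
		return false;	/* table full: caller falls back to flooding */
	memcpy(f->addr[f->count++], group, 6);
	return true;
}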
Post by Paul Congdon (UC Davis)
Also, is there a solution, or plans for a solution, to address macvtap
interfaces that are set to 'promiscuous' mode? It would seem fairly easy to
support this for interfaces that are simply trying to listen to the port
(e.g. Wireshark).
If you want to use tcpdump or wireshark on all ports simultaneously in a pure
VEPA, you can still attach it to the 'lowerdev', e.g. eth0 or eth0.2 (for macvlan
nested in vlan).
If we allow bridge ports, we might want to extend the local delivery
to also go through all the hooks of the external port, so that you can
attach packet sockets there.
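As a purely illustrative example of the packet-socket approach, something like
the following could run on the host, bound to the lowerdev (assumed here to be
eth0); it needs CAP_NET_RAW and simply prints what it captures.

/* Illustration only: capture everything seen on the macvlan lower device. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <sys/socket.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

int main(void)
{
	struct sockaddr_ll sll;
	unsigned char frame[2048];
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&sll, 0, sizeof(sll));
	sll.sll_family = AF_PACKET;
	sll.sll_protocol = htons(ETH_P_ALL);
	sll.sll_ifindex = if_nametoindex("eth0");	/* the lowerdev */
	if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
		perror("bind");
		return 1;
	}

	for (;;) {
		ssize_t len = recv(fd, frame, sizeof(frame), 0);

		if (len < 0)
			break;
		printf("captured %zd bytes, dst %02x:%02x:%02x:%02x:%02x:%02x\n",
		       len, frame[0], frame[1], frame[2],
		       frame[3], frame[4], frame[5]);
	}
	close(fd);
	return 0;
}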
Post by Paul Congdon (UC Davis)
If the port was being used by something like a firewall
then the VEPA filtering doesn't work too well.
Not sure what you mean. Are you talking about a firewall separating the guests
from the outside, between the VEPA and the reflective relay, or a firewall between
the guests in case of local delivery?

Arnd <><
Fischer, Anna
2009-08-12 14:32:32 UTC
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Paul Congdon (UC Davis)
Post by Fischer, Anna
Post by Arnd Bergmann
The patch from Eric Biederman to allow macvlan to bridge between
its slave ports is at
http://kerneltrap.org/mailarchive/linux-netdev/2009/3/9/5125774
Looking through the discussions here, it does not seem as if a decision
was made to integrate those patches, because they would make the
macvlan interface behave too much like a bridge.
Right, that question is still open, and I don't see it as very important
right now, as long as we can still use it for VEPA.
Yes, for the basic VEPA this is not important. For MultiChannel VEPA, it
would be nice if a macvlan device could operate as VEPA and as a typical
VEB (VEB = traditional bridge but no learning).

Basically, what we would need to be able to support is running a VEB and
a VEPA simultaneously on the same uplink port (e.g. the physical device).
A new component (called the S-Component) would then multiplex frames
to the VEB or the VEPA based on a tagging scheme.

I could see this potentially working with macvlan, if it can operate in
both VEPA and VEB mode. But you are right that for basic VEPA, it would
not be an immediate requirement.
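Just to sketch what that S-Component demux might look like (purely hypothetical
code, and the S-VID assignments are invented for the example): the outer
service tag of an incoming frame selects whether it is handed to the VEB or to
the VEPA path.

/* Hypothetical S-Component demux -- illustration only. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define ETH_P_8021AD 0x88A8		/* S-Tag (service VLAN) ethertype */

static const uint16_t veb_svid = 2;	/* example: channel carrying the VEB */
static const uint16_t vepa_svid = 3;	/* example: channel carrying the VEPA */

static void veb_rx(const uint8_t *frame, size_t len)
{
	printf("VEB: local switching of %zu byte frame\n", len);
	(void)frame;
}

static void vepa_rx(const uint8_t *frame, size_t len)
{
	printf("VEPA: forward %zu byte frame towards the uplink\n", len);
	(void)frame;
}

void s_component_rx(const uint8_t *frame, size_t len)
{
	uint16_t tpid, tci, svid;

	if (len < 18)
		return;			/* too short to carry an S-Tag */

	memcpy(&tpid, frame + 12, 2);
	memcpy(&tci, frame + 14, 2);
	tpid = ntohs(tpid);
	tci = ntohs(tci);

	if (tpid != ETH_P_8021AD)
		return;			/* untagged: not one of our channels */

	svid = tci & 0x0fff;
	if (svid == veb_svid)
		veb_rx(frame, len);	/* local switching between guests */
	else if (svid == vepa_svid)
		vepa_rx(frame, len);	/* hand to the VEPA uplink path */
}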
Post by Paul Congdon (UC Davis)
Also, is there a solution, or plans for a solution, to address macvtap
interfaces that are set to 'promiscuous' mode? It would seem fairly easy to
support this for interfaces that are simply trying to listen to the port
(e.g. Wireshark).
If you want to use tcpdump or wireshark on all ports simultaneously in a pure
VEPA, you can still attach it to the 'lowerdev', e.g. eth0 or eth0.2 (for macvlan
nested in vlan).
If we allow bridge ports, we might want to extend the local delivery
to also go through all the hooks of the external port, so that you can
attach packet sockets there.
I think the question here was whether there is a way for a macvlan interface
to be set to promiscuous mode. At the moment, I believe a macvlan interface
only receives packets based on its destination address (this is for unicast
packets now). What if a macvlan interface wanted to get all packets that
are being received (either on the physical device, or on a particular
VLAN if using macvlan nested in vlan). Would this work easily? Imagine
you have a virtual machine attached to that macvlan / macvtap device and
this VM wants to do packet inspection or network traffic monitoring on
all packets flowing through the virtualized server.

Anna
Arnd Bergmann
2009-08-12 16:27:28 UTC
Post by Fischer, Anna
Yes, for the basic VEPA this is not important. For MultiChannel VEPA, it
would be nice if a macvlan device could operate as VEPA and as a typical
VEB (VEB = traditional bridge but no learning).
Right, this would be a logical extension in that scenario. I would imagine
that in many scenarios running a VEB also means that you want to use
the advanced ebtables/iptables filtering of the bridge subsystem, but
if all guests trust each other, using macvlan to bridge between them
sounds useful as well, if only for simplicity.
Post by Fischer, Anna
Basically, what we would need to be able to support is running a VEB and
a VEPA simultaneously on the same uplink port (e.g. the physical device).
A new component (called the S-Component) would then multiplex frames
to the VEB or the VEPA based on a tagging scheme.
You can of course do that by adding one port of the S-component to
a port of a bridge, and using another port of the S-component to
create macvlan devices, or you could have multiple ports of the
S-component each with a macvlan multiplexor.

Just to make sure I get the chain right, would it look like this?
(adapted from Paul's PDF)

eth0 (external) ---scomponent0 --- vlan2 --- macvlan0
                |              |          \- macvlan1
                |              \-vlan3 --- macvlan2
                |-scomponent1 --- vlan2 --- br0 --- tap0
                |                             \ --- tap1
                |-scomponent2 --- vlan3 --- macvlan3
                \-scomponent3 --- --- --- macvlan4

In this scenario, tap0 and tap1 could communicate over the bridge without
tagging, while any data going out through the S-Component gets tagged
with both a 802.1q Q-Tag and an S-Tag.

macvlan4 would be a guest that does its own tagging, and the external
switch would need to check the VLAN IDs, but it could communicate with
any other guest by tagging the frames as 2 or 3.

macvlan2 and macvlan3 could communicate with each other and with external
guests in vlan3.

Guests on scomponent1 and scomponent3 could in theory have
subdivisions of the network with macvlan running in the guest
to run containers.
Post by Fischer, Anna
I think the question here was whether there is a way for a macvlan interface
to be set to promiscuous mode. At the moment, I believe a macvlan interface
only receives packets based on its destination address (this is for unicast
packets now). What if a macvlan interface wanted to get all packets that
are being received (either on the physical device, or on a particular
VLAN if using macvlan nested in vlan). Would this work easily? Imagine
you have a virtual machine attached to that macvlan / macvtap device and
this VM wants to do packet inspection or network traffic monitoring on
all packets flowing through the virtualized server.
Ok, I see. As I said, the host could easily get access to all frames
on macvlan downstream ports by opening a raw socket on the upstream
port (with some extra work if we want to support this in bridge mode).

If you want the inspection to be done in a guest rather than the host,
the easiest way to achieve that would be to connect that raw socket
to the guest using Or's raw frontend for qemu.

Arnd <><
Fischer, Anna
2009-08-13 22:11:06 UTC
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Fischer, Anna
Yes, for the basic VEPA this is not important. For MultiChannel VEPA, it
would be nice if a macvlan device could operate as VEPA and as a typical
VEB (VEB = traditional bridge but no learning).
Right, this would be a logical extension in that scenario. I would imagine
that in many scenarios running a VEB also means that you want to use
the advanced ebtables/iptables filtering of the bridge subsystem, but
if all guests trust each other, using macvlan to bridge between them
sounds useful as well, if only for simplicity.
Post by Fischer, Anna
Basically, what we would need to be able to support is running a VEB and
a VEPA simultaneously on the same uplink port (e.g. the physical device).
A new component (called the S-Component) would then multiplex frames
to the VEB or the VEPA based on a tagging scheme.
You can of course do that by adding one port of the S-component to
a port of a bridge, and using another port of the S-component to
create macvlan devices, or you could have multiple ports of the
S-component each with a macvlan multiplexor.
Just to make sure I get the chain right, would it look like this?
(adapted from Paul's PDF)
eth0 (external) ---scomponent0 --- vlan2 --- macvlan0
                |              |          \- macvlan1
                |              \-vlan3 --- macvlan2
                |-scomponent1 --- vlan2 --- br0 --- tap0
                |                             \ --- tap1
                |-scomponent2 --- vlan3 --- macvlan3
                \-scomponent3 --- --- --- macvlan4
In this scenario, tap0 and tap1 could communicate over the bridge without
tagging, while any data going out through the S-Component gets tagged
with both a 802.1q Q-Tag and an S-Tag.
Yes, that looks right. If all the different interfaces, e.g. bridge ports,
macvlan devices, vlan tagging devices can be stacked that easily without
any known issues, that would be great.
macvlan4 would be a guest that does its own tagging, and the external
switch would need to check the VLAN IDs, but it could communicate with
any other guest by tagging the frames as 2 or 3.
macvlan2 and macvlan3 could communicate with each other and with external
guests in vlan3.
Guests on scomponent1 and scomponent3 could in theory have
subdivisions of the network with macvlan running in the guest
to run containers.
Post by Fischer, Anna
I think the question here was whether there is a way for a macvlan interface
to be set to promiscuous mode. At the moment, I believe a macvlan interface
only receives packets based on its destination address (this is for unicast
packets now). What if a macvlan interface wanted to get all packets that
are being received (either on the physical device, or on a particular
VLAN if using macvlan nested in vlan). Would this work easily? Imagine
you have a virtual machine attached to that macvlan / macvtap device and
this VM wants to do packet inspection or network traffic monitoring on
all packets flowing through the virtualized server.
Ok, I see. As I said, the host could easily get access to all frames
on macvlan downstream ports by opening a raw socket on the upstream
port (with some extra work if we want to support this in bridge mode).
If you want the inspection to be done in a guest rather than the host,
the easiest way to achieve that would be to connect that raw socket
to the guest using Or's raw frontend for qemu.
I am not too familiar with that raw frontend for qemu to be honest, but
if it can share the physical device with other macvlan interfaces
simultaneously, then I think that would indeed be sufficient to support
promiscuous mode ports. We would need to have a similar sort of driver
for Xen and other hypervisor solutions again as well though.

If it is possible to easily stack macvlan devices and bridges as you
describe above, then a promiscuous port should also be realized quite
easily as a typical bridge port, e.g. as shown above tap0 and tap1
would be typical, traditional bridge ports and thus could send
and receive from/with any MAC addresses they like.

Anna
Fischer, Anna
2009-08-13 22:24:20 UTC
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Fischer, Anna
Post by Arnd Bergmann
The patch from Eric Biederman to allow macvlan to bridge between
its slave ports is at
http://kerneltrap.org/mailarchive/linux-netdev/2009/3/9/5125774
Looking through the discussions here, it does not seem as if a decision
was made to integrate those patches, because they would make the
macvlan interface behave too much like a bridge.
Right, that question is still open, and I don't see it as very important
right now, as long as we can still use it for VEPA.
Post by Fischer, Anna
Also, it seems as if there was still a problem with doing
multicast/broadcast delivery when enabling local VM-to-VM
communication. Is that solved by now?
Not yet, but I guess it comes as a natural extension when I fix
multicast/broadcast delivery from the reflective relay for VEPA.
broadcast from a downstream port:

	if (bridge_mode(source_port)) {
		forward_to_upstream(frame);
		for_each_downstream(port) {
			/* deliver to all bridge ports except self, do
			   not deliver to any VEPA port. */
			if (bridge_mode(port) && port != source_port) {
				forward_to_downstream(frame, port);
			}
		}
	} else {
		forward_to_upstream(frame);
	}

broadcast from the upstream port:

	if (bridge_mode(frame.source)) {
		/* comes from a port in bridge mode, so has already been
		   delivered to all other bridge ports */
		for_each_downstream(port) {
			if (!bridge_mode(port)) {
				forward_to_downstream(frame, port);
			}
		}
	} else if (vepa_mode(frame.source)) {
		/* comes from VEPA port, so need to deliver to all
		   bridge and all vepa ports except self */
		for_each_downstream(port) {
			if (port != frame.source)
				forward_to_downstream(frame, port);
		}
	} else {
		/* external source, so flood to everyone */
		for_each_downstream(port) {
			forward_to_downstream(frame, port);
		}
	}
For multicast, we can do the same, or optionally add a per-port filter
as you mentioned, if it becomes a bottleneck.
Do you think this addresses the problem, or did I miss something important?
Yes, I think this addresses the problem. It would be very useful if
this functionality was in macvlan.

Thanks,
Anna
Arnd Bergmann
2009-08-10 15:06:58 UTC
Permalink
Post by Fischer, Anna
Subject: Re: [PATCH][RFC] net/bridge: add basic VEPA support
Post by Paul Congdon (UC Davis)
As I understand the macvlan code, it currently doesn't allow two VMs
on the
Post by Paul Congdon (UC Davis)
same machine to communicate with one another.
There are patches to do that. I think if we add that, there should be
a way to choose the behavior between either bridging between the
guests or VEPA.
If you implement this direct bridging capability between local VMs for
macvlan, then would this not break existing applications that currently
use it? It would be quite a significant change to how macvlan works
today. I guess, ideally, you would want to have macvlan work in
separate modes, e.g. traditional macvlan, bridging, and VEPA.
Right, that's what I meant with my sentence above. I'm not sure
if we need to differentiate traditional macvlan and VEPA though.
AFAICT, the only difference should be the handling of broadcast
and multicast frames returning from the hairpin turn. Since this
does not happen with a traditional macvlan, we can always send them
to all macvlan ports except the source port.
Post by Fischer, Anna
Post by Paul Congdon (UC Davis)
I could imagine a hairpin mode on the adjacent bridge making this
possible, but the macvlan code would need to be updated to filter
reflected frames so a source did not receive his own packet.
Right, I missed this point so far. I'll follow up with a patch
to do that.
Can you maybe point me to the missing patches for macvlan that you
have mentioned in other emails, and the one you mention above?
E.g. enabling multicast distribution and allowing local bridging etc.
I could not find any of those in the archives. Thanks.
The patch from Eric Biederman to allow macvlan to bridge between
its slave ports is at

http://kerneltrap.org/mailarchive/linux-netdev/2009/3/9/5125774

I could not find any patches for the other features (or bugs).
Post by Fischer, Anna
This is the interesting part of the discussion. The bridge and macvlan
drivers certainly have an overlap in functionality and you can argue
that you only need one. Then again, the bridge code is a little crufty
and we might not want to add much more to it for functionality that can
be implemented in a much simpler way elsewhere. My preferred way would
be to use bridge when you really need 802.1d MAC learning, netfilter-
bridge
and STP, while we put the optimizations for stuff like VMDq, zero-copy
and multiqueue guest adapters only into the macvlan code.
I can see this being a possible solution.
My concern with putting VEPA into macvlan instead of the bridging code
is that there will be more work required to make it usable for other
virtualization solution as macvtap will only work for KVM type setups.
Right, I understand.
Post by Fischer, Anna
Basically, VEPA capabilities would rely on someone developing further
drivers to connect macvlan to different backend interfaces, e.g. one for
KVM (macvtap), one for Xen PV drivers, one for virtio, and whatever else
is out there, or will be there in the future. The bridging code is
already very generic in that respect, and all virtualization layers
can deal with connecting interfaces to a bridge.
Our extensions to the bridging code to enable VEPA for the Linux kernel
are only very minimal code changes and would allow to make VEPA available
to most virtualization solutions today.
I don't object to having VEPA supported in the bridge code at all.
I think your patch is simple enough so it won't hurt in the bridge
code. If Stephen prefers to do VEPA only in one component, we should
probably make it possible for that component to act as a bridge between
1+n existing interfaces as well. You can almost do that with the regular
macvlan and the bridge driver, like

/ macvlan0 - br0 - tap0
eth0 -- macvlan1 - br1 - tap1
\ macvlan2 - br2 - tap2

Here, you can have two guests attached to tap devices (or xen net ...)
and the macvlan driver doing the VEPA. Of course this is not how bridge
works -- you would have the same mac addresses on two sides of
the bridge.

So we could have another macvlan backend (let's call it macvbridge)
so you can do this:

/ macvlan0 - 'qemu -net raw'
eth0 -- macvtap0 - 'qemu -net tap,fd=3 3<>/dev/net/macvtap0'
\ macvbr0 -- tap0 - 'qemu -net tap'

The macvbr driver could then be used to associate an existing
network device with a slave of a macvlan port. Not sure if this
has any significant advantage over your bridge patches; it does
have the obvious disadvantage that someone needs to implement
it first, while your patch is already there ;-)
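For what it's worth, the forwarding core of such a macvbr backend would be
tiny. A rough sketch (the macvbr naming and the attached-device pointer are
made up here; error handling and the reverse direction are omitted):

/* Sketch: hand a frame received on the macvlan slave over to the
 * net_device that was attached to it (e.g. a tap device). The other
 * direction would inject into the macvlan slave with netif_rx(). */
static int macvbr_forward(struct net_device *attached, struct sk_buff *skb)
{
        skb->dev = attached;
        return dev_queue_xmit(skb);
}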

Arnd <><
Fischer, Anna
2009-08-07 21:00:54 UTC
Permalink
Hi Yaron,

Yes, I also believe that VEPA + SRIOV can potentially, in some deployments, achieve better performance than a bridge/tap configuration, especially when you run multiple VMs and if you want to enable more sophisticated network processing in the data path.

If you do have a SRIOV NIC that supports VEPA, then I would think that you do not have QEMU or macvtap in the setup any more though. Simply because in that case the VM can directly access the VF on the physical device. That would be ideal.

I do think that the macvtap driver is a good addition as a simple and fast virtual network I/O interface, in case you do not need full bridge functionality. It does seem to assume though that the virtualization software uses QEMU/tap interfaces. How would this work with a Xen para-virtualized network interface? I guess there would need to be yet another driver?

Anna
--
From: Yaron Haviv [mailto:***@voltaire.com]
Sent: 07 August 2009 21:36
To: ***@yahoogroups.com; ***@linux-foundation.org; Fischer, Anna
Cc: ***@lists.linux-foundation.org; ***@vger.kernel.org; ***@lists.linux-foundation.org; ***@davemloft.net; ***@trash.net; ***@gmail.com; ***@arndb.de
Subject: Re: [evb] RE: [PATCH][RFC] net/bridge: add basic VEPA support

Paul,

I also think that the bridge may not be the right place for VEPA, but rather a simpler sw/hw mux,
although the VEPA support may reside in multiple places (i.e. also in the bridge).

As Arnd pointed out, Or already added an extension to qemu that allows direct guest virtual NIC mapping to an interface device (vs. using tap). This was done specifically to address VEPA, and results in much faster performance and lower CPU overhead (Or and some others are planning additional meaningful performance optimizations).

The interface multiplexing can be achieved using the macvlan driver or an SR-IOV capable NIC (the preferred option); macvlan may need to be extended to support VEPA multicast handling, which looks like a rather simple task.

It may be counter-intuitive for some, but we expect the (completed) qemu VEPA mode + SR-IOV + certain switches with hairpin (VEPA) mode to perform faster than using bridge+tap, even for connecting two VMs on the same host.


Yaron

Sent from BlackBerry
________________________________________
From: ***@yahoogroups.com
To: 'Stephen Hemminger' ; 'Fischer, Anna'
Cc: ***@lists.linux-foundation.org ; linux-***@vger.kernel.org ; ***@vger.kernel.org ; ***@lists.linux-foundation.org ; ***@yahoogroups.com ; ***@davemloft.net ; ***@trash.net ; ***@gmail.com ; 'Arnd Bergmann'
Sent: Fri Aug 07 21:58:00 2009
Subject: [evb] RE: [PATCH][RFC] net/bridge: add basic VEPA support
 
Post by Stephen Hemminger
After reading more about this, I am not convinced this should be part of
the bridge code, which is mostly about the forwarding table and optional
spanning tree. The VEPA code short circuits both of these; I can't imagine
it working with STP turned on. The only parts of the bridge code that
really get used by this are the receive packet hooks and the crufty old API.
So instead of adding more stuff to the existing bridge code, why not have
a new driver for just VEPA? You could do it with a simple version of a
macvlan type driver.
Stephen,

Thanks for your comments and questions. We do believe the bridge code is
the right place for this, so I'd like to elaborate on that a bit more to
help persuade you. Sorry for the long-winded response, but here are some
thoughts:

- First and foremost, VEPA is going to be a standard addition to the IEEE
802.1Q specification. The working group agreed at the last meeting to
pursue a project to augment the bridge standard with hairpin mode (aka
reflective relay) and a remote filtering service (VEPA). See for details:
http://www.ieee802.org/1/files/public/docs2009/new-evb-congdon-evbPar5C-0709-v01.pdf

- The VEPA functionality was really a pretty small change to the code with
low risk and wouldn't seem to warrant an entire new driver or module.

- There are good use cases where VMs will want to have some of their
interfaces attached to bridges and others to bridges operating in VEPA mode.
In other words, we see simultaneous operation of the bridge code and VEPA
occurring, so having as much of the underlying code as common as possible
would seem to be beneficial.

- By augmenting the bridge code with VEPA there is a great amount of re-use
achieved. It works wherever the bridge code works and doesn't need anything
special to support KVM, XEN, and all the hooks, etc...

- The hardware vendors building SR-IOV NICs with embedded switches will be
adding VEPA mode, so by keeping the bridge module in sync would be
consistent with this trend and direction. It will be possible to extend the
hardware implementations by cascading a software bridge and/or VEPA, so
being in sync with the architecture would make this more consistent.

- The forwarding table is still needed and used on inbound traffic to
deliver frames to the correct virtual interfaces and to filter any reflected
frames. A new driver would have to basically implement an equivalent
forwarding table anyway. As I understand the current macvlan type driver,
it wouldn't filter multicast frames properly without such a table.

- It seems the hairpin mode would be needed in the bridge module whether
VEPA was added to the bridge module or a new driver. Having the associated
changes together in the same code could aid in understanding and deployment.

As I understand the macvlan code, it currently doesn't allow two VMs on the
same machine to communicate with one another. I could imagine a hairpin
mode on the adjacent bridge making this possible, but the macvlan code would
need to be updated to filter reflected frames so a source did not receive
his own packet. I could imagine this being done as well, but to also
support selective multicast usage, something similar to the bridge
forwarding table would be needed. I think putting VEPA into a new driver
would cause you to implement many things the bridge code already supports.
Given that we expect the bridge standard to ultimately include VEPA, and the
new functions are basic forwarding operations, it seems to make most sense
to keep this consistent with the bridge module.

Paul
Arnd Bergmann
2009-08-08 09:22:28 UTC
Permalink
Post by Fischer, Anna
If you do have a SRIOV NIC that supports VEPA, then I would think
that you do not have QEMU or macvtap in the setup any more though.
Simply because in that case the VM can directly access the VF on
the physical device. That would be ideal.
There may be reasons why even with an SR-IOV adapter you may want
to use the macvtap setup, with some extensions. E.g. guest migration
becomes a lot simpler if you don't have to deal with PCI passthrough
devices. If we manage to add both TX and RX zero-copy (into the
guest) to the macvlan driver, we can treat an SR-IOV adapter like
a VMDq adapter and get the best of both.
Post by Fischer, Anna
I do think that the macvtap driver is a good addition as a simple
and fast virtual network I/O interface, in case you do not need
full bridge functionality. It does seem to assume though that the
virtualization software uses QEMU/tap interfaces. How would this
work with a Xen para-virtualized network interface? I guess there
would need to be yet another driver?
I'm not sure how Xen guest networking works, but if neither the
traditional macvlan driver nor the macvtap driver are able to
connect it to the external NIC, then you can probably add a third
macvlan backend to handle that.

Arnd <><
Paul Congdon (UC Davis)
2009-08-07 21:06:58 UTC
Permalink
Yaron,


The interface multiplexing can be achieved using the macvlan driver or an SR-IOV capable NIC (the preferred option); macvlan may need to be extended to support VEPA multicast handling, which looks like a rather simple task.

Agreed that the hardware solution is preferred so the macvlan implementation doesn't really matter. If we are talking SR-IOV, then it is direct mapped, regardless of whether there is a VEB or VEPA in the hardware below, so you are bypassing the bridge software code also.

I disagree that adding the multicast handling is simple - while not conceptually hard, it will basically require you to put an address table into the macvlan implementation - if you have that, then why not have just used the one already in the bridge code. If you hook a VEPA up to a non-hairpin mode external bridge, you get the macvlan capability as well.

It also seems to me like the special macvlan interfaces for KVM don't apply to XEN or a non-virtualized environment? Or more has to be written to make that work? If it is in the bridge code, you get all of this re-use.
Stephen Hemminger
2009-08-07 21:36:52 UTC
Permalink
On Fri, 7 Aug 2009 14:06:58 -0700
Post by Fischer, Anna
Yaron,

The interface multiplexing can be achieved using the macvlan driver or an SR-IOV capable NIC (the preferred option); macvlan may need to be extended to support VEPA multicast handling, which looks like a rather simple task.

Agreed that the hardware solution is preferred so the macvlan implementation doesn't really matter. If we are talking SR-IOV, then it is direct mapped, regardless of whether there is a VEB or VEPA in the hardware below, so you are bypassing the bridge software code also.

I disagree that adding the multicast handling is simple - while not conceptually hard, it will basically require you to put an address table into the macvlan implementation - if you have that, then why not have just used the one already in the bridge code. If you hook a VEPA up to a non-hairpin mode external bridge, you get the macvlan capability as well.

I have a patch that forwards all multicast packets, and another that does
proper forwarding. It should have worked that way in original macvlan; the
current behavior is really a bug.


Or Gerlitz
2009-08-09 11:19:08 UTC
Permalink
I have a patch that forwards all multicast packets, and another that does proper forwarding. It should have worked that way in original macvlan, the current behavior is really a bug.
Looking in macvlan_set_multicast_list() it acts in a similar manner to
macvlan_set_mac_address() in the sense that it calls dev_mc_sync(). I
assume what's left is to add macvlan_hash_xxx multicast logic to
map/unmap multicast groups to what macvlan devices want to receive them
and this way the flooding can be removed, correct?
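The per-device check itself could probably stay simple: the addresses a
macvlan device wants are already on its own dev->mc_list, so a lookup
against that list (or a hash built from it, as you suggest) should be
enough. A sketch, with locking left out:

/* Sketch: does this macvlan device actually want the given multicast address? */
static bool macvlan_wants_mc(const struct macvlan_dev *vlan,
                             const unsigned char *addr)
{
        const struct dev_addr_list *da;

        for (da = vlan->dev->mc_list; da; da = da->next)
                if (!compare_ether_addr(da->da_addr, addr))
                        return true;
        return false;
}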


Or.


Stephen Hemminger
2009-08-10 15:20:37 UTC
Permalink
On Sun, 09 Aug 2009 14:19:08 +0300
Post by Or Gerlitz
I have a patch that forwards all multicast packets, and another that does proper forwarding. It should have worked that way in original macvlan, the current behavior is really a bug.
Looking in macvlan_set_multicast_list() it acts in a similar manner to
macvlan_set_mac_address() in the sense that it calls dev_mc_sync(). I
assume what's left is to add macvlan_hash_xxx multicast logic to
map/unmap multicast groups to what macvlan devices want to receive them
and this way the flooding can be removed, correct?
The device can just flood all multicast packets, since the filtering
is done on the receive path anyway.
Arnd Bergmann
2009-08-10 15:28:15 UTC
Permalink
Post by Stephen Hemminger
Post by Or Gerlitz
Looking in macvlan_set_multicast_list() it acts in a similar manner to
macvlan_set_mac_address() in the sense that it calls dev_mc_sync(). I
assume what's left is to add macvlan_hash_xxx multicast logic to
map/unmap multicast groups to what macvlan devices want to receive them
and this way the flooding can be removed, correct?
The device can just flood all multicast packets, since the filtering
is done on the receive path anyway.
But we'd still have to copy the frames to user space (for both
macvtap and raw packet sockets) and exit from the guest to inject
it into its stack, right?

I guess for multicast heavy workloads, we could save a lot of cycles
by throwing the frames away as early as possible. How common are those
setups in virtual servers though?

Arnd <><
Fischer, Anna
2009-08-10 16:32:01 UTC
Permalink
Post by Fischer, Anna
Subject: Re: [evb] RE: [PATCH][RFC] net/bridge: add basic VEPA support
On Sun, 09 Aug 2009 14:19:08 +0300, Or Gerlitz
Post by Or Gerlitz
Looking in macvlan_set_multicast_list() it acts in a similar manner to
macvlan_set_mac_address() in the sense that it calls dev_mc_sync(). I
assume what's left is to add macvlan_hash_xxx multicast logic to
map/unmap multicast groups to what macvlan devices want to receive them
and this way the flooding can be removed, correct?
The device can just flood all multicast packets, since the filtering
is done on the receive path anyway.
Is this handled by one of the additional patches? In the current kernel tree
macvlan code it looks as if multicast filtering is only handled by the
physical device driver, but not on particular macvlan devices.
Post by Fischer, Anna
But we'd still have to copy the frames to user space (for both
macvtap and raw packet sockets) and exit from the guest to inject
it into its stack, right?
I think it would be nice if you can implement what Or describes for
macvlan and avoid flooding, and it doesn't sound too hard to do.

I guess one advantage for macvlan (over the bridge) is that you can
program in all information you have for the ports attached to it, e.g.
MAC addresses and multicast addresses. So you could take advantage of
that whereas the bridge always floods multicast frames to all ports.

How would this work though, if the OS inside the guest wants to register
to a particular multicast address? Is this propagated through the backend
drivers to the macvlan/macvtap interface?

Anna

Stephen Hemminger
2009-08-10 16:51:18 UTC
Permalink
On Mon, 10 Aug 2009 16:32:01 +0000
Post by Fischer, Anna
Post by Fischer, Anna
Subject: Re: [evb] RE: [PATCH][RFC] net/bridge: add basic VEPA support
On Sun, 09 Aug 2009 14:19:08 +0300, Or Gerlitz
Post by Or Gerlitz
Looking in macvlan_set_multicast_list() it acts in a similar manner to
macvlan_set_mac_address() in the sense that it calls dev_mc_sync(). I
assume what's left is to add macvlan_hash_xxx multicast logic to
map/unmap multicast groups to what macvlan devices want to receive them
and this way the flooding can be removed, correct?
The device can just flood all multicast packets, since the filtering
is done on the receive path anyway.
Is this handled by one of the additional patches? In the current kernel tree
macvlan code it looks as if multicast filtering is only handled by the
physical device driver, but not on particular macvlan devices.
Post by Fischer, Anna
But we'd still have to copy the frames to user space (for both
macvtap and raw packet sockets) and exit from the guest to inject
it into its stack, right?
I think it would be nice if you can implement what Or describes for
macvlan and avoid flooding, and it doesn't sound too hard to do.
I guess one advantage for macvlan (over the bridge) is that you can
program in all information you have for the ports attached to it, e.g.
MAC addresses and multicast addresses. So you could take advantage of
that whereas the bridge always floods multicast frames to all ports.
How would this work though, if the OS inside the guest wants to register
to a particular multicast address? Is this propagated through the backend
drivers to the macvlan/macvtap interface?
Sure filtering is better, but multicast performance with large number
of guests is really a corner case, not the real performance issue.
Arnd Bergmann
2009-08-10 19:18:49 UTC
Permalink
Post by Stephen Hemminger
Post by Fischer, Anna
How would this work though, if the OS inside the guest wants to register
to a particular multicast address? Is this propagated through the backend
drivers to the macvlan/macvtap interface?
Sure filtering is better, but multicast performance with large number
of guests is really a corner case, not the real performance issue.
Well, right now, qemu does not care at all about this, it essentially
leaves the tun device in ALLMULTI state. I should check whether macvtap
at this stage can receive multicast frames at all, but if it does,
it will get them all ;-).

If we want to implement this with kvm, we would have to start with
the qemu virtio-net implementation, to move the receive filter into
the tap device. With tun/tap that will mean less copying to user
space, with macvtap (after implementing TUNSETTXFILTER) we get already
pretty far because we no longer need to have the external interface
in ALLMULTI state. Once that is in place, we can start thinking about
filtering per virtual device.
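As a data point, programming the filter from the qemu side should only take
something like the sketch below, using the existing struct tun_filter /
TUNSETTXFILTER interface from linux/if_tun.h; whether macvtap will accept
the same ioctl is exactly the open question above.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>

/* Sketch: tell the tap (or, later, macvtap) fd which destination MAC
 * addresses the guest wants, so the kernel can drop everything else
 * before copying it to user space. */
static int set_rx_filter(int fd, const unsigned char (*addrs)[ETH_ALEN],
                         unsigned int count)
{
        char buf[sizeof(struct tun_filter) + count * ETH_ALEN];
        struct tun_filter *filter = (struct tun_filter *)buf;

        filter->flags = 0;      /* or TUN_FLT_ALLMULTI to keep flooding multicast */
        filter->count = count;
        memcpy(filter->addr, addrs, count * ETH_ALEN);
        return ioctl(fd, TUNSETTXFILTER, filter);
}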

Arnd <><
Or Gerlitz
2009-08-27 12:35:55 UTC
Permalink
Looking in macvlan_set_multicast_list() it acts in a similar manner to macvlan_set_mac_address() in the sense that it calls dev_mc_sync(). I assume what's left is to add macvlan_hash_xxx multicast logic to map/unmap multicast groups to what macvlan devices want to receive them and this way the flooding can be removed, correct?
The device can just flood all multicast packets, since the filtering is done on the receive path anyway.
For each multicast packet, macvlan_broadcast() is invoked and calls
skb_clone()/netif_rx() for each device. A smarter scheme that takes into
account (hashes) the multicast list of the different macvlan devices
would save the skb_clone() call, wouldn't it?
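The loop would then look something like the sketch below, where
macvlan_wants_mc() stands in for whatever per-device lookup gets
implemented and the existing clone/netif_rx delivery is kept as-is, so the
skb_clone() is indeed skipped for devices with no subscribers:

static void macvlan_broadcast(struct sk_buff *skb, struct macvlan_port *port)
{
        const struct ethhdr *eth = eth_hdr(skb);
        struct macvlan_dev *vlan;
        struct sk_buff *nskb;

        list_for_each_entry(vlan, &port->vlans, list) {
                /* skip the skb_clone() entirely for uninterested devices */
                if (is_multicast_ether_addr(eth->h_dest) &&
                    !is_broadcast_ether_addr(eth->h_dest) &&
                    !macvlan_wants_mc(vlan, eth->h_dest))
                        continue;

                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        continue;

                nskb->dev = vlan->dev;
                nskb->pkt_type = is_broadcast_ether_addr(eth->h_dest) ?
                                 PACKET_BROADCAST : PACKET_MULTICAST;
                netif_rx(nskb);
        }
}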

Or.

Benny Amorsen
2009-08-08 08:50:10 UTC
Permalink
Post by Fischer, Anna
If you do have a SRIOV NIC that supports VEPA, then I would think that
you do not have QEMU or macvtap in the setup any more though. Simply
because in that case the VM can directly access the VF on the physical
device. That would be ideal.
I'm just trying to understand how this all works, so I'm probably asking
a stupid question:

Would a SRIOV NIC with VEPA support show up as multiple devices? I.e.
would I get e.g. eth0-eth7 for a NIC with support for 8 virtual
interfaces? Would they have different MAC addresses?


/Benny
Arnd Bergmann
2009-08-08 09:44:39 UTC
Permalink
Post by Benny Amorsen
Would a SRIOV NIC with VEPA support show up as multiple devices? I.e.
would I get e.g. eth0-eth7 for a NIC with support for 8 virtual
interfaces? Would they have different MAC addresses?
It could, but the idea of SR-IOV is that it shows up as 8 PCI
devices. One of them is owned by the host and is seen as eth0
there. The other seven PCI devices (virtual functions) are meant
to be assigned to the guest using PCI passthrough and will show
up as the guest's eth0, each one with its own MAC address.

Another mode of operation is VMDq, where the host owns all
interfaces and you might see eth0-eth7 there. You can then attach
a qemu process with a raw packet socket or a single macvtap port
for each of those interfaces. This is not yet implemented in Linux,
so how it will be done is still open. Alternatively, it might all be
integrated into macvlan or some new subsystem.

AFAIK, every SR-IOV adapter can also be operated as a VMDq adapter,
but there are VMDq adapters that do not support SR-IOV.

Arnd <><

Yaron Haviv
2009-08-07 20:35:34 UTC
Permalink
Paul,

I also think that the bridge may not be the right place for VEPA, but rather a simpler sw/hw mux,
although the VEPA support may reside in multiple places (i.e. also in the bridge).

As Arnd pointed out, Or already added an extension to qemu that allows direct guest virtual NIC mapping to an interface device (vs. using tap). This was done specifically to address VEPA, and results in much faster performance and lower CPU overhead (Or and some others are planning additional meaningful performance optimizations).

The interface multiplexing can be achieved using the macvlan driver or an SR-IOV capable NIC (the preferred option); macvlan may need to be extended to support VEPA multicast handling, which looks like a rather simple task.

It may be counter-intuitive for some, but we expect the (completed) qemu VEPA mode + SR-IOV + certain switches with hairpin (VEPA) mode to perform faster than using bridge+tap, even for connecting two VMs on the same host.


Yaron

Sent from BlackBerry
