Discussion:
[Bridge] [PATCH 0/3 v3] macvtap driver
Arnd Bergmann
2010-01-27 10:04:20 UTC
Permalink
This is the third version of the macvtap device driver, following another major restructuring and a lot of bug fixes:

* Change macvtap to be based around a struct sock
* macvtap: fix initialization
* return 0 to netlink
* don't use rcu for q->file and q->vlan pointers
* macvtap: checkpatch.pl fixes
* macvtap: fix tun IFF flags
* Use a struct socket to make tx flow control work
* disable BH processing during transmit
* only add an ethernet header for receive not forward
* allocate the SKB using GFP_NOWAIT since we're
in rcu_read_lock
* use atomic allocation for socket
* fix blocking on send
* do not destroy netdev twice in error path

There are still known problems, but unless there
are fundamental concerns, I'd like this to go
into net-next as an experimental driver,
fixing up the remaining problems by 2.6.34-rc1.

Arnd
Arnd Bergmann
2010-01-27 10:06:49 UTC
Permalink
This makes it possible to hook into the macvlan driver
from another kernel module. In particular, the goal is
to extend it with the macvtap backend that provides
a tun/tap compatible interface directly on the macvlan
device.

Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/macvlan.c | 113 +++++++++++++++++++-------------------------
include/linux/if_macvlan.h | 70 +++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 64 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index e0436fd..1517537 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -39,31 +39,6 @@ struct macvlan_port {
struct list_head vlans;
};

-/**
- * struct macvlan_rx_stats - MACVLAN percpu rx stats
- * @rx_packets: number of received packets
- * @rx_bytes: number of received bytes
- * @multicast: number of received multicast packets
- * @rx_errors: number of errors
- */
-struct macvlan_rx_stats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long multicast;
- unsigned long rx_errors;
-};
-
-struct macvlan_dev {
- struct net_device *dev;
- struct list_head list;
- struct hlist_node hlist;
- struct macvlan_port *port;
- struct net_device *lowerdev;
- struct macvlan_rx_stats *rx_stats;
- enum macvlan_mode mode;
-};
-
-
static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
const unsigned char *addr)
{
@@ -118,31 +93,17 @@ static int macvlan_addr_busy(const struct macvlan_port *port,
return 0;
}

-static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
- unsigned int len, bool success,
- bool multicast)
-{
- struct macvlan_rx_stats *rx_stats;
-
- rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
- if (likely(success)) {
- rx_stats->rx_packets++;;
- rx_stats->rx_bytes += len;
- if (multicast)
- rx_stats->multicast++;
- } else {
- rx_stats->rx_errors++;
- }
-}

-static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev,
+static int macvlan_broadcast_one(struct sk_buff *skb,
+ const struct macvlan_dev *vlan,
const struct ethhdr *eth, bool local)
{
+ struct net_device *dev = vlan->dev;
if (!skb)
return NET_RX_DROP;

if (local)
- return dev_forward_skb(dev, skb);
+ return vlan->forward(dev, skb);

skb->dev = dev;
if (!compare_ether_addr_64bits(eth->h_dest,
@@ -151,7 +112,7 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev,
else
skb->pkt_type = PACKET_MULTICAST;

- return netif_receive_skb(skb);
+ return vlan->receive(skb);
}

static void macvlan_broadcast(struct sk_buff *skb,
@@ -175,7 +136,7 @@ static void macvlan_broadcast(struct sk_buff *skb,
continue;

nskb = skb_clone(skb, GFP_ATOMIC);
- err = macvlan_broadcast_one(nskb, vlan->dev, eth,
+ err = macvlan_broadcast_one(nskb, vlan, eth,
mode == MACVLAN_MODE_BRIDGE);
macvlan_count_rx(vlan, skb->len + ETH_HLEN,
err == NET_RX_SUCCESS, 1);
@@ -238,7 +199,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
skb->dev = dev;
skb->pkt_type = PACKET_HOST;

- netif_receive_skb(skb);
+ vlan->receive(skb);
return NULL;
}

@@ -260,7 +221,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
dest = macvlan_hash_lookup(port, eth->h_dest);
if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
unsigned int length = skb->len + ETH_HLEN;
- int ret = dev_forward_skb(dest->dev, skb);
+ int ret = dest->forward(dest->dev, skb);
macvlan_count_rx(dest, length,
ret == NET_RX_SUCCESS, 0);

@@ -273,8 +234,8 @@ xmit_world:
return dev_queue_xmit(skb);
}

-static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
- struct net_device *dev)
+netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
int i = skb_get_queue_mapping(skb);
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
@@ -290,6 +251,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,

return ret;
}
+EXPORT_SYMBOL_GPL(macvlan_start_xmit);

static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type, const void *daddr,
@@ -623,8 +585,11 @@ static int macvlan_get_tx_queues(struct net *net,
return 0;
}

-static int macvlan_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ int (*receive)(struct sk_buff *skb),
+ int (*forward)(struct net_device *dev,
+ struct sk_buff *skb))
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port;
@@ -664,6 +629,8 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
vlan->lowerdev = lowerdev;
vlan->dev = dev;
vlan->port = port;
+ vlan->receive = receive;
+ vlan->forward = forward;

vlan->mode = MACVLAN_MODE_VEPA;
if (data && data[IFLA_MACVLAN_MODE])
@@ -677,8 +644,17 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
netif_stacked_transfer_operstate(lowerdev, dev);
return 0;
}
+EXPORT_SYMBOL_GPL(macvlan_common_newlink);

-static void macvlan_dellink(struct net_device *dev, struct list_head *head)
+static int macvlan_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ return macvlan_common_newlink(src_net, dev, tb, data,
+ netif_receive_skb,
+ dev_forward_skb);
+}
+
+void macvlan_dellink(struct net_device *dev, struct list_head *head)
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port = vlan->port;
@@ -689,6 +665,7 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head)
if (list_empty(&port->vlans))
macvlan_port_destroy(port->dev);
}
+EXPORT_SYMBOL_GPL(macvlan_dellink);

static int macvlan_changelink(struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
@@ -720,19 +697,27 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
[IFLA_MACVLAN_MODE] = { .type = NLA_U32 },
};

-static struct rtnl_link_ops macvlan_link_ops __read_mostly = {
+int macvlan_link_register(struct rtnl_link_ops *ops)
+{
+ /* common fields */
+ ops->priv_size = sizeof(struct macvlan_dev);
+ ops->get_tx_queues = macvlan_get_tx_queues;
+ ops->setup = macvlan_setup;
+ ops->validate = macvlan_validate;
+ ops->maxtype = IFLA_MACVLAN_MAX;
+ ops->policy = macvlan_policy;
+ ops->changelink = macvlan_changelink;
+ ops->get_size = macvlan_get_size;
+ ops->fill_info = macvlan_fill_info;
+
+ return rtnl_link_register(ops);
+};
+EXPORT_SYMBOL_GPL(macvlan_link_register);
+
+static struct rtnl_link_ops macvlan_link_ops = {
.kind = "macvlan",
- .priv_size = sizeof(struct macvlan_dev),
- .get_tx_queues = macvlan_get_tx_queues,
- .setup = macvlan_setup,
- .validate = macvlan_validate,
.newlink = macvlan_newlink,
.dellink = macvlan_dellink,
- .maxtype = IFLA_MACVLAN_MAX,
- .policy = macvlan_policy,
- .changelink = macvlan_changelink,
- .get_size = macvlan_get_size,
- .fill_info = macvlan_fill_info,
};

static int macvlan_device_event(struct notifier_block *unused,
@@ -761,7 +746,7 @@ static int macvlan_device_event(struct notifier_block *unused,
break;
case NETDEV_UNREGISTER:
list_for_each_entry_safe(vlan, next, &port->vlans, list)
- macvlan_dellink(vlan->dev, NULL);
+ vlan->dev->rtnl_link_ops->dellink(vlan->dev, NULL);
break;
}
return NOTIFY_DONE;
@@ -778,7 +763,7 @@ static int __init macvlan_init_module(void)
register_netdevice_notifier(&macvlan_notifier_block);
macvlan_handle_frame_hook = macvlan_handle_frame;

- err = rtnl_link_register(&macvlan_link_ops);
+ err = macvlan_link_register(&macvlan_link_ops);
if (err < 0)
goto err1;
return 0;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 5f200ba..9a11544 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -1,6 +1,76 @@
#ifndef _LINUX_IF_MACVLAN_H
#define _LINUX_IF_MACVLAN_H

+#include <linux/if_link.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/netlink.h>
+
+struct macvlan_port;
+struct macvtap_queue;
+
+/**
+ * struct macvlan_rx_stats - MACVLAN percpu rx stats
+ * @rx_packets: number of received packets
+ * @rx_bytes: number of received bytes
+ * @multicast: number of received multicast packets
+ * @rx_errors: number of errors
+ */
+struct macvlan_rx_stats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long multicast;
+ unsigned long rx_errors;
+};
+
+struct macvlan_dev {
+ struct net_device *dev;
+ struct list_head list;
+ struct hlist_node hlist;
+ struct macvlan_port *port;
+ struct net_device *lowerdev;
+ struct macvlan_rx_stats *rx_stats;
+ enum macvlan_mode mode;
+ int (*receive)(struct sk_buff *skb);
+ int (*forward)(struct net_device *dev, struct sk_buff *skb);
+};
+
+static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
+ unsigned int len, bool success,
+ bool multicast)
+{
+ struct macvlan_rx_stats *rx_stats;
+
+ rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
+ if (likely(success)) {
+ rx_stats->rx_packets++;;
+ rx_stats->rx_bytes += len;
+ if (multicast)
+ rx_stats->multicast++;
+ } else {
+ rx_stats->rx_errors++;
+ }
+}
+
+extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ int (*receive)(struct sk_buff *skb),
+ int (*forward)(struct net_device *dev,
+ struct sk_buff *skb));
+
+extern void macvlan_count_rx(const struct macvlan_dev *vlan,
+ unsigned int len, bool success,
+ bool multicast);
+
+extern void macvlan_dellink(struct net_device *dev, struct list_head *head);
+
+extern int macvlan_link_register(struct rtnl_link_ops *ops);
+
+extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+
+
extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *);

#endif /* _LINUX_IF_MACVLAN_H */
--
1.6.3.3
Arnd Bergmann
2010-01-27 10:05:15 UTC
Permalink
In the vlan and macvlan drivers, the start_xmit function forwards
data to the dev_queue_xmit function for another device, which may
potentially belong to a different namespace.

To make sure that classification stays within a single namespace,
this resets the potentially critical fields.

Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/macvlan.c | 2 +-
include/linux/netdevice.h | 9 +++++++++
net/8021q/vlan_dev.c | 2 +-
net/core/dev.c | 35 +++++++++++++++++++++++++++++++----
4 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index bad1303..e0436fd 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -269,7 +269,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
}

xmit_world:
- skb->dev = vlan->lowerdev;
+ skb_set_dev(skb, vlan->lowerdev);
return dev_queue_xmit(skb);
}

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 93a32a5..622ba5a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1004,6 +1004,15 @@ static inline bool netdev_uses_dsa_tags(struct net_device *dev)
return 0;
}

+#ifndef CONFIG_NET_NS
+static inline void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
+{
+ skb->dev = dev;
+}
+#else /* CONFIG_NET_NS */
+void skb_set_dev(struct sk_buff *skb, struct net_device *dev);
+#endif
+
static inline bool netdev_uses_trailer_tags(struct net_device *dev)
{
#ifdef CONFIG_NET_DSA_TAG_TRAILER
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 77a49ff..95034a8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -322,7 +322,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
}


- skb->dev = vlan_dev_info(dev)->real_dev;
+ skb_set_dev(skb, vlan_dev_info(dev)->real_dev);
len = skb->len;
ret = dev_queue_xmit(skb);

diff --git a/net/core/dev.c b/net/core/dev.c
index 2cba5c5..e80403a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1448,13 +1448,10 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
if (skb->len > (dev->mtu + dev->hard_header_len))
return NET_RX_DROP;

- skb_dst_drop(skb);
+ skb_set_dev(skb, dev);
skb->tstamp.tv64 = 0;
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, dev);
- skb->mark = 0;
- secpath_reset(skb);
- nf_reset(skb);
return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -1614,6 +1611,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
return false;
}

+/**
+ * skb_dev_set -- assign a buffer to a new device
+ * @skb: buffer for the new device
+ * @dev: network device
+ *
+ * If an skb is owned by a device already, we have to reset
+ * all data private to the namespace a device belongs to
+ * before assigning it a new device.
+ */
+#ifdef CONFIG_NET_NS
+void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
+{
+ skb_dst_drop(skb);
+ if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
+ secpath_reset(skb);
+ nf_reset(skb);
+ skb_init_secmark(skb);
+ skb->mark = 0;
+ skb->priority = 0;
+ skb->nf_trace = 0;
+ skb->ipvs_property = 0;
+#ifdef CONFIG_NET_SCHED
+ skb->tc_index = 0;
+#endif
+ }
+ skb->dev = dev;
+}
+EXPORT_SYMBOL(skb_set_dev);
+#endif /* CONFIG_NET_NS */
+
/*
* Invalidate hardware checksum when packet is to be mangled, and
* complete checksum manually on outgoing path.
--
1.6.3.3
David Miller
2010-01-29 05:33:44 UTC
Permalink
From: Arnd Bergmann <arnd at arndb.de>
Date: Wed, 27 Jan 2010 11:05:15 +0100
Post by Arnd Bergmann
+ * skb_dev_set -- assign a buffer to a new device
My english is terrible, but I think this should be
"assign a new device to a buffer".

If you agree, please fix this up when you resubmit patches #1 and #2
along with the fix you already plan to make to patch #3.

Thanks!
Arnd Bergmann
2010-01-29 10:12:05 UTC
Permalink
Post by David Miller
From: Arnd Bergmann <arnd at arndb.de>
Date: Wed, 27 Jan 2010 11:05:15 +0100
Post by Arnd Bergmann
+ * skb_dev_set -- assign a buffer to a new device
My english is terrible, but I think this should be
"assign a new device to a buffer".
Right, that seems clearer.
Post by David Miller
If you agree, please fix this up when you resubmit patches #1 and #2
along with the fix you already plan to make to patch #3.
Ok, will do.

Thanks,

Arnd
Arnd Bergmann
2010-01-27 10:06:49 UTC
Permalink
This makes it possible to hook into the macvlan driver
from another kernel module. In particular, the goal is
to extend it with the macvtap backend that provides
a tun/tap compatible interface directly on the macvlan
device.

Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/macvlan.c | 113 +++++++++++++++++++-------------------------
include/linux/if_macvlan.h | 70 +++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 64 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index e0436fd..1517537 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -39,31 +39,6 @@ struct macvlan_port {
struct list_head vlans;
};

-/**
- * struct macvlan_rx_stats - MACVLAN percpu rx stats
- * @rx_packets: number of received packets
- * @rx_bytes: number of received bytes
- * @multicast: number of received multicast packets
- * @rx_errors: number of errors
- */
-struct macvlan_rx_stats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long multicast;
- unsigned long rx_errors;
-};
-
-struct macvlan_dev {
- struct net_device *dev;
- struct list_head list;
- struct hlist_node hlist;
- struct macvlan_port *port;
- struct net_device *lowerdev;
- struct macvlan_rx_stats *rx_stats;
- enum macvlan_mode mode;
-};
-
-
static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
const unsigned char *addr)
{
@@ -118,31 +93,17 @@ static int macvlan_addr_busy(const struct macvlan_port *port,
return 0;
}

-static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
- unsigned int len, bool success,
- bool multicast)
-{
- struct macvlan_rx_stats *rx_stats;
-
- rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
- if (likely(success)) {
- rx_stats->rx_packets++;;
- rx_stats->rx_bytes += len;
- if (multicast)
- rx_stats->multicast++;
- } else {
- rx_stats->rx_errors++;
- }
-}

-static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev,
+static int macvlan_broadcast_one(struct sk_buff *skb,
+ const struct macvlan_dev *vlan,
const struct ethhdr *eth, bool local)
{
+ struct net_device *dev = vlan->dev;
if (!skb)
return NET_RX_DROP;

if (local)
- return dev_forward_skb(dev, skb);
+ return vlan->forward(dev, skb);

skb->dev = dev;
if (!compare_ether_addr_64bits(eth->h_dest,
@@ -151,7 +112,7 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev,
else
skb->pkt_type = PACKET_MULTICAST;

- return netif_receive_skb(skb);
+ return vlan->receive(skb);
}

static void macvlan_broadcast(struct sk_buff *skb,
@@ -175,7 +136,7 @@ static void macvlan_broadcast(struct sk_buff *skb,
continue;

nskb = skb_clone(skb, GFP_ATOMIC);
- err = macvlan_broadcast_one(nskb, vlan->dev, eth,
+ err = macvlan_broadcast_one(nskb, vlan, eth,
mode == MACVLAN_MODE_BRIDGE);
macvlan_count_rx(vlan, skb->len + ETH_HLEN,
err == NET_RX_SUCCESS, 1);
@@ -238,7 +199,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
skb->dev = dev;
skb->pkt_type = PACKET_HOST;

- netif_receive_skb(skb);
+ vlan->receive(skb);
return NULL;
}

@@ -260,7 +221,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
dest = macvlan_hash_lookup(port, eth->h_dest);
if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
unsigned int length = skb->len + ETH_HLEN;
- int ret = dev_forward_skb(dest->dev, skb);
+ int ret = dest->forward(dest->dev, skb);
macvlan_count_rx(dest, length,
ret == NET_RX_SUCCESS, 0);

@@ -273,8 +234,8 @@ xmit_world:
return dev_queue_xmit(skb);
}

-static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
- struct net_device *dev)
+netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
int i = skb_get_queue_mapping(skb);
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
@@ -290,6 +251,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,

return ret;
}
+EXPORT_SYMBOL_GPL(macvlan_start_xmit);

static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type, const void *daddr,
@@ -623,8 +585,11 @@ static int macvlan_get_tx_queues(struct net *net,
return 0;
}

-static int macvlan_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ int (*receive)(struct sk_buff *skb),
+ int (*forward)(struct net_device *dev,
+ struct sk_buff *skb))
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port;
@@ -664,6 +629,8 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
vlan->lowerdev = lowerdev;
vlan->dev = dev;
vlan->port = port;
+ vlan->receive = receive;
+ vlan->forward = forward;

vlan->mode = MACVLAN_MODE_VEPA;
if (data && data[IFLA_MACVLAN_MODE])
@@ -677,8 +644,17 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
netif_stacked_transfer_operstate(lowerdev, dev);
return 0;
}
+EXPORT_SYMBOL_GPL(macvlan_common_newlink);

-static void macvlan_dellink(struct net_device *dev, struct list_head *head)
+static int macvlan_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ return macvlan_common_newlink(src_net, dev, tb, data,
+ netif_receive_skb,
+ dev_forward_skb);
+}
+
+void macvlan_dellink(struct net_device *dev, struct list_head *head)
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port = vlan->port;
@@ -689,6 +665,7 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head)
if (list_empty(&port->vlans))
macvlan_port_destroy(port->dev);
}
+EXPORT_SYMBOL_GPL(macvlan_dellink);

static int macvlan_changelink(struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
@@ -720,19 +697,27 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
[IFLA_MACVLAN_MODE] = { .type = NLA_U32 },
};

-static struct rtnl_link_ops macvlan_link_ops __read_mostly = {
+int macvlan_link_register(struct rtnl_link_ops *ops)
+{
+ /* common fields */
+ ops->priv_size = sizeof(struct macvlan_dev);
+ ops->get_tx_queues = macvlan_get_tx_queues;
+ ops->setup = macvlan_setup;
+ ops->validate = macvlan_validate;
+ ops->maxtype = IFLA_MACVLAN_MAX;
+ ops->policy = macvlan_policy;
+ ops->changelink = macvlan_changelink;
+ ops->get_size = macvlan_get_size;
+ ops->fill_info = macvlan_fill_info;
+
+ return rtnl_link_register(ops);
+};
+EXPORT_SYMBOL_GPL(macvlan_link_register);
+
+static struct rtnl_link_ops macvlan_link_ops = {
.kind = "macvlan",
- .priv_size = sizeof(struct macvlan_dev),
- .get_tx_queues = macvlan_get_tx_queues,
- .setup = macvlan_setup,
- .validate = macvlan_validate,
.newlink = macvlan_newlink,
.dellink = macvlan_dellink,
- .maxtype = IFLA_MACVLAN_MAX,
- .policy = macvlan_policy,
- .changelink = macvlan_changelink,
- .get_size = macvlan_get_size,
- .fill_info = macvlan_fill_info,
};

static int macvlan_device_event(struct notifier_block *unused,
@@ -761,7 +746,7 @@ static int macvlan_device_event(struct notifier_block *unused,
break;
case NETDEV_UNREGISTER:
list_for_each_entry_safe(vlan, next, &port->vlans, list)
- macvlan_dellink(vlan->dev, NULL);
+ vlan->dev->rtnl_link_ops->dellink(vlan->dev, NULL);
break;
}
return NOTIFY_DONE;
@@ -778,7 +763,7 @@ static int __init macvlan_init_module(void)
register_netdevice_notifier(&macvlan_notifier_block);
macvlan_handle_frame_hook = macvlan_handle_frame;

- err = rtnl_link_register(&macvlan_link_ops);
+ err = macvlan_link_register(&macvlan_link_ops);
if (err < 0)
goto err1;
return 0;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 5f200ba..9a11544 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -1,6 +1,76 @@
#ifndef _LINUX_IF_MACVLAN_H
#define _LINUX_IF_MACVLAN_H

+#include <linux/if_link.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/netlink.h>
+
+struct macvlan_port;
+struct macvtap_queue;
+
+/**
+ * struct macvlan_rx_stats - MACVLAN percpu rx stats
+ * @rx_packets: number of received packets
+ * @rx_bytes: number of received bytes
+ * @multicast: number of received multicast packets
+ * @rx_errors: number of errors
+ */
+struct macvlan_rx_stats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long multicast;
+ unsigned long rx_errors;
+};
+
+struct macvlan_dev {
+ struct net_device *dev;
+ struct list_head list;
+ struct hlist_node hlist;
+ struct macvlan_port *port;
+ struct net_device *lowerdev;
+ struct macvlan_rx_stats *rx_stats;
+ enum macvlan_mode mode;
+ int (*receive)(struct sk_buff *skb);
+ int (*forward)(struct net_device *dev, struct sk_buff *skb);
+};
+
+static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
+ unsigned int len, bool success,
+ bool multicast)
+{
+ struct macvlan_rx_stats *rx_stats;
+
+ rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
+ if (likely(success)) {
+ rx_stats->rx_packets++;;
+ rx_stats->rx_bytes += len;
+ if (multicast)
+ rx_stats->multicast++;
+ } else {
+ rx_stats->rx_errors++;
+ }
+}
+
+extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ int (*receive)(struct sk_buff *skb),
+ int (*forward)(struct net_device *dev,
+ struct sk_buff *skb));
+
+extern void macvlan_count_rx(const struct macvlan_dev *vlan,
+ unsigned int len, bool success,
+ bool multicast);
+
+extern void macvlan_dellink(struct net_device *dev, struct list_head *head);
+
+extern int macvlan_link_register(struct rtnl_link_ops *ops);
+
+extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+
+
extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *);

#endif /* _LINUX_IF_MACVLAN_H */
--
1.6.3.3
Arnd Bergmann
2010-01-27 21:09:27 UTC
Permalink
In order to use macvlan with qemu and other tools that require
a tap file descriptor, the macvtap driver adds a small backend
with a character device with the same interface as the tun
driver, with a minimum set of features.

Macvtap interfaces are created in the same way as macvlan
interfaces using ip link, but the netif is just used as a
handle for configuration and accounting, while the data
goes through the chardev. Each macvtap interface has its
own character device, simplifying permission management
significantly over the generic tun/tap driver.

Cc: Patrick McHardy <kaber at trash.net>
Cc: Stephen Hemminger <shemminger at linux-foundation.org>
Cc: David S. Miller" <davem at davemloft.net>
Cc: "Michael S. Tsirkin" <mst at redhat.com>
Cc: Herbert Xu <herbert at gondor.apana.org.au>
Cc: Or Gerlitz <ogerlitz at voltaire.com>
Cc: netdev at vger.kernel.org
Cc: bridge at lists.linux-foundation.org
Cc: linux-kernel at vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/Kconfig | 12 +
drivers/net/Makefile | 1 +
drivers/net/macvtap.c | 572 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/if_macvlan.h | 1 +
4 files changed, 586 insertions(+), 0 deletions(-)
create mode 100644 drivers/net/macvtap.c

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index cb0e534..411e207 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -90,6 +90,18 @@ config MACVLAN
To compile this driver as a module, choose M here: the module
will be called macvlan.

+config MACVTAP
+ tristate "MAC-VLAN based tap driver (EXPERIMENTAL)"
+ depends on MACVLAN
+ help
+ This adds a specialized tap character device driver that is based
+ on the MAC-VLAN network interface, called macvtap. A macvtap device
+ can be added in the same way as a macvlan device, using 'type
+ macvlan', and then be accessed through the tap user space interface.
+
+ To compile this driver as a module, choose M here: the module
+ will be called macvtap.
+
config EQUALIZER
tristate "EQL (serial line load balancing) support"
---help---
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 0b763cb..9595803 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o
obj-$(CONFIG_MACVLAN) += macvlan.o
+obj-$(CONFIG_MACVTAP) += macvtap.o
obj-$(CONFIG_DE600) += de600.o
obj-$(CONFIG_DE620) += de620.o
obj-$(CONFIG_LANCE) += lance.o
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
new file mode 100644
index 0000000..2916202
--- /dev/null
+++ b/drivers/net/macvtap.c
@@ -0,0 +1,572 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+/*
+ * A macvtap queue is the central object of this driver, it connects
+ * an open character device to a macvlan interface. There can be
+ * multiple queues on one interface, which map back to queues
+ * implemented in hardware on the underlying device.
+ *
+ * macvtap_proto is used to allocate queues through the sock allocation
+ * mechanism.
+ *
+ * TODO: multiqueue support is currently not implemented, even though
+ * macvtap is basically prepared for that. We will need to add this
+ * here as well as in virtio-net and qemu to get line rate on 10gbit
+ * adapters from a guest.
+ */
+struct macvtap_queue {
+ struct sock sk;
+ struct socket sock;
+ struct macvlan_dev *vlan;
+ struct file *file;
+};
+
+static struct proto macvtap_proto = {
+ .name = "macvtap",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof (struct macvtap_queue),
+};
+
+/*
+ * Minor number matches netdev->ifindex, so need a potentially
+ * large value. This also makes it possible to split the
+ * tap functionality out again in the future by offering it
+ * from other drivers besides macvtap. As long as every device
+ * only has one tap, the interface numbers assure that the
+ * device nodes are unique.
+ */
+static unsigned int macvtap_major;
+#define MACVTAP_NUM_DEVS 65536
+static struct class *macvtap_class;
+static struct cdev macvtap_cdev;
+
+/*
+ * RCU usage:
+ * The macvtap_queue is referenced both from the chardev struct file
+ * and from the struct macvlan_dev using rcu_read_lock.
+ *
+ * We never actually update the contents of a macvtap_queue atomically
+ * with RCU but it is used for race-free destruction of a queue when
+ * either the file or the macvlan_dev goes away. Pointers back to
+ * the dev and the file are implicitly valid as long as the queue
+ * exists.
+ *
+ * The callbacks from macvlan are always done with rcu_read_lock held
+ * already, while in the file_operations, we get it ourselves.
+ *
+ * When destroying a queue, we remove the pointers from the file and
+ * from the dev and then synchronize_rcu to make sure no thread is
+ * still using the queue. There may still be references to the struct
+ * sock inside of the queue from outbound SKBs, but these never
+ * reference back to the file or the dev. The data structure is freed
+ * through __sk_free when both our references and any pending SKBs
+ * are gone.
+ *
+ * macvtap_lock is only used to prevent multiple concurrent open()
+ * calls to assign a new vlan->tap pointer. It could be moved into
+ * the macvlan_dev itself but is extremely rarely used.
+ */
+static DEFINE_SPINLOCK(macvtap_lock);
+
+/*
+ * Choose the next free queue, for now there is only one
+ */
+static int macvtap_set_queue(struct net_device *dev, struct file *file,
+ struct macvtap_queue *q)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+ int err = -EBUSY;
+
+ spin_lock(&macvtap_lock);
+ if (rcu_dereference(vlan->tap))
+ goto out;
+
+ err = 0;
+ q->vlan = vlan;
+ rcu_assign_pointer(vlan->tap, q);
+
+ q->file = file;
+ rcu_assign_pointer(file->private_data, q);
+
+out:
+ spin_unlock(&macvtap_lock);
+ return err;
+}
+
+/*
+ * We must destroy each queue exactly once, when either
+ * the netdev or the file go away.
+ *
+ * Using the spinlock makes sure that we don't get
+ * to the queue again after destroying it.
+ *
+ * synchronize_rcu serializes with the packet flow
+ * that uses rcu_read_lock.
+ */
+static void macvtap_del_queue(struct macvtap_queue **qp)
+{
+ struct macvtap_queue *q;
+
+ spin_lock(&macvtap_lock);
+ q = rcu_dereference(*qp);
+ if (!q) {
+ spin_unlock(&macvtap_lock);
+ return;
+ }
+
+ rcu_assign_pointer(q->vlan->tap, NULL);
+ rcu_assign_pointer(q->file->private_data, NULL);
+ spin_unlock(&macvtap_lock);
+
+ synchronize_rcu();
+ sock_put(&q->sk);
+}
+
+/*
+ * Since we only support one queue, just dereference the pointer.
+ */
+static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+
+ return rcu_dereference(vlan->tap);
+}
+
+static void macvtap_del_queues(struct net_device *dev)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+ macvtap_del_queue(&vlan->tap);
+}
+
+static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
+{
+ rcu_read_lock_bh();
+ return rcu_dereference(file->private_data);
+}
+
+static inline void macvtap_file_put_queue(void)
+{
+ rcu_read_unlock_bh();
+}
+
+/*
+ * Forward happens for data that gets sent from one macvlan
+ * endpoint to another one in bridge mode. We just take
+ * the skb and put it into the receive queue.
+ */
+static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
+{
+ struct macvtap_queue *q = macvtap_get_queue(dev, skb);
+ if (!q)
+ return -ENOLINK;
+
+ skb_queue_tail(&q->sk.sk_receive_queue, skb);
+ wake_up(q->sk.sk_sleep);
+ return 0;
+}
+
+/*
+ * Receive is for data from the external interface (lowerdev),
+ * in case of macvtap, we can treat that the same way as
+ * forward, which macvlan cannot.
+ */
+static int macvtap_receive(struct sk_buff *skb)
+{
+ skb_push(skb, ETH_HLEN);
+ return macvtap_forward(skb->dev, skb);
+}
+
+static int macvtap_newlink(struct net *src_net,
+ struct net_device *dev,
+ struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct device *classdev;
+ dev_t devt;
+ int err;
+
+ err = macvlan_common_newlink(src_net, dev, tb, data,
+ macvtap_receive, macvtap_forward);
+ if (err)
+ goto out;
+
+ devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+
+ classdev = device_create(macvtap_class, &dev->dev, devt,
+ dev, "tap%d", dev->ifindex);
+ if (IS_ERR(classdev)) {
+ err = PTR_ERR(classdev);
+ macvtap_del_queues(dev);
+ }
+
+out:
+ return err;
+}
+
+static void macvtap_dellink(struct net_device *dev,
+ struct list_head *head)
+{
+ device_destroy(macvtap_class,
+ MKDEV(MAJOR(macvtap_major), dev->ifindex));
+
+ macvtap_del_queues(dev);
+ macvlan_dellink(dev, head);
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+ .kind = "macvtap",
+ .newlink = macvtap_newlink,
+ .dellink = macvtap_dellink,
+};
+
+
+static void macvtap_sock_write_space(struct sock *sk)
+{
+ if (!sock_writeable(sk) ||
+ !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
+ return;
+
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible_sync(sk->sk_sleep);
+}
+
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = dev_get_by_index(net, iminor(inode));
+ struct macvtap_queue *q;
+ int err;
+
+ err = -ENODEV;
+ if (!dev)
+ goto out;
+
+ /* check if this is a macvtap device */
+ err = -EINVAL;
+ if (dev->rtnl_link_ops != &macvtap_link_ops)
+ goto out;
+
+ err = -ENOMEM;
+ q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+ &macvtap_proto);
+ if (!q)
+ goto out;
+
+ init_waitqueue_head(&q->sock.wait);
+ q->sock.type = SOCK_RAW;
+ q->sock.state = SS_CONNECTED;
+ sock_init_data(&q->sock, &q->sk);
+ q->sk.sk_allocation = GFP_ATOMIC; /* for now */
+ q->sk.sk_write_space = macvtap_sock_write_space;
+
+ err = macvtap_set_queue(dev, file, q);
+ if (err)
+ sock_put(&q->sk);
+
+out:
+ return err;
+}
+
+static int macvtap_release(struct inode *inode, struct file *file)
+{
+ macvtap_del_queue((struct macvtap_queue **)&file->private_data);
+ return 0;
+}
+
+static unsigned int macvtap_poll(struct file *file, poll_table * wait)
+{
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+ unsigned int mask = POLLERR;
+
+ if (!q)
+ goto out;
+
+ mask = 0;
+ poll_wait(file, &q->sock.wait, wait);
+
+ if (!skb_queue_empty(&q->sk.sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sock_writeable(&q->sk) ||
+ (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
+ sock_writeable(&q->sk)))
+ mask |= POLLOUT | POLLWRNORM;
+
+out:
+ macvtap_file_put_queue();
+ return mask;
+}
+
+/* Get packet from user space buffer */
+static ssize_t macvtap_get_user(struct macvtap_queue *q,
+ const struct iovec *iv, size_t count,
+ int noblock)
+{
+ struct sk_buff *skb;
+ size_t len = count;
+ int err;
+
+ if (unlikely(len < ETH_HLEN))
+ return -EINVAL;
+
+ skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+
+ if (!skb) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ return err;
+ }
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, count);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ macvlan_start_xmit(skb, q->vlan->dev);
+
+ return count;
+}
+
+static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t result = -ENOLINK;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ if (!q)
+ goto out;
+
+ result = macvtap_get_user(q, iv, iov_length(iv, count),
+ file->f_flags & O_NONBLOCK);
+out:
+ macvtap_file_put_queue();
+ return result;
+}
+
+/* Put packet to the user space buffer */
+static ssize_t macvtap_put_user(struct macvtap_queue *q,
+ const struct sk_buff *skb,
+ const struct iovec *iv, int len)
+{
+ struct macvlan_dev *vlan = q->vlan;
+ int ret;
+
+ len = min_t(int, skb->len, len);
+
+ ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+
+ macvlan_count_rx(vlan, len, ret == 0, 0);
+
+ return ret ? ret : len;
+}
+
+static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb;
+ ssize_t len, ret = 0;
+
+ if (!q) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ len = iov_length(iv, count);
+ if (len < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ add_wait_queue(q->sk.sk_sleep, &wait);
+ while (len) {
+ current->state = TASK_INTERRUPTIBLE;
+
+ /* Read frames from the queue */
+ skb = skb_dequeue(&q->sk.sk_receive_queue);
+ if (!skb) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ /* Nothing to read, let's sleep */
+ schedule();
+ continue;
+ }
+ ret = macvtap_put_user(q, skb, iv, len);
+ kfree_skb(skb);
+ break;
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(q->sk.sk_sleep, &wait);
+
+out:
+ macvtap_file_put_queue();
+ return ret;
+}
+
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct macvtap_queue *q;
+ void __user *argp = (void __user *)arg;
+ struct ifreq __user *ifr = argp;
+ unsigned int __user *up = argp;
+ unsigned int u;
+ int err;
+
+ switch (cmd) {
+ case TUNSETIFF:
+ /* ignore the name, just look at flags */
+ if (get_user(u, &ifr->ifr_flags))
+ return -EFAULT;
+ if (u != (IFF_TAP | IFF_NO_PI))
+ return -EINVAL;
+ return 0;
+
+ case TUNGETIFF:
+ q = macvtap_file_get_queue(file);
+ if (!q)
+ return -ENOLINK;
+ err = 0;
+ if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
+ put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
+ err = -EFAULT;
+ macvtap_file_put_queue();
+ return err;
+
+ case TUNGETFEATURES:
+ if (put_user((IFF_TAP | IFF_NO_PI), up))
+ return -EFAULT;
+ return 0;
+
+ case TUNSETSNDBUF:
+ /* ignore */
+ return 0;
+
+ case TUNSETOFFLOAD:
+ /* let the user check for future flags */
+ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO))
+ return -EINVAL;
+
+ /* TODO: add support for these, so far we don't
+ support any offload */
+ if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO))
+ return -EINVAL;
+
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations macvtap_fops = {
+ .owner = THIS_MODULE,
+ .open = macvtap_open,
+ .release = macvtap_release,
+ .aio_read = macvtap_aio_read,
+ .aio_write = macvtap_aio_write,
+ .poll = macvtap_poll,
+ .llseek = no_llseek,
+ .unlocked_ioctl = macvtap_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = macvtap_compat_ioctl,
+#endif
+};
+
+static int macvtap_init(void)
+{
+ int err;
+
+ err = alloc_chrdev_region(&macvtap_major, 0,
+ MACVTAP_NUM_DEVS, "macvtap");
+ if (err)
+ goto out1;
+
+ cdev_init(&macvtap_cdev, &macvtap_fops);
+ err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
+ if (err)
+ goto out2;
+
+ macvtap_class = class_create(THIS_MODULE, "macvtap");
+ if (IS_ERR(macvtap_class)) {
+ err = PTR_ERR(macvtap_class);
+ goto out3;
+ }
+
+ err = macvlan_link_register(&macvtap_link_ops);
+ if (err)
+ goto out4;
+
+ return 0;
+
+out4:
+ class_unregister(macvtap_class);
+out3:
+ cdev_del(&macvtap_cdev);
+out2:
+ unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+out1:
+ return err;
+}
+module_init(macvtap_init);
+
+static void macvtap_exit(void)
+{
+ rtnl_link_unregister(&macvtap_link_ops);
+ class_unregister(macvtap_class);
+ cdev_del(&macvtap_cdev);
+ unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+}
+module_exit(macvtap_exit);
+
+MODULE_ALIAS_RTNL_LINK("macvtap");
+MODULE_AUTHOR("Arnd Bergmann <arnd at arndb.de>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 9a11544..51f1512 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -34,6 +34,7 @@ struct macvlan_dev {
enum macvlan_mode mode;
int (*receive)(struct sk_buff *skb);
int (*forward)(struct net_device *dev, struct sk_buff *skb);
+ struct macvtap_queue *tap;
};

static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
--
1.6.3.3
Michael S. Tsirkin
2010-01-28 17:34:38 UTC
Permalink
Post by Arnd Bergmann
In order to use macvlan with qemu and other tools that require
a tap file descriptor, the macvtap driver adds a small backend
with a character device with the same interface as the tun
driver, with a minimum set of features.
Macvtap interfaces are created in the same way as macvlan
interfaces using ip link, but the netif is just used as a
handle for configuration and accounting, while the data
goes through the chardev. Each macvtap interface has its
own character device, simplifying permission management
significantly over the generic tun/tap driver.
Cc: Patrick McHardy <kaber at trash.net>
Cc: Stephen Hemminger <shemminger at linux-foundation.org>
Cc: David S. Miller" <davem at davemloft.net>
Cc: "Michael S. Tsirkin" <mst at redhat.com>
Cc: Herbert Xu <herbert at gondor.apana.org.au>
Cc: Or Gerlitz <ogerlitz at voltaire.com>
Cc: netdev at vger.kernel.org
Cc: bridge at lists.linux-foundation.org
Cc: linux-kernel at vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/Kconfig | 12 +
drivers/net/Makefile | 1 +
drivers/net/macvtap.c | 572 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/if_macvlan.h | 1 +
4 files changed, 586 insertions(+), 0 deletions(-)
create mode 100644 drivers/net/macvtap.c
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index cb0e534..411e207 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -90,6 +90,18 @@ config MACVLAN
To compile this driver as a module, choose M here: the module
will be called macvlan.
+config MACVTAP
+ tristate "MAC-VLAN based tap driver (EXPERIMENTAL)"
+ depends on MACVLAN
+ help
+ This adds a specialized tap character device driver that is based
+ on the MAC-VLAN network interface, called macvtap. A macvtap device
+ can be added in the same way as a macvlan device, using 'type
+ macvlan', and then be accessed through the tap user space interface.
+
+ To compile this driver as a module, choose M here: the module
+ will be called macvtap.
+
config EQUALIZER
tristate "EQL (serial line load balancing) support"
---help---
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 0b763cb..9595803 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o
obj-$(CONFIG_MACVLAN) += macvlan.o
+obj-$(CONFIG_MACVTAP) += macvtap.o
obj-$(CONFIG_DE600) += de600.o
obj-$(CONFIG_DE620) += de620.o
obj-$(CONFIG_LANCE) += lance.o
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
new file mode 100644
index 0000000..2916202
--- /dev/null
+++ b/drivers/net/macvtap.c
@@ -0,0 +1,572 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+/*
+ * A macvtap queue is the central object of this driver, it connects
+ * an open character device to a macvlan interface. There can be
+ * multiple queues on one interface, which map back to queues
+ * implemented in hardware on the underlying device.
+ *
+ * macvtap_proto is used to allocate queues through the sock allocation
+ * mechanism.
+ *
+ * TODO: multiqueue support is currently not implemented, even though
+ * macvtap is basically prepared for that. We will need to add this
+ * here as well as in virtio-net and qemu to get line rate on 10gbit
+ * adapters from a guest.
+ */
+struct macvtap_queue {
+ struct sock sk;
+ struct socket sock;
+ struct macvlan_dev *vlan;
+ struct file *file;
+};
+
+static struct proto macvtap_proto = {
+ .name = "macvtap",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof (struct macvtap_queue),
+};
+
+/*
+ * Minor number matches netdev->ifindex, so need a potentially
+ * large value. This also makes it possible to split the
+ * tap functionality out again in the future by offering it
+ * from other drivers besides macvtap. As long as every device
+ * only has one tap, the interface numbers assure that the
+ * device nodes are unique.
+ */
+static unsigned int macvtap_major;
+#define MACVTAP_NUM_DEVS 65536
+static struct class *macvtap_class;
+static struct cdev macvtap_cdev;
+
+/*
+ * The macvtap_queue is referenced both from the chardev struct file
+ * and from the struct macvlan_dev using rcu_read_lock.
+ *
+ * We never actually update the contents of a macvtap_queue atomically
+ * with RCU but it is used for race-free destruction of a queue when
+ * either the file or the macvlan_dev goes away. Pointers back to
+ * the dev and the file are implicitly valid as long as the queue
+ * exists.
+ *
+ * The callbacks from macvlan are always done with rcu_read_lock held
+ * already, while in the file_operations, we get it ourselves.
+ *
+ * When destroying a queue, we remove the pointers from the file and
+ * from the dev and then synchronize_rcu to make sure no thread is
+ * still using the queue. There may still be references to the struct
+ * sock inside of the queue from outbound SKBs, but these never
+ * reference back to the file or the dev. The data structure is freed
+ * through __sk_free when both our references and any pending SKBs
+ * are gone.
+ *
+ * macvtap_lock is only used to prevent multiple concurrent open()
+ * calls to assign a new vlan->tap pointer. It could be moved into
+ * the macvlan_dev itself but is extremely rarely used.
+ */
+static DEFINE_SPINLOCK(macvtap_lock);
+
+/*
+ * Choose the next free queue, for now there is only one
+ */
+static int macvtap_set_queue(struct net_device *dev, struct file *file,
+ struct macvtap_queue *q)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+ int err = -EBUSY;
+
+ spin_lock(&macvtap_lock);
+ if (rcu_dereference(vlan->tap))
+ goto out;
+
+ err = 0;
+ q->vlan = vlan;
+ rcu_assign_pointer(vlan->tap, q);
+
+ q->file = file;
+ rcu_assign_pointer(file->private_data, q);
+
+ spin_unlock(&macvtap_lock);
+ return err;
+}
+
+/*
+ * We must destroy each queue exactly once, when either
+ * the netdev or the file go away.
+ *
+ * Using the spinlock makes sure that we don't get
+ * to the queue again after destroying it.
+ *
+ * synchronize_rcu serializes with the packet flow
+ * that uses rcu_read_lock.
+ */
+static void macvtap_del_queue(struct macvtap_queue **qp)
+{
+ struct macvtap_queue *q;
+
+ spin_lock(&macvtap_lock);
+ q = rcu_dereference(*qp);
+ if (!q) {
+ spin_unlock(&macvtap_lock);
+ return;
+ }
+
+ rcu_assign_pointer(q->vlan->tap, NULL);
+ rcu_assign_pointer(q->file->private_data, NULL);
+ spin_unlock(&macvtap_lock);
+
+ synchronize_rcu();
+ sock_put(&q->sk);
+}
+
+/*
+ * Since we only support one queue, just dereference the pointer.
+ */
+static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+
+ return rcu_dereference(vlan->tap);
+}
+
+static void macvtap_del_queues(struct net_device *dev)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+ macvtap_del_queue(&vlan->tap);
+}
+
+static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
+{
+ rcu_read_lock_bh();
+ return rcu_dereference(file->private_data);
+}
+
+static inline void macvtap_file_put_queue(void)
+{
+ rcu_read_unlock_bh();
+}
+
I find such wrappers around rcu obscure this,
already sufficiently complex, pattern.
Might be just me.
Post by Arnd Bergmann
+/*
+ * Forward happens for data that gets sent from one macvlan
+ * endpoint to another one in bridge mode. We just take
+ * the skb and put it into the receive queue.
+ */
+static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
+{
+ struct macvtap_queue *q = macvtap_get_queue(dev, skb);
+ if (!q)
+ return -ENOLINK;
+
+ skb_queue_tail(&q->sk.sk_receive_queue, skb);
+ wake_up(q->sk.sk_sleep);
+ return 0;
+}
+
+/*
+ * Receive is for data from the external interface (lowerdev),
+ * in case of macvtap, we can treat that the same way as
+ * forward, which macvlan cannot.
+ */
+static int macvtap_receive(struct sk_buff *skb)
+{
+ skb_push(skb, ETH_HLEN);
+ return macvtap_forward(skb->dev, skb);
+}
+
+static int macvtap_newlink(struct net *src_net,
+ struct net_device *dev,
+ struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct device *classdev;
+ dev_t devt;
+ int err;
+
+ err = macvlan_common_newlink(src_net, dev, tb, data,
+ macvtap_receive, macvtap_forward);
+ if (err)
+ goto out;
+
+ devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+
+ classdev = device_create(macvtap_class, &dev->dev, devt,
+ dev, "tap%d", dev->ifindex);
+ if (IS_ERR(classdev)) {
+ err = PTR_ERR(classdev);
+ macvtap_del_queues(dev);
+ }
+
+ return err;
+}
+
+static void macvtap_dellink(struct net_device *dev,
+ struct list_head *head)
+{
+ device_destroy(macvtap_class,
+ MKDEV(MAJOR(macvtap_major), dev->ifindex));
+
+ macvtap_del_queues(dev);
+ macvlan_dellink(dev, head);
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+ .kind = "macvtap",
+ .newlink = macvtap_newlink,
+ .dellink = macvtap_dellink,
+};
+
+
+static void macvtap_sock_write_space(struct sock *sk)
+{
+ if (!sock_writeable(sk) ||
+ !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
+ return;
+
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible_sync(sk->sk_sleep);
+}
+
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = dev_get_by_index(net, iminor(inode));
+ struct macvtap_queue *q;
+ int err;
+
This seems to keep reference to device as long as character device is
open, which, if I understand correctly, will start printing error
messages to kernel log about once a second if you try to remove the
device.

I suspect the best way to fix this issue would be to use some kind
of notifier so that macvtap can disconnect on device removal.
Post by Arnd Bergmann
+ err = -ENODEV;
+ if (!dev)
+ goto out;
+
+ /* check if this is a macvtap device */
+ err = -EINVAL;
+ if (dev->rtnl_link_ops != &macvtap_link_ops)
+ goto out;
+
+ err = -ENOMEM;
+ q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+ &macvtap_proto);
+ if (!q)
+ goto out;
+
+ init_waitqueue_head(&q->sock.wait);
+ q->sock.type = SOCK_RAW;
+ q->sock.state = SS_CONNECTED;
+ sock_init_data(&q->sock, &q->sk);
+ q->sk.sk_allocation = GFP_ATOMIC; /* for now */
+ q->sk.sk_write_space = macvtap_sock_write_space;
+
+ err = macvtap_set_queue(dev, file, q);
+ if (err)
+ sock_put(&q->sk);
+
+ return err;
+}
+
+static int macvtap_release(struct inode *inode, struct file *file)
+{
+ macvtap_del_queue((struct macvtap_queue **)&file->private_data);
+ return 0;
+}
+
+static unsigned int macvtap_poll(struct file *file, poll_table * wait)
+{
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+ unsigned int mask = POLLERR;
+
+ if (!q)
+ goto out;
+
+ mask = 0;
+ poll_wait(file, &q->sock.wait, wait);
+
+ if (!skb_queue_empty(&q->sk.sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sock_writeable(&q->sk) ||
+ (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
+ sock_writeable(&q->sk)))
+ mask |= POLLOUT | POLLWRNORM;
+
+ macvtap_file_put_queue();
+ return mask;
+}
+
+/* Get packet from user space buffer */
+static ssize_t macvtap_get_user(struct macvtap_queue *q,
+ const struct iovec *iv, size_t count,
+ int noblock)
+{
+ struct sk_buff *skb;
+ size_t len = count;
+ int err;
+
+ if (unlikely(len < ETH_HLEN))
+ return -EINVAL;
+
+ skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+
+ if (!skb) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ return err;
+ }
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, count);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ macvlan_start_xmit(skb, q->vlan->dev);
+
+ return count;
+}
+
I am surprised there's no GSO support. Would it be a good idea to share
code with tun driver? That already has GSO ...
Also, network header pointer seems off for vlan packets?
Post by Arnd Bergmann
+static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t result = -ENOLINK;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ if (!q)
+ goto out;
+
+ result = macvtap_get_user(q, iv, iov_length(iv, count),
+ file->f_flags & O_NONBLOCK);
+ macvtap_file_put_queue();
+ return result;
+}
+
+/* Put packet to the user space buffer */
+static ssize_t macvtap_put_user(struct macvtap_queue *q,
+ const struct sk_buff *skb,
+ const struct iovec *iv, int len)
+{
+ struct macvlan_dev *vlan = q->vlan;
+ int ret;
+
+ len = min_t(int, skb->len, len);
+
+ ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+
+ macvlan_count_rx(vlan, len, ret == 0, 0);
+
+ return ret ? ret : len;
+}
+
+static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb;
+ ssize_t len, ret = 0;
+
+ if (!q) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ len = iov_length(iv, count);
+ if (len < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ add_wait_queue(q->sk.sk_sleep, &wait);
+ while (len) {
+ current->state = TASK_INTERRUPTIBLE;
+
+ /* Read frames from the queue */
+ skb = skb_dequeue(&q->sk.sk_receive_queue);
+ if (!skb) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ /* Nothing to read, let's sleep */
+ schedule();
+ continue;
+ }
+ ret = macvtap_put_user(q, skb, iv, len);
+ kfree_skb(skb);
+ break;
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(q->sk.sk_sleep, &wait);
+
+ macvtap_file_put_queue();
+ return ret;
+}
+
Same GSO comment here.
Post by Arnd Bergmann
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
All of these seem to be stubs, and tun has many more that you didn't
stub out. So, why do you bother to support any ioctls at all?
Post by Arnd Bergmann
+ struct macvtap_queue *q;
+ void __user *argp = (void __user *)arg;
+ struct ifreq __user *ifr = argp;
+ unsigned int __user *up = argp;
+ unsigned int u;
+ int err;
+
+ switch (cmd) {
+ /* ignore the name, just look at flags */
+ if (get_user(u, &ifr->ifr_flags))
+ return -EFAULT;
+ if (u != (IFF_TAP | IFF_NO_PI))
+ return -EINVAL;
+ return 0;
+
+ q = macvtap_file_get_queue(file);
+ if (!q)
+ return -ENOLINK;
+ err = 0;
+ if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
+ put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
+ err = -EFAULT;
+ macvtap_file_put_queue();
+ return err;
+
+ if (put_user((IFF_TAP | IFF_NO_PI), up))
+ return -EFAULT;
+ return 0;
+
+ /* ignore */
+ return 0;
+
+ /* let the user check for future flags */
+ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO))
+ return -EINVAL;
+
+ /* TODO: add support for these, so far we don't
+ support any offload */
+ if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO))
+ return -EINVAL;
+
+ return 0;
+
+ return -EINVAL;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations macvtap_fops = {
+ .owner = THIS_MODULE,
+ .open = macvtap_open,
+ .release = macvtap_release,
+ .aio_read = macvtap_aio_read,
+ .aio_write = macvtap_aio_write,
+ .poll = macvtap_poll,
+ .llseek = no_llseek,
+ .unlocked_ioctl = macvtap_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = macvtap_compat_ioctl,
+#endif
+};
+
+static int macvtap_init(void)
+{
+ int err;
+
+ err = alloc_chrdev_region(&macvtap_major, 0,
+ MACVTAP_NUM_DEVS, "macvtap");
+ if (err)
+ goto out1;
+
+ cdev_init(&macvtap_cdev, &macvtap_fops);
+ err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
+ if (err)
+ goto out2;
+
+ macvtap_class = class_create(THIS_MODULE, "macvtap");
+ if (IS_ERR(macvtap_class)) {
+ err = PTR_ERR(macvtap_class);
+ goto out3;
+ }
+
+ err = macvlan_link_register(&macvtap_link_ops);
+ if (err)
+ goto out4;
+
+ return 0;
+
+ class_unregister(macvtap_class);
+ cdev_del(&macvtap_cdev);
+ unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+ return err;
+}
+module_init(macvtap_init);
+
+static void macvtap_exit(void)
+{
+ rtnl_link_unregister(&macvtap_link_ops);
+ class_unregister(macvtap_class);
+ cdev_del(&macvtap_cdev);
+ unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+}
+module_exit(macvtap_exit);
+
+MODULE_ALIAS_RTNL_LINK("macvtap");
+MODULE_AUTHOR("Arnd Bergmann <arnd at arndb.de>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 9a11544..51f1512 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -34,6 +34,7 @@ struct macvlan_dev {
enum macvlan_mode mode;
int (*receive)(struct sk_buff *skb);
int (*forward)(struct net_device *dev, struct sk_buff *skb);
+ struct macvtap_queue *tap;
};
static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
--
1.6.3.3
Arnd Bergmann
2010-01-28 20:18:08 UTC
Permalink
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
+{
+ rcu_read_lock_bh();
+ return rcu_dereference(file->private_data);
+}
+
+static inline void macvtap_file_put_queue(void)
+{
+ rcu_read_unlock_bh();
+}
+
I find such wrappers around rcu obscure this,
already sufficiently complex, pattern.
Might be just me.
Obviously I find them useful here, but if more people feel the
same as you, I'll just open-code them.
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = dev_get_by_index(net, iminor(inode));
+ struct macvtap_queue *q;
+ int err;
+
This seems to keep reference to device as long as character device is
open, which, if I understand correctly, will start printing error
messages to kernel log about once a second if you try to remove the
device.
I suspect the best way to fix this issue would be to use some kind
of notifier so that macvtap can disconnect on device removal.
I think I'm just missing the put in the open function, the code
already handles the netif and the file disappearing independently.

Thanks for spotting this one, I'll fix that in the next post.
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, count);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ macvlan_start_xmit(skb, q->vlan->dev);
+
+ return count;
+}
+
I am surprised there's no GSO support. Would it be a good idea to share
code with tun driver? That already has GSO ...
The driver still only does the minimum feature set to get things going.
GSO is an obvious extension, but I wanted the code to be as simple
as possible to find all the basic bugs before we do anything fancy.
Post by Michael S. Tsirkin
Also, network header pointer seems off for vlan packets?
That may well be, I haven't tried vlan. What do you think it should do
then?
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
All of these seem to be stubs, and tun has many more that you didn't
stub out. So, why do you bother to support any ioctls at all?
Again, minimum features to get things going. qemu fails to open
the device if these ioctls are not implemented, but any of the
more advanced features are left out.

Thansk for the review,

Arnd
Michael S. Tsirkin
2010-01-29 11:21:41 UTC
Permalink
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
+{
+ rcu_read_lock_bh();
+ return rcu_dereference(file->private_data);
+}
+
+static inline void macvtap_file_put_queue(void)
+{
+ rcu_read_unlock_bh();
+}
+
I find such wrappers around rcu obscure this,
already sufficiently complex, pattern.
Might be just me.
Obviously I find them useful here, but if more people feel the
same as you, I'll just open-code them.
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = dev_get_by_index(net, iminor(inode));
+ struct macvtap_queue *q;
+ int err;
+
This seems to keep reference to device as long as character device is
open, which, if I understand correctly, will start printing error
messages to kernel log about once a second if you try to remove the
device.
I suspect the best way to fix this issue would be to use some kind
of notifier so that macvtap can disconnect on device removal.
I think I'm just missing the put in the open function, the code
already handles the netif and the file disappearing independently.
Thanks for spotting this one, I'll fix that in the next post.
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, count);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ macvlan_start_xmit(skb, q->vlan->dev);
+
+ return count;
+}
+
I am surprised there's no GSO support. Would it be a good idea to share
code with tun driver? That already has GSO ...
The driver still only does the minimum feature set to get things going.
GSO is an obvious extension, but I wanted the code to be as simple
as possible to find all the basic bugs before we do anything fancy.
Post by Michael S. Tsirkin
Also, network header pointer seems off for vlan packets?
That may well be, I haven't tried vlan. What do you think it should do
then?
Look at eth_type for a more complete packet parsing.
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
All of these seem to be stubs, and tun has many more that you didn't
stub out. So, why do you bother to support any ioctls at all?
Again, minimum features to get things going. qemu fails to open
the device if these ioctls are not implemented, but any of the
more advanced features are left out.
This is strange, could be application bug. E.g. send buf size is
relatively new and apps should handle failure gracefully. IMO,
returning success and ignoring the value is not a good idea. How about
we just fix qemu? What about other apps?
Post by Arnd Bergmann
Thansk for the review,
Arnd
Arnd Bergmann
2010-01-29 19:49:52 UTC
Permalink
Post by Michael S. Tsirkin
Post by Arnd Bergmann
That may well be, I haven't tried vlan. What do you think it should do
then?
Look at eth_type for a more complete packet parsing.
ok. I initially called that but it crashed because the skb was not initialized
properly at that point. I'll have a look.
Post by Michael S. Tsirkin
Post by Arnd Bergmann
Post by Michael S. Tsirkin
Post by Arnd Bergmann
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
All of these seem to be stubs, and tun has many more that you didn't
stub out. So, why do you bother to support any ioctls at all?
Again, minimum features to get things going. qemu fails to open
the device if these ioctls are not implemented, but any of the
more advanced features are left out.
This is strange, could be application bug. E.g. send buf size is
relatively new and apps should handle failure gracefully. IMO,
returning success and ignoring the value is not a good idea. How about
we just fix qemu? What about other apps?
Ok, I'll go through the ioctls again and make sure they behave correctly
they way you said. I haven't tried against against anything but qemu and
cat.

Arnd
Arnd Bergmann
2010-01-27 10:04:20 UTC
Permalink
This is the third version of the macvtap device driver, following another major restructuring and a lot of bug fixes:

* Change macvtap to be based around a struct sock
* macvtap: fix initialization
* return 0 to netlink
* don't use rcu for q->file and q->vlan pointers
* macvtap: checkpatch.pl fixes
* macvtap: fix tun IFF flags
* Use a struct socket to make tx flow control work
* disable BH processing during transmit
* only add an ethernet header for receive not forward
* allocate the SKB using GFP_NOWAIT since we're
in rcu_read_lock
* use atomic allocation for socket
* fix blocking on send
* do not destroy netdev twice in error path

There are still known problems, but unless there
are fundamental concerns, I'd like this to go
into net-next as an experimental driver,
fixing up the remaining problems by 2.6.34-rc1.

Arnd
Arnd Bergmann
2010-01-27 10:05:15 UTC
Permalink
In the vlan and macvlan drivers, the start_xmit function forwards
data to the dev_queue_xmit function for another device, which may
potentially belong to a different namespace.

To make sure that classification stays within a single namespace,
this resets the potentially critical fields.

Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/macvlan.c | 2 +-
include/linux/netdevice.h | 9 +++++++++
net/8021q/vlan_dev.c | 2 +-
net/core/dev.c | 35 +++++++++++++++++++++++++++++++----
4 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index bad1303..e0436fd 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -269,7 +269,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
}

xmit_world:
- skb->dev = vlan->lowerdev;
+ skb_set_dev(skb, vlan->lowerdev);
return dev_queue_xmit(skb);
}

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 93a32a5..622ba5a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1004,6 +1004,15 @@ static inline bool netdev_uses_dsa_tags(struct net_device *dev)
return 0;
}

+#ifndef CONFIG_NET_NS
+static inline void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
+{
+ skb->dev = dev;
+}
+#else /* CONFIG_NET_NS */
+void skb_set_dev(struct sk_buff *skb, struct net_device *dev);
+#endif
+
static inline bool netdev_uses_trailer_tags(struct net_device *dev)
{
#ifdef CONFIG_NET_DSA_TAG_TRAILER
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 77a49ff..95034a8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -322,7 +322,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
}


- skb->dev = vlan_dev_info(dev)->real_dev;
+ skb_set_dev(skb, vlan_dev_info(dev)->real_dev);
len = skb->len;
ret = dev_queue_xmit(skb);

diff --git a/net/core/dev.c b/net/core/dev.c
index 2cba5c5..e80403a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1448,13 +1448,10 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
if (skb->len > (dev->mtu + dev->hard_header_len))
return NET_RX_DROP;

- skb_dst_drop(skb);
+ skb_set_dev(skb, dev);
skb->tstamp.tv64 = 0;
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, dev);
- skb->mark = 0;
- secpath_reset(skb);
- nf_reset(skb);
return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -1614,6 +1611,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
return false;
}

+/**
+ * skb_dev_set -- assign a buffer to a new device
+ * @skb: buffer for the new device
+ * @dev: network device
+ *
+ * If an skb is owned by a device already, we have to reset
+ * all data private to the namespace a device belongs to
+ * before assigning it a new device.
+ */
+#ifdef CONFIG_NET_NS
+void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
+{
+ skb_dst_drop(skb);
+ if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
+ secpath_reset(skb);
+ nf_reset(skb);
+ skb_init_secmark(skb);
+ skb->mark = 0;
+ skb->priority = 0;
+ skb->nf_trace = 0;
+ skb->ipvs_property = 0;
+#ifdef CONFIG_NET_SCHED
+ skb->tc_index = 0;
+#endif
+ }
+ skb->dev = dev;
+}
+EXPORT_SYMBOL(skb_set_dev);
+#endif /* CONFIG_NET_NS */
+
/*
* Invalidate hardware checksum when packet is to be mangled, and
* complete checksum manually on outgoing path.
--
1.6.3.3
Arnd Bergmann
2010-01-27 21:59:31 UTC
Permalink
Post by Arnd Bergmann
There are still known problems, but unless there
are fundamental concerns, I'd like this to go
into net-next as an experimental driver,
fixing up the remaining problems by 2.6.34-rc1.
I should have been more specific here. The one
really annoying problem is a reference counting
problem I introduced in one of the last changes
that prevents you from destroying a device after
it has been used.

Unfortunately, I'm still traveling after LCA,
and haven't had a chance to look into this before
sending out the patches as I had originally
planned. I've also seen crashes that are not
fully reproducible, any bug reports on those
are appreciated.

Arnd
Arnd Bergmann
2010-01-30 22:24:26 UTC
Permalink
In order to use macvlan with qemu and other tools that require
a tap file descriptor, the macvtap driver adds a small backend
with a character device with the same interface as the tun
driver, with a minimum set of features.

Macvtap interfaces are created in the same way as macvlan
interfaces using ip link, but the netif is just used as a
handle for configuration and accounting, while the data
goes through the chardev. Each macvtap interface has its
own character device, simplifying permission management
significantly over the generic tun/tap driver.

Cc: Patrick McHardy <kaber at trash.net>
Cc: Stephen Hemminger <shemminger at linux-foundation.org>
Cc: David S. Miller" <davem at davemloft.net>
Cc: "Michael S. Tsirkin" <mst at redhat.com>
Cc: Herbert Xu <herbert at gondor.apana.org.au>
Cc: Or Gerlitz <ogerlitz at voltaire.com>
Cc: netdev at vger.kernel.org
Cc: bridge at lists.linux-foundation.org
Cc: linux-kernel at vger.kernel.org
Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/Kconfig | 12 +
drivers/net/Makefile | 1 +
drivers/net/macvtap.c | 581 ++++++++++++++++++++++++++++++++++++++++++++
include/linux/if_macvlan.h | 1 +
4 files changed, 595 insertions(+), 0 deletions(-)
create mode 100644 drivers/net/macvtap.c

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index cb0e534..411e207 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -90,6 +90,18 @@ config MACVLAN
To compile this driver as a module, choose M here: the module
will be called macvlan.

+config MACVTAP
+ tristate "MAC-VLAN based tap driver (EXPERIMENTAL)"
+ depends on MACVLAN
+ help
+ This adds a specialized tap character device driver that is based
+ on the MAC-VLAN network interface, called macvtap. A macvtap device
+ can be added in the same way as a macvlan device, using 'type
+ macvlan', and then be accessed through the tap user space interface.
+
+ To compile this driver as a module, choose M here: the module
+ will be called macvtap.
+
config EQUALIZER
tristate "EQL (serial line load balancing) support"
---help---
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 0b763cb..9595803 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -169,6 +169,7 @@ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o
obj-$(CONFIG_DUMMY) += dummy.o
obj-$(CONFIG_IFB) += ifb.o
obj-$(CONFIG_MACVLAN) += macvlan.o
+obj-$(CONFIG_MACVTAP) += macvtap.o
obj-$(CONFIG_DE600) += de600.o
obj-$(CONFIG_DE620) += de620.o
obj-$(CONFIG_LANCE) += lance.o
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
new file mode 100644
index 0000000..ad1f6ef
--- /dev/null
+++ b/drivers/net/macvtap.c
@@ -0,0 +1,581 @@
+#include <linux/etherdevice.h>
+#include <linux/if_macvlan.h>
+#include <linux/interrupt.h>
+#include <linux/nsproxy.h>
+#include <linux/compat.h>
+#include <linux/if_tun.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/cache.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
+
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <net/sock.h>
+
+/*
+ * A macvtap queue is the central object of this driver, it connects
+ * an open character device to a macvlan interface. There can be
+ * multiple queues on one interface, which map back to queues
+ * implemented in hardware on the underlying device.
+ *
+ * macvtap_proto is used to allocate queues through the sock allocation
+ * mechanism.
+ *
+ * TODO: multiqueue support is currently not implemented, even though
+ * macvtap is basically prepared for that. We will need to add this
+ * here as well as in virtio-net and qemu to get line rate on 10gbit
+ * adapters from a guest.
+ */
+struct macvtap_queue {
+ struct sock sk;
+ struct socket sock;
+ struct macvlan_dev *vlan;
+ struct file *file;
+};
+
+static struct proto macvtap_proto = {
+ .name = "macvtap",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof (struct macvtap_queue),
+};
+
+/*
+ * Minor number matches netdev->ifindex, so need a potentially
+ * large value. This also makes it possible to split the
+ * tap functionality out again in the future by offering it
+ * from other drivers besides macvtap. As long as every device
+ * only has one tap, the interface numbers assure that the
+ * device nodes are unique.
+ */
+static unsigned int macvtap_major;
+#define MACVTAP_NUM_DEVS 65536
+static struct class *macvtap_class;
+static struct cdev macvtap_cdev;
+
+/*
+ * RCU usage:
+ * The macvtap_queue is referenced both from the chardev struct file
+ * and from the struct macvlan_dev using rcu_read_lock.
+ *
+ * We never actually update the contents of a macvtap_queue atomically
+ * with RCU but it is used for race-free destruction of a queue when
+ * either the file or the macvlan_dev goes away. Pointers back to
+ * the dev and the file are implicitly valid as long as the queue
+ * exists.
+ *
+ * The callbacks from macvlan are always done with rcu_read_lock held
+ * already, while in the file_operations, we get it ourselves.
+ *
+ * When destroying a queue, we remove the pointers from the file and
+ * from the dev and then synchronize_rcu to make sure no thread is
+ * still using the queue. There may still be references to the struct
+ * sock inside of the queue from outbound SKBs, but these never
+ * reference back to the file or the dev. The data structure is freed
+ * through __sk_free when both our references and any pending SKBs
+ * are gone.
+ *
+ * macvtap_lock is only used to prevent multiple concurrent open()
+ * calls to assign a new vlan->tap pointer. It could be moved into
+ * the macvlan_dev itself but is extremely rarely used.
+ */
+static DEFINE_SPINLOCK(macvtap_lock);
+
+/*
+ * Choose the next free queue, for now there is only one
+ */
+static int macvtap_set_queue(struct net_device *dev, struct file *file,
+ struct macvtap_queue *q)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+ int err = -EBUSY;
+
+ spin_lock(&macvtap_lock);
+ if (rcu_dereference(vlan->tap))
+ goto out;
+
+ err = 0;
+ q->vlan = vlan;
+ rcu_assign_pointer(vlan->tap, q);
+
+ q->file = file;
+ rcu_assign_pointer(file->private_data, q);
+
+out:
+ spin_unlock(&macvtap_lock);
+ return err;
+}
+
+/*
+ * We must destroy each queue exactly once, when either
+ * the netdev or the file go away.
+ *
+ * Using the spinlock makes sure that we don't get
+ * to the queue again after destroying it.
+ *
+ * synchronize_rcu serializes with the packet flow
+ * that uses rcu_read_lock.
+ */
+static void macvtap_del_queue(struct macvtap_queue **qp)
+{
+ struct macvtap_queue *q;
+
+ spin_lock(&macvtap_lock);
+ q = rcu_dereference(*qp);
+ if (!q) {
+ spin_unlock(&macvtap_lock);
+ return;
+ }
+
+ rcu_assign_pointer(q->vlan->tap, NULL);
+ rcu_assign_pointer(q->file->private_data, NULL);
+ spin_unlock(&macvtap_lock);
+
+ synchronize_rcu();
+ sock_put(&q->sk);
+}
+
+/*
+ * Since we only support one queue, just dereference the pointer.
+ */
+static struct macvtap_queue *macvtap_get_queue(struct net_device *dev,
+ struct sk_buff *skb)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+
+ return rcu_dereference(vlan->tap);
+}
+
+static void macvtap_del_queues(struct net_device *dev)
+{
+ struct macvlan_dev *vlan = netdev_priv(dev);
+ macvtap_del_queue(&vlan->tap);
+}
+
+static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file)
+{
+ rcu_read_lock_bh();
+ return rcu_dereference(file->private_data);
+}
+
+static inline void macvtap_file_put_queue(void)
+{
+ rcu_read_unlock_bh();
+}
+
+/*
+ * Forward happens for data that gets sent from one macvlan
+ * endpoint to another one in bridge mode. We just take
+ * the skb and put it into the receive queue.
+ */
+static int macvtap_forward(struct net_device *dev, struct sk_buff *skb)
+{
+ struct macvtap_queue *q = macvtap_get_queue(dev, skb);
+ if (!q)
+ return -ENOLINK;
+
+ skb_queue_tail(&q->sk.sk_receive_queue, skb);
+ wake_up(q->sk.sk_sleep);
+ return 0;
+}
+
+/*
+ * Receive is for data from the external interface (lowerdev),
+ * in case of macvtap, we can treat that the same way as
+ * forward, which macvlan cannot.
+ */
+static int macvtap_receive(struct sk_buff *skb)
+{
+ skb_push(skb, ETH_HLEN);
+ return macvtap_forward(skb->dev, skb);
+}
+
+static int macvtap_newlink(struct net *src_net,
+ struct net_device *dev,
+ struct nlattr *tb[],
+ struct nlattr *data[])
+{
+ struct device *classdev;
+ dev_t devt;
+ int err;
+
+ err = macvlan_common_newlink(src_net, dev, tb, data,
+ macvtap_receive, macvtap_forward);
+ if (err)
+ goto out;
+
+ devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+
+ classdev = device_create(macvtap_class, &dev->dev, devt,
+ dev, "tap%d", dev->ifindex);
+ if (IS_ERR(classdev)) {
+ err = PTR_ERR(classdev);
+ macvtap_del_queues(dev);
+ }
+
+out:
+ return err;
+}
+
+static void macvtap_dellink(struct net_device *dev,
+ struct list_head *head)
+{
+ device_destroy(macvtap_class,
+ MKDEV(MAJOR(macvtap_major), dev->ifindex));
+
+ macvtap_del_queues(dev);
+ macvlan_dellink(dev, head);
+}
+
+static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
+ .kind = "macvtap",
+ .newlink = macvtap_newlink,
+ .dellink = macvtap_dellink,
+};
+
+
+static void macvtap_sock_write_space(struct sock *sk)
+{
+ if (!sock_writeable(sk) ||
+ !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
+ return;
+
+ if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+ wake_up_interruptible_sync(sk->sk_sleep);
+}
+
+static int macvtap_open(struct inode *inode, struct file *file)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = dev_get_by_index(net, iminor(inode));
+ struct macvtap_queue *q;
+ int err;
+
+ err = -ENODEV;
+ if (!dev)
+ goto out;
+
+ /* check if this is a macvtap device */
+ err = -EINVAL;
+ if (dev->rtnl_link_ops != &macvtap_link_ops)
+ goto out;
+
+ err = -ENOMEM;
+ q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
+ &macvtap_proto);
+ if (!q)
+ goto out;
+
+ init_waitqueue_head(&q->sock.wait);
+ q->sock.type = SOCK_RAW;
+ q->sock.state = SS_CONNECTED;
+ sock_init_data(&q->sock, &q->sk);
+ q->sk.sk_allocation = GFP_ATOMIC; /* for now */
+ q->sk.sk_write_space = macvtap_sock_write_space;
+
+ err = macvtap_set_queue(dev, file, q);
+ if (err)
+ sock_put(&q->sk);
+
+out:
+ if (dev)
+ dev_put(dev);
+
+ return err;
+}
+
+static int macvtap_release(struct inode *inode, struct file *file)
+{
+ macvtap_del_queue((struct macvtap_queue **)&file->private_data);
+ return 0;
+}
+
+static unsigned int macvtap_poll(struct file *file, poll_table * wait)
+{
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+ unsigned int mask = POLLERR;
+
+ if (!q)
+ goto out;
+
+ mask = 0;
+ poll_wait(file, &q->sock.wait, wait);
+
+ if (!skb_queue_empty(&q->sk.sk_receive_queue))
+ mask |= POLLIN | POLLRDNORM;
+
+ if (sock_writeable(&q->sk) ||
+ (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) &&
+ sock_writeable(&q->sk)))
+ mask |= POLLOUT | POLLWRNORM;
+
+out:
+ macvtap_file_put_queue();
+ return mask;
+}
+
+/* Get packet from user space buffer */
+static ssize_t macvtap_get_user(struct macvtap_queue *q,
+ const struct iovec *iv, size_t count,
+ int noblock)
+{
+ struct sk_buff *skb;
+ size_t len = count;
+ int err;
+
+ if (unlikely(len < ETH_HLEN))
+ return -EINVAL;
+
+ skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+
+ if (!skb) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ return err;
+ }
+
+ skb_reserve(skb, NET_IP_ALIGN);
+ skb_put(skb, count);
+
+ if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
+ macvlan_count_rx(q->vlan, 0, false, false);
+ kfree_skb(skb);
+ return -EFAULT;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ macvlan_start_xmit(skb, q->vlan->dev);
+
+ return count;
+}
+
+static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t result = -ENOLINK;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ if (!q)
+ goto out;
+
+ result = macvtap_get_user(q, iv, iov_length(iv, count),
+ file->f_flags & O_NONBLOCK);
+out:
+ macvtap_file_put_queue();
+ return result;
+}
+
+/* Put packet to the user space buffer */
+static ssize_t macvtap_put_user(struct macvtap_queue *q,
+ const struct sk_buff *skb,
+ const struct iovec *iv, int len)
+{
+ struct macvlan_dev *vlan = q->vlan;
+ int ret;
+
+ len = min_t(int, skb->len, len);
+
+ ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len);
+
+ macvlan_count_rx(vlan, len, ret == 0, 0);
+
+ return ret ? ret : len;
+}
+
+static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
+ unsigned long count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct macvtap_queue *q = macvtap_file_get_queue(file);
+
+ DECLARE_WAITQUEUE(wait, current);
+ struct sk_buff *skb;
+ ssize_t len, ret = 0;
+
+ if (!q) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ len = iov_length(iv, count);
+ if (len < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ add_wait_queue(q->sk.sk_sleep, &wait);
+ while (len) {
+ current->state = TASK_INTERRUPTIBLE;
+
+ /* Read frames from the queue */
+ skb = skb_dequeue(&q->sk.sk_receive_queue);
+ if (!skb) {
+ if (file->f_flags & O_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ /* Nothing to read, let's sleep */
+ schedule();
+ continue;
+ }
+ ret = macvtap_put_user(q, skb, iv, len);
+ kfree_skb(skb);
+ break;
+ }
+
+ current->state = TASK_RUNNING;
+ remove_wait_queue(q->sk.sk_sleep, &wait);
+
+out:
+ macvtap_file_put_queue();
+ return ret;
+}
+
+/*
+ * provide compatibility with generic tun/tap interface
+ */
+static long macvtap_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct macvtap_queue *q;
+ void __user *argp = (void __user *)arg;
+ struct ifreq __user *ifr = argp;
+ unsigned int __user *up = argp;
+ unsigned int u;
+ char devname[IFNAMSIZ];
+
+ switch (cmd) {
+ case TUNSETIFF:
+ /* ignore the name, just look at flags */
+ if (get_user(u, &ifr->ifr_flags))
+ return -EFAULT;
+ if (u != (IFF_TAP | IFF_NO_PI))
+ return -EINVAL;
+ return 0;
+
+ case TUNGETIFF:
+ q = macvtap_file_get_queue(file);
+ if (!q)
+ return -ENOLINK;
+ memcpy(devname, q->vlan->dev->name, sizeof(devname));
+ macvtap_file_put_queue();
+
+ if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) ||
+ put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags))
+ return -EFAULT;
+ return 0;
+
+ case TUNGETFEATURES:
+ if (put_user((IFF_TAP | IFF_NO_PI), up))
+ return -EFAULT;
+ return 0;
+
+ case TUNSETSNDBUF:
+ if (get_user(u, up))
+ return -EFAULT;
+
+ q = macvtap_file_get_queue(file);
+ q->sk.sk_sndbuf = u;
+ macvtap_file_put_queue();
+ return 0;
+
+ case TUNSETOFFLOAD:
+ /* let the user check for future flags */
+ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO))
+ return -EINVAL;
+
+ /* TODO: add support for these, so far we don't
+ support any offload */
+ if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 |
+ TUN_F_TSO_ECN | TUN_F_UFO))
+ return -EINVAL;
+
+ return 0;
+
+ default:
+ return -EINVAL;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations macvtap_fops = {
+ .owner = THIS_MODULE,
+ .open = macvtap_open,
+ .release = macvtap_release,
+ .aio_read = macvtap_aio_read,
+ .aio_write = macvtap_aio_write,
+ .poll = macvtap_poll,
+ .llseek = no_llseek,
+ .unlocked_ioctl = macvtap_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = macvtap_compat_ioctl,
+#endif
+};
+
+static int macvtap_init(void)
+{
+ int err;
+
+ err = alloc_chrdev_region(&macvtap_major, 0,
+ MACVTAP_NUM_DEVS, "macvtap");
+ if (err)
+ goto out1;
+
+ cdev_init(&macvtap_cdev, &macvtap_fops);
+ err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS);
+ if (err)
+ goto out2;
+
+ macvtap_class = class_create(THIS_MODULE, "macvtap");
+ if (IS_ERR(macvtap_class)) {
+ err = PTR_ERR(macvtap_class);
+ goto out3;
+ }
+
+ err = macvlan_link_register(&macvtap_link_ops);
+ if (err)
+ goto out4;
+
+ return 0;
+
+out4:
+ class_unregister(macvtap_class);
+out3:
+ cdev_del(&macvtap_cdev);
+out2:
+ unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+out1:
+ return err;
+}
+module_init(macvtap_init);
+
+static void macvtap_exit(void)
+{
+ rtnl_link_unregister(&macvtap_link_ops);
+ class_unregister(macvtap_class);
+ cdev_del(&macvtap_cdev);
+ unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
+}
+module_exit(macvtap_exit);
+
+MODULE_ALIAS_RTNL_LINK("macvtap");
+MODULE_AUTHOR("Arnd Bergmann <arnd at arndb.de>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 9a11544..51f1512 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -34,6 +34,7 @@ struct macvlan_dev {
enum macvlan_mode mode;
int (*receive)(struct sk_buff *skb);
int (*forward)(struct net_device *dev, struct sk_buff *skb);
+ struct macvtap_queue *tap;
};

static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
--
1.6.3.3
Arnd Bergmann
2010-01-30 22:23:40 UTC
Permalink
This makes it possible to hook into the macvlan driver
from another kernel module. In particular, the goal is
to extend it with the macvtap backend that provides
a tun/tap compatible interface directly on the macvlan
device.

Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/macvlan.c | 113 +++++++++++++++++++-------------------------
include/linux/if_macvlan.h | 70 +++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 64 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index d32e0bd..40faa36 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -39,31 +39,6 @@ struct macvlan_port {
struct list_head vlans;
};

-/**
- * struct macvlan_rx_stats - MACVLAN percpu rx stats
- * @rx_packets: number of received packets
- * @rx_bytes: number of received bytes
- * @multicast: number of received multicast packets
- * @rx_errors: number of errors
- */
-struct macvlan_rx_stats {
- unsigned long rx_packets;
- unsigned long rx_bytes;
- unsigned long multicast;
- unsigned long rx_errors;
-};
-
-struct macvlan_dev {
- struct net_device *dev;
- struct list_head list;
- struct hlist_node hlist;
- struct macvlan_port *port;
- struct net_device *lowerdev;
- struct macvlan_rx_stats *rx_stats;
- enum macvlan_mode mode;
-};
-
-
static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
const unsigned char *addr)
{
@@ -118,31 +93,17 @@ static int macvlan_addr_busy(const struct macvlan_port *port,
return 0;
}

-static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
- unsigned int len, bool success,
- bool multicast)
-{
- struct macvlan_rx_stats *rx_stats;
-
- rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
- if (likely(success)) {
- rx_stats->rx_packets++;;
- rx_stats->rx_bytes += len;
- if (multicast)
- rx_stats->multicast++;
- } else {
- rx_stats->rx_errors++;
- }
-}

-static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev,
+static int macvlan_broadcast_one(struct sk_buff *skb,
+ const struct macvlan_dev *vlan,
const struct ethhdr *eth, bool local)
{
+ struct net_device *dev = vlan->dev;
if (!skb)
return NET_RX_DROP;

if (local)
- return dev_forward_skb(dev, skb);
+ return vlan->forward(dev, skb);

skb->dev = dev;
if (!compare_ether_addr_64bits(eth->h_dest,
@@ -151,7 +112,7 @@ static int macvlan_broadcast_one(struct sk_buff *skb, struct net_device *dev,
else
skb->pkt_type = PACKET_MULTICAST;

- return netif_rx(skb);
+ return vlan->receive(skb);
}

static void macvlan_broadcast(struct sk_buff *skb,
@@ -175,7 +136,7 @@ static void macvlan_broadcast(struct sk_buff *skb,
continue;

nskb = skb_clone(skb, GFP_ATOMIC);
- err = macvlan_broadcast_one(nskb, vlan->dev, eth,
+ err = macvlan_broadcast_one(nskb, vlan, eth,
mode == MACVLAN_MODE_BRIDGE);
macvlan_count_rx(vlan, skb->len + ETH_HLEN,
err == NET_RX_SUCCESS, 1);
@@ -238,7 +199,7 @@ static struct sk_buff *macvlan_handle_frame(struct sk_buff *skb)
skb->dev = dev;
skb->pkt_type = PACKET_HOST;

- netif_rx(skb);
+ vlan->receive(skb);
return NULL;
}

@@ -260,7 +221,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
dest = macvlan_hash_lookup(port, eth->h_dest);
if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
unsigned int length = skb->len + ETH_HLEN;
- int ret = dev_forward_skb(dest->dev, skb);
+ int ret = dest->forward(dest->dev, skb);
macvlan_count_rx(dest, length,
ret == NET_RX_SUCCESS, 0);

@@ -273,8 +234,8 @@ xmit_world:
return dev_queue_xmit(skb);
}

-static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
- struct net_device *dev)
+netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
int i = skb_get_queue_mapping(skb);
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
@@ -290,6 +251,7 @@ static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,

return ret;
}
+EXPORT_SYMBOL_GPL(macvlan_start_xmit);

static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type, const void *daddr,
@@ -623,8 +585,11 @@ static int macvlan_get_tx_queues(struct net *net,
return 0;
}

-static int macvlan_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ int (*receive)(struct sk_buff *skb),
+ int (*forward)(struct net_device *dev,
+ struct sk_buff *skb))
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port;
@@ -664,6 +629,8 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
vlan->lowerdev = lowerdev;
vlan->dev = dev;
vlan->port = port;
+ vlan->receive = receive;
+ vlan->forward = forward;

vlan->mode = MACVLAN_MODE_VEPA;
if (data && data[IFLA_MACVLAN_MODE])
@@ -677,8 +644,17 @@ static int macvlan_newlink(struct net *src_net, struct net_device *dev,
netif_stacked_transfer_operstate(lowerdev, dev);
return 0;
}
+EXPORT_SYMBOL_GPL(macvlan_common_newlink);

-static void macvlan_dellink(struct net_device *dev, struct list_head *head)
+static int macvlan_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ return macvlan_common_newlink(src_net, dev, tb, data,
+ netif_rx,
+ dev_forward_skb);
+}
+
+void macvlan_dellink(struct net_device *dev, struct list_head *head)
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port = vlan->port;
@@ -689,6 +665,7 @@ static void macvlan_dellink(struct net_device *dev, struct list_head *head)
if (list_empty(&port->vlans))
macvlan_port_destroy(port->dev);
}
+EXPORT_SYMBOL_GPL(macvlan_dellink);

static int macvlan_changelink(struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
@@ -720,19 +697,27 @@ static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
[IFLA_MACVLAN_MODE] = { .type = NLA_U32 },
};

-static struct rtnl_link_ops macvlan_link_ops __read_mostly = {
+int macvlan_link_register(struct rtnl_link_ops *ops)
+{
+ /* common fields */
+ ops->priv_size = sizeof(struct macvlan_dev);
+ ops->get_tx_queues = macvlan_get_tx_queues;
+ ops->setup = macvlan_setup;
+ ops->validate = macvlan_validate;
+ ops->maxtype = IFLA_MACVLAN_MAX;
+ ops->policy = macvlan_policy;
+ ops->changelink = macvlan_changelink;
+ ops->get_size = macvlan_get_size;
+ ops->fill_info = macvlan_fill_info;
+
+ return rtnl_link_register(ops);
+};
+EXPORT_SYMBOL_GPL(macvlan_link_register);
+
+static struct rtnl_link_ops macvlan_link_ops = {
.kind = "macvlan",
- .priv_size = sizeof(struct macvlan_dev),
- .get_tx_queues = macvlan_get_tx_queues,
- .setup = macvlan_setup,
- .validate = macvlan_validate,
.newlink = macvlan_newlink,
.dellink = macvlan_dellink,
- .maxtype = IFLA_MACVLAN_MAX,
- .policy = macvlan_policy,
- .changelink = macvlan_changelink,
- .get_size = macvlan_get_size,
- .fill_info = macvlan_fill_info,
};

static int macvlan_device_event(struct notifier_block *unused,
@@ -761,7 +746,7 @@ static int macvlan_device_event(struct notifier_block *unused,
break;
case NETDEV_UNREGISTER:
list_for_each_entry_safe(vlan, next, &port->vlans, list)
- macvlan_dellink(vlan->dev, NULL);
+ vlan->dev->rtnl_link_ops->dellink(vlan->dev, NULL);
break;
}
return NOTIFY_DONE;
@@ -778,7 +763,7 @@ static int __init macvlan_init_module(void)
register_netdevice_notifier(&macvlan_notifier_block);
macvlan_handle_frame_hook = macvlan_handle_frame;

- err = rtnl_link_register(&macvlan_link_ops);
+ err = macvlan_link_register(&macvlan_link_ops);
if (err < 0)
goto err1;
return 0;
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 5f200ba..9a11544 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -1,6 +1,76 @@
#ifndef _LINUX_IF_MACVLAN_H
#define _LINUX_IF_MACVLAN_H

+#include <linux/if_link.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/netlink.h>
+
+struct macvlan_port;
+struct macvtap_queue;
+
+/**
+ * struct macvlan_rx_stats - MACVLAN percpu rx stats
+ * @rx_packets: number of received packets
+ * @rx_bytes: number of received bytes
+ * @multicast: number of received multicast packets
+ * @rx_errors: number of errors
+ */
+struct macvlan_rx_stats {
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long multicast;
+ unsigned long rx_errors;
+};
+
+struct macvlan_dev {
+ struct net_device *dev;
+ struct list_head list;
+ struct hlist_node hlist;
+ struct macvlan_port *port;
+ struct net_device *lowerdev;
+ struct macvlan_rx_stats *rx_stats;
+ enum macvlan_mode mode;
+ int (*receive)(struct sk_buff *skb);
+ int (*forward)(struct net_device *dev, struct sk_buff *skb);
+};
+
+static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
+ unsigned int len, bool success,
+ bool multicast)
+{
+ struct macvlan_rx_stats *rx_stats;
+
+ rx_stats = per_cpu_ptr(vlan->rx_stats, smp_processor_id());
+ if (likely(success)) {
+ rx_stats->rx_packets++;;
+ rx_stats->rx_bytes += len;
+ if (multicast)
+ rx_stats->multicast++;
+ } else {
+ rx_stats->rx_errors++;
+ }
+}
+
+extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ int (*receive)(struct sk_buff *skb),
+ int (*forward)(struct net_device *dev,
+ struct sk_buff *skb));
+
+extern void macvlan_count_rx(const struct macvlan_dev *vlan,
+ unsigned int len, bool success,
+ bool multicast);
+
+extern void macvlan_dellink(struct net_device *dev, struct list_head *head);
+
+extern int macvlan_link_register(struct rtnl_link_ops *ops);
+
+extern netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
+ struct net_device *dev);
+
+
extern struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *);

#endif /* _LINUX_IF_MACVLAN_H */
--
1.6.3.3
Arnd Bergmann
2010-01-30 22:22:15 UTC
Permalink
This is the fourth version of the macvtap driver,
based on the comments I got for the last version
I got a few days ago. Very few changes:

* release netdev in chardev open function so
we can destroy it properly.
* Implement TUNSETSNDBUF
* fix sleeping call in rcu_read_lock
* Fix comment in namespace isolation patch
* Fix small context difference to make it apply
to net-next

I can't really test here while travelling, so please
give it a go if you're interested in this driver.

Arnd
Arnd Bergmann
2010-01-30 22:23:03 UTC
Permalink
In the vlan and macvlan drivers, the start_xmit function forwards
data to the dev_queue_xmit function for another device, which may
potentially belong to a different namespace.

To make sure that classification stays within a single namespace,
this resets the potentially critical fields.

Signed-off-by: Arnd Bergmann <arnd at arndb.de>
---
drivers/net/macvlan.c | 2 +-
include/linux/netdevice.h | 9 +++++++++
net/8021q/vlan_dev.c | 2 +-
net/core/dev.c | 35 +++++++++++++++++++++++++++++++----
4 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index fa0dc51..d32e0bd 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -269,7 +269,7 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
}

xmit_world:
- skb->dev = vlan->lowerdev;
+ skb_set_dev(skb, vlan->lowerdev);
return dev_queue_xmit(skb);
}

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 93a32a5..622ba5a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1004,6 +1004,15 @@ static inline bool netdev_uses_dsa_tags(struct net_device *dev)
return 0;
}

+#ifndef CONFIG_NET_NS
+static inline void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
+{
+ skb->dev = dev;
+}
+#else /* CONFIG_NET_NS */
+void skb_set_dev(struct sk_buff *skb, struct net_device *dev);
+#endif
+
static inline bool netdev_uses_trailer_tags(struct net_device *dev)
{
#ifdef CONFIG_NET_DSA_TAG_TRAILER
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index a9e1f17..9e83272 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -322,7 +322,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
}


- skb->dev = vlan_dev_info(dev)->real_dev;
+ skb_set_dev(skb, vlan_dev_info(dev)->real_dev);
len = skb->len;
ret = dev_queue_xmit(skb);

diff --git a/net/core/dev.c b/net/core/dev.c
index 2cba5c5..94c1eee 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1448,13 +1448,10 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
if (skb->len > (dev->mtu + dev->hard_header_len))
return NET_RX_DROP;

- skb_dst_drop(skb);
+ skb_set_dev(skb, dev);
skb->tstamp.tv64 = 0;
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, dev);
- skb->mark = 0;
- secpath_reset(skb);
- nf_reset(skb);
return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
@@ -1614,6 +1611,36 @@ static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
return false;
}

+/**
+ * skb_dev_set -- assign a new device to a buffer
+ * @skb: buffer for the new device
+ * @dev: network device
+ *
+ * If an skb is owned by a device already, we have to reset
+ * all data private to the namespace a device belongs to
+ * before assigning it a new device.
+ */
+#ifdef CONFIG_NET_NS
+void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
+{
+ skb_dst_drop(skb);
+ if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
+ secpath_reset(skb);
+ nf_reset(skb);
+ skb_init_secmark(skb);
+ skb->mark = 0;
+ skb->priority = 0;
+ skb->nf_trace = 0;
+ skb->ipvs_property = 0;
+#ifdef CONFIG_NET_SCHED
+ skb->tc_index = 0;
+#endif
+ }
+ skb->dev = dev;
+}
+EXPORT_SYMBOL(skb_set_dev);
+#endif /* CONFIG_NET_NS */
+
/*
* Invalidate hardware checksum when packet is to be mangled, and
* complete checksum manually on outgoing path.
--
1.6.3.3
David Miller
2010-02-04 04:21:04 UTC
Permalink
From: Arnd Bergmann <arnd at arndb.de>
Date: Sat, 30 Jan 2010 23:22:15 +0100
Post by Arnd Bergmann
This is the fourth version of the macvtap driver,
based on the comments I got for the last version
* release netdev in chardev open function so
we can destroy it properly.
* Implement TUNSETSNDBUF
* fix sleeping call in rcu_read_lock
* Fix comment in namespace isolation patch
* Fix small context difference to make it apply
to net-next
I can't really test here while travelling, so please
give it a go if you're interested in this driver.
All applied to net-next-2.6, thanks!

Loading...