[PATCH 03/24] net, diet: Decrease ip defrag hash tables and max length with BASE_SMALL

Discussion:

[PATCH 03/24] net, diet: Decrease ip defrag hash tables and max length with BASE_SMALL

Andi Kleen

2014-05-05 22:25:52 UTC

From: Andi Kleen <***@linux.intel.com>

When CONFIG_BASE_SMALL is set only use 16 entries in the IP defrag
hash table. Also limit the max length of chains to 32 packets.

The sizes are somewhat arbitrary and could be changed.

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/net/inet_frag.h | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 6f59de9..a8c5948 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -41,6 +41,10 @@ struct inet_frag_queue {
struct netns_frags *net;
};

+#ifdef CONFIG_BASE_SMALL
+#define INETFRAGS_HASHSZ 16
+#define INETFRAGS_MAXDEPTH 32
+#else
#define INETFRAGS_HASHSZ 1024

/* averaged:
@@ -50,6 +54,8 @@ struct inet_frag_queue {
*/
#define INETFRAGS_MAXDEPTH 128

+#endif
+
struct inet_frag_bucket {
struct hlist_head chain;
spinlock_t chain_lock;

--
1.9.0

Yuchung Cheng

2014-05-05 23:18:05 UTC

This is all the code that saves connection information
between different sockets. Not really essential for
small systems.
Saves about 5.5k text
text data bss dec hex filename
492952 19571 13480 526003 806b3 net/built-in.o-with-metrics
487675 19275 13480 520430 7f0ee net/built-in.o-without-metrics
---
include/net/tcp.h | 25 +++++++++++++++++++++++++
net/ipv4/Kconfig | 6 ++++++
net/ipv4/Makefile | 3 ++-
net/ipv4/sysctl_net_ipv4.c | 2 ++
4 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 87d8774..d741d2f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -419,14 +419,29 @@ int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk, int how);
void tcp_clear_retrans(struct tcp_sock *tp);
+#ifdef CONFIG_TCP_METRICS
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
+
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
bool paws_check);
bool tcp_remember_stamp(struct sock *sk);
bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
+#else
+static inline void tcp_update_metrics(struct sock *sk) {}
+static inline void tcp_init_metrics(struct sock *sk) {}
+static inline void tcp_metrics_init(void) {}
+static inline bool tcp_peer_is_proven(struct request_sock *req,
+ struct dst_entry *dst,
+ bool paws_check) { return false; }
+static inline bool tcp_remember_stamp(struct sock *sk) { return false; }
+static inline bool
+tcp_tw_remember_stamp(struct inet_timewait_sock *tw) { return false; }
+static inline void
+tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) {}
+#endif
void tcp_disable_fack(struct tcp_sock *tp);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
@@ -1296,11 +1311,21 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
const struct tcp_md5sig_key *key);
/* From tcp_fastopen.c */
+#ifdef CONFIG_TCP_METRICS
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie, int *syn_loss,
unsigned long *last_syn_loss);
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
struct tcp_fastopen_cookie *cookie, bool syn_lost);
+#else
+static inline void
+tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie, int *syn_loss,
+ unsigned long *last_syn_loss) {}
+static inline void
+tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie, bool syn_lost) {}
+#endif
struct tcp_fastopen_request {
/* Fast Open cookie. Size 0 means a cookie request */
struct tcp_fastopen_cookie cookie;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6146b1b..db2dada 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -264,6 +264,12 @@ config IP_PIMSM_V2
gated-5). This routing protocol is not used widely, so say N unless
you want to play with it.
+config TCP_METRICS
+ bool "Report TCP metrics over netlink"
+ ---help---
+ Enable support in TCP to save host information between different
+ connections.

Please add that "Certain TCP features such as active TCP Fast Open
depend on this."

+
config SYN_COOKIES
bool "IP: TCP syncookie support"
---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 756855c..8b17b83 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_minisocks.o tcp_cong.o tcp_fastopen.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -17,6 +17,7 @@ obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_IP_PING) += ping.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_TCP_METRICS) += tcp_metrics.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44eba05..2110d2e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -573,6 +573,7 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+#ifdef CONFIG_TCP_METRICS
{
.procname = "tcp_no_metrics_save",
.data = &sysctl_tcp_nometrics_save,
@@ -580,6 +581,7 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
{
.procname = "tcp_moderate_rcvbuf",
.data = &sysctl_tcp_moderate_rcvbuf,
--
1.9.0
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Andi Kleen

2014-05-05 23:28:21 UTC

Post by Yuchung Cheng

+config TCP_METRICS
+ bool "Report TCP metrics over netlink"
+ ---help---
+ Enable support in TCP to save host information between different
+ connections.

Please add that "Certain TCP features such as active TCP Fast Open
depend on this."

I added a select.

Also will fix the inaccurate help line.

-Andi

Andi Kleen

2014-05-05 22:25:56 UTC

From: Andi Kleen <***@linux.intel.com>

This saves about 6k text/data. ping still works fine using raw
sockets like it always did.

text data bss dec hex filename
268128 11555 7872 287555 46343 net/ipv4/built-in.o-noping
273610 11843 8176 293629 47afd net/ipv4/built-in.o-ping

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/net/ping.h | 15 +++++++++++----
include/net/transp_v6.h | 2 --
net/ipv4/Kconfig | 5 +++++
net/ipv4/Makefile | 3 ++-
net/ipv4/af_inet.c | 9 ++++-----
net/ipv4/icmp.c | 1 +
net/ipv4/ping.c | 3 +++
net/ipv6/Makefile | 3 ++-
net/ipv6/af_inet6.c | 8 ++++++--
9 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/include/net/ping.h b/include/net/ping.h
index 026479b..ca6b89d 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -71,7 +71,6 @@ void ping_unhash(struct sock *sk);
int ping_init_sock(struct sock *sk);
void ping_close(struct sock *sk, long timeout);
int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len);
-void ping_err(struct sk_buff *skb, int offset, u32 info);
int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd,
struct sk_buff *);

@@ -82,7 +81,6 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len);
int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-void ping_rcv(struct sk_buff *skb);

#ifdef CONFIG_PROC_FS
struct ping_seq_afinfo {
@@ -100,12 +98,21 @@ void ping_seq_stop(struct seq_file *seq, void *v);
int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo);
void ping_proc_unregister(struct net *net, struct ping_seq_afinfo *afinfo);

-int __init ping_proc_init(void);
-void ping_proc_exit(void);
#endif

+#ifdef CONFIG_IP_PING
+void ping_rcv(struct sk_buff *skb);
+void ping_err(struct sk_buff *skb, int offset, u32 info);
void __init ping_init(void);
int __init pingv6_init(void);
void pingv6_exit(void);
+#else
+static inline void ping_init(void) {}
+static inline int pingv6_init(void) { return 0; }
+static inline void pingv6_exit(void) { }
+static inline void ping_err(struct sk_buff *skb, int offset, u32 info) {}
+static inline void ping_rcv(struct sk_buff *skb) {}
+#endif
+

#endif /* _PING_H */
diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index b927413..7a0e229 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -19,8 +19,6 @@ int ipv6_frag_init(void);
void ipv6_frag_exit(void);

/* transport protocols */
-int pingv6_init(void);
-void pingv6_exit(void);
int rawv6_init(void);
void rawv6_exit(void);
int udpv6_init(void);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 05c57f0..6146b1b 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -52,6 +52,11 @@ config IP_ADVANCED_ROUTER

If unsure, say N here.

+config IP_PING
+ bool "IP: ping sockets"
+ ---help---
+ Enable ping sockets to enable suid-less ping.
+
config IP_FIB_TRIE_STATS
bool "FIB TRIE statistics"
depends on IP_ADVANCED_ROUTER
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f032688..756855c 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -11,9 +11,10 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
- inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
+ inet_fragment.o ip_tunnel_core.o gre_offload.o

obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
+obj-$(CONFIG_IP_PING) += ping.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8c54870..c275ce5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1015,7 +1015,7 @@ static struct inet_protosw inetsw_array[] =
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
-
+#ifdef CONFIG_IP_PING
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
@@ -1024,6 +1024,7 @@ static struct inet_protosw inetsw_array[] =
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
},
+#endif

{
.type = SOCK_RAW,
@@ -1719,7 +1720,9 @@ static int __init inet_init(void)
if (rc)
goto out_unregister_udp_proto;

+#ifdef CONFIG_IP_PING
rc = proto_register(&ping_prot, 1);
+#endif
if (rc)
goto out_unregister_raw_proto;

@@ -1836,15 +1839,11 @@ static int __init ipv4_proc_init(void)
goto out_tcp;
if (udp4_proc_init())
goto out_udp;
- if (ping_proc_init())
- goto out_ping;
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
- ping_proc_exit();
-out_ping:
udp4_proc_exit();
out_udp:
tcp4_proc_exit();
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0134663..df3872b 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -989,6 +989,7 @@ void icmp_err(struct sk_buff *skb, u32 info)
return;
}

+ /* RED-PEN dead code? the if above will eat all. */
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
else if (type == ICMP_REDIRECT)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8210964..3f79243 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1206,6 +1206,9 @@ void ping_proc_exit(void)
unregister_pernet_subsys(&ping_v4_net_ops);
}

+module_init(ping_proc_init);
+module_exit(ping_proc_init);
+
#endif

void __init ping_init(void)
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2fe6836..6ff7cfd 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -7,13 +7,14 @@ obj-$(CONFIG_IPV6) += ipv6.o
ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
addrlabel.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
- raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
+ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o

ipv6-offload := ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o

ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
+ipv6-$(CONFIG_IP_PING) += ping.o

ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
xfrm6_output.o xfrm6_protocol.o
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d935889..68ac214 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -861,16 +861,18 @@ static int __init inet6_init(void)
if (err)
goto out_unregister_udplite_proto;

+#ifdef CONFIG_IP_PING
err = proto_register(&pingv6_prot, 1);
if (err)
- goto out_unregister_ping_proto;
+ goto out_unregister_raw_proto;
+#endif

/* We MUST register RAW sockets before we create the ICMP6,
* IGMP6, or NDISC control sockets.
*/
err = rawv6_init();
if (err)
- goto out_unregister_raw_proto;
+ goto out_unregister_ping_proto;

/* Register the family here so that the init calls below will
* be able to create sockets. (?? is this dangerous ??)
@@ -1022,8 +1024,10 @@ register_pernet_fail:
out_sock_register_fail:
rawv6_exit();
out_unregister_ping_proto:
+#ifdef CONFIG_IP_PING
proto_unregister(&pingv6_prot);
out_unregister_raw_proto:
+#endif
proto_unregister(&rawv6_prot);
out_unregister_udplite_proto:
proto_unregister(&udplitev6_prot);

--
1.9.0

David Miller

2014-05-06 03:04:27 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:25:56 -0700

Post by Andi Kleen
This saves about 6k text/data. ping still works fine using raw
sockets like it always did.
text data bss dec hex filename
268128 11555 7872 287555 46343 net/ipv4/built-in.o-noping
273610 11843 8176 293629 47afd net/ipv4/built-in.o-ping

Sorry, not applying this, it's been discussed already, and it's
exactly small'ish systems that want this facility.

We want to move away from raw sockets, and making this optional
is not going to help us move forward down that path.

Andi Kleen

2014-05-05 22:26:05 UTC

From: Andi Kleen <***@linux.intel.com>

Make TCP fast open a config option. It's not really needed
on small systems. By itself it saves about 3k text,
but the main advantage is that CONFIG_INET doesn't
pull in AES and the crypto subsystem anymore, which
is worth far more savings.

text data bss dec hex filename
6954762 1404960 765952 9125674 8b3f2a vmlinux-with-fastopen
6951618 1400608 765952 9118178 8b21e2 vmlinux-wo-fastopen

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/linux/tcp.h | 14 ++++++++++++--
include/net/request_sock.h | 5 +++++
include/net/tcp.h | 9 +++++++++
net/Kconfig | 4 ++--
net/core/request_sock.c | 2 ++
net/ipv4/Kconfig | 4 ++++
net/ipv4/Makefile | 3 ++-
net/ipv4/sysctl_net_ipv4.c | 4 ++++
net/ipv4/tcp.c | 4 ++++
net/ipv4/tcp_ipv4.c | 18 ++++++++++++++++++
10 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2399468..e0825e2 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -359,6 +359,9 @@ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
return (struct tcp_timewait_sock *)sk;
}

+extern void tcp_sock_destruct(struct sock *sk);
+
+#ifdef CONFIG_TCP_FASTOPEN
static inline bool tcp_passive_fastopen(const struct sock *sk)
{
return (sk->sk_state == TCP_SYN_RECV &&
@@ -370,8 +373,6 @@ static inline bool fastopen_cookie_present(struct tcp_fastopen_cookie *foc)
return foc->len != -1;
}

-extern void tcp_sock_destruct(struct sock *sk);
-
static inline int fastopen_init_queue(struct sock *sk, int backlog)
{
struct request_sock_queue *queue =
@@ -391,4 +392,13 @@ static inline int fastopen_init_queue(struct sock *sk, int backlog)
return 0;
}

+#else
+static inline bool tcp_passive_fastopen(const struct sock *sk)
+{ return false; }
+static inline bool fastopen_cookie_present(struct tcp_fastopen_cookie *foc)
+{ return false; }
+static inline int fastopen_init_queue(struct sock *sk, int backlog)
+{ return 0; }
+#endif
+
#endif /* _LINUX_TCP_H */
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 7f830ff..ad1f97a 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -168,8 +168,13 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,

void __reqsk_queue_destroy(struct request_sock_queue *queue);
void reqsk_queue_destroy(struct request_sock_queue *queue);
+#ifdef CONFIG_TCP_FASTOPEN
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
bool reset);
+#else
+static inline void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset) {}
+#endif

static inline struct request_sock *
reqsk_queue_yank_acceptq(struct request_sock_queue *queue)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ac9f6bd..1a5e91b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -251,7 +251,11 @@ extern int sysctl_tcp_retries1;
extern int sysctl_tcp_retries2;
extern int sysctl_tcp_orphan_retries;
extern int sysctl_tcp_syncookies;
+#ifdef CONFIG_TCP_FASTOPEN
extern int sysctl_tcp_fastopen;
+#else
+#define sysctl_tcp_fastopen 0
+#endif
extern int sysctl_tcp_retrans_collapse;
extern int sysctl_tcp_stdurg;
extern int sysctl_tcp_rfc1337;
@@ -1333,7 +1337,12 @@ struct tcp_fastopen_request {
size_t size;
int copied; /* queued in tcp_connect() */
};
+
+#ifdef CONFIG_TCP_FASTOPEN
void tcp_free_fastopen_req(struct tcp_sock *tp);
+#else
+static inline void tcp_free_fastopen_req(struct tcp_sock *tp) {}
+#endif

extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
int tcp_fastopen_reset_cipher(void *key, unsigned int len);
diff --git a/net/Kconfig b/net/Kconfig
index f5196ba..fe6e856 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -61,8 +61,8 @@ source "net/iucv/Kconfig"

config INET
bool "TCP/IP networking"
- select CRYPTO
- select CRYPTO_AES
+ select CRYPTO if TCP_FASTOPEN
+ select CRYPTO_AES if TCP_FASTOPEN
---help---
These are the protocols used on the Internet and on most local
Ethernets. It is highly recommended to say Y here (this will enlarge
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 467f326..80ad1dd 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -131,6 +131,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
kfree(lopt);
}

+#ifdef CONFIG_TCP_FASTOPEN
/*
* This function is called to set a Fast Open socket's "fastopen_rsk" field
* to NULL when a TFO socket no longer needs to access the request_sock.
@@ -222,3 +223,4 @@ out:
spin_unlock_bh(&fastopenq->lock);
sock_put(lsk);
}
+#endif
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index c1f9899..df5c569 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -320,6 +320,10 @@ config NET_IPVTI
the notion of a secure tunnel for IPSEC and then use routing protocol
on top.

+config TCP_FASTOPEN
+ bool "Enable TCP fastopen"
+ default y
+
config INET_AH
tristate "IP: AH transformation"
select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index eb129a4..addecef 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o tcp_fastopen.o \
+ tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -55,6 +55,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_TCP_FASTOPEN) += tcp_fastopen.o
obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index fe5823a..9a9f96c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -200,6 +200,7 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}

+#ifdef CONFIG_TCP_FASTOPEN
static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -246,6 +247,7 @@ bad_key:
kfree(tbl.data);
return ret;
}
+#endif

static struct ctl_table ipv4_table[] = {
{
@@ -388,6 +390,7 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
#endif
+#ifdef CONFIG_TCP_FASTOPEN
{
.procname = "tcp_fastopen",
.data = &sysctl_tcp_fastopen,
@@ -401,6 +404,7 @@ static struct ctl_table ipv4_table[] = {
.maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
.proc_handler = proc_tcp_fastopen_key,
},
+#endif
{
.procname = "tcp_tw_recycle",
.data = &tcp_death_row.sysctl_tw_recycle,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3fd48421..ef14cb6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1036,6 +1036,7 @@ static inline int select_size(const struct sock *sk, bool sg)
return tmp;
}

+#ifdef CONFIG_TCP_FASTOPEN
void tcp_free_fastopen_req(struct tcp_sock *tp)
{
if (tp->fastopen_req != NULL) {
@@ -1069,6 +1070,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
tcp_free_fastopen_req(tp);
return err;
}
+#endif

int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size)
@@ -1084,6 +1086,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
lock_sock(sk);

flags = msg->msg_flags;
+#ifdef CONFIG_TCP_FASTOPEN
if (flags & MSG_FASTOPEN) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
if (err == -EINPROGRESS && copied_syn > 0)
@@ -1092,6 +1095,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
goto out_err;
offset = copied_syn;
}
+#endif

timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 438f3b9..fbddabb 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1260,6 +1260,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
};
#endif

+#ifdef CONFIG_TCP_FASTOPEN
static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
@@ -1440,6 +1441,23 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk,
WARN_ON(req->sk == NULL);
return 0;
}
+#else
+static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct tcp_fastopen_cookie *foc,
+ struct tcp_fastopen_cookie *valid_foc)
+{
+ return false;
+}
+
+static int tcp_v4_conn_req_fastopen(struct sock *sk,
+ struct sk_buff *skb,
+ struct sk_buff *skb_synack,
+ struct request_sock *req)
+{
+ return 0;
+}
+#endif

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{

--
1.9.0

David Miller

2014-05-06 03:06:44 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:26:05 -0700

Post by Andi Kleen
Make TCP fast open a config option. It's not really needed
on small systems. By itself it saves about 3k text,

It's for the sake of the remote service not the local client,
sorry I'm not applying this, it's a facility we want to be
ubiquitous and in widespread use on as many systems as possible.

I'm not applying this, sorry.

Andi Kleen

2014-05-05 22:25:58 UTC

From: Andi Kleen <***@linux.intel.com>

Small embedded systems don't need ethtool, so make it optional.

Right now the driver code is not removed, unless the driver
uses SET_ETHTOOL_OPS and LTO (which can eliminate unused code)

Saves about 10k text (without driver code):

text data bss dec hex filename
489877 19371 13480 522728 7f9e8 net/built-in.o
478967 19369 13480 511816 7cf48 net/built-in.o-wo-ethtool

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/linux/ethtool.h | 14 ++++++++++++++
include/linux/netdevice.h | 21 ++++++++++++++++++---
net/Kconfig | 6 ++++++
net/core/Makefile | 3 ++-
net/core/dev.c | 2 ++
5 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 0a114d0..e90c958 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -40,8 +40,14 @@ struct compat_ethtool_rxnfc {

#include <linux/rculist.h>

+#ifdef CONFIG_NET_ETHTOOL
extern int __ethtool_get_settings(struct net_device *dev,
struct ethtool_cmd *cmd);
+#else
+static inline int __ethtool_get_settings(struct net_device *dev,
+ struct ethtool_cmd *cmd)
+{ return -EINVAL; }
+#endif

/**
* enum ethtool_phys_id_state - indicator state for physical identification
@@ -61,9 +67,17 @@ enum ethtool_phys_id_state {

struct net_device;

+#ifdef CONFIG_NET_ETHTOOL
/* Some generic methods drivers may use in their ethtool_ops */
u32 ethtool_op_get_link(struct net_device *dev);
int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *eti);
+#else
+/* Some generic methods drivers may use in their ethtool_ops */
+static inline u32 ethtool_op_get_link(struct net_device *dev) { return 0; }
+static inline int
+ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *eti)
+{ return -EINVAL; }
+#endif

/**
* ethtool_rxfh_indir_default - get default value for RX flow hash indirection
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7ed3a3a..29e0409 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -56,12 +56,28 @@ struct device;
struct phy_device;
/* 802.11 specific */
struct wireless_dev;
- /* source back-compat hooks */
+
+#ifdef CONFIG_NET_ETHTOOL
+
+/* When the driver uses this macro ethtool code can be optimized out
+ * when not needed. We still reference it to avoid unused static
+ * warnings.
+ */
#define SET_ETHTOOL_OPS(netdev,ops) \
- ( (netdev)->ethtool_ops = (ops) )
+ ( (void)(ops), (netdev)->ethtool_ops = (ops) )

void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops);
+int dev_ethtool(struct net *net, struct ifreq *);
+
+#else
+#define SET_ETHTOOL_OPS(netdev,ops) do {} while(0)
+static inline void
+netdev_set_default_ethtool_ops(struct net_device *dev,
+ const struct ethtool_ops *ops) {}
+static inline int
+dev_ethtool(struct net *net, struct ifreq *ifr) { return -EINVAL; }
+#endif

/* Backlog congestion levels */
#define NET_RX_SUCCESS 0 /* keep 'em coming, baby */
@@ -2616,7 +2632,6 @@ void netdev_rx_handler_unregister(struct net_device *dev);

bool dev_valid_name(const char *name);
int dev_ioctl(struct net *net, unsigned int cmd, void __user *);
-int dev_ethtool(struct net *net, struct ifreq *);
unsigned int dev_get_flags(const struct net_device *);
int __dev_change_flags(struct net_device *, unsigned int flags);
int dev_change_flags(struct net_device *, unsigned int);
diff --git a/net/Kconfig b/net/Kconfig
index d92afe4..281d172 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -82,6 +82,12 @@ source "net/netlabel/Kconfig"

endif # if INET

+config NET_ETHTOOL
+ bool "Ethtool support"
+ default y
+ help
+ Support changing ethernet driver parameters from user tools.
+
config NETWORK_SECMARK
bool "Security Marking"
help
diff --git a/net/core/Makefile b/net/core/Makefile
index 826b925..bfd28b1 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -7,12 +7,13 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \

obj-$(CONFIG_SYSCTL) += sysctl_net_core.o

-obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
+obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o

obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
+obj-$(CONFIG_NET_ETHTOOL) += ethtool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
diff --git a/net/core/dev.c b/net/core/dev.c
index c6cbe69..cf102a4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6310,6 +6310,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)

static const struct ethtool_ops default_ethtool_ops;

+#ifdef CONFIG_NET_ETHTOOL
void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops)
{
@@ -6317,6 +6318,7 @@ void netdev_set_default_ethtool_ops(struct net_device *dev,
dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
+#endif

void netdev_freemem(struct net_device *dev)
{

--
1.9.0

David Miller

2014-05-06 03:22:31 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Tue, 6 May 2014 05:14:12 +0200

You can configure the IP address and transmit and receive packets,
and that is all a small embedded system is ever interested in.

Then let's agree to disagree, I am absolutely certain that small
systems are interested in much more than just these simple
configuration operations.

David Miller

2014-05-06 03:11:40 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:25:58 -0700

Post by Andi Kleen
Small embedded systems don't need ethtool, so make it optional.

You can't do anything non-trivial with an ethernet device without
ethtool.

I am not applying this.

Andi Kleen

2014-05-06 03:14:12 UTC

Post by David Miller
Date: Mon, 5 May 2014 15:25:58 -0700

Post by Andi Kleen
Small embedded systems don't need ethtool, so make it optional.

You can't do anything non-trivial with an ethernet device without
ethtool.

You can configure the IP address and transmit and receive packets,
and that is all a small embedded system is ever interested in.

Remember these systems don't have a shell, they just run a
single application that only does one thing.

-Andi

--
***@linux.intel.com -- Speaking for myself only.

Andi Kleen

2014-05-05 22:26:12 UTC

From: Andi Kleen <***@linux.intel.com>

Useful to remove unneeded driver code when ethtool is disabled.

The patches doing this are submitted too, but let's add
the script too so people can re-use it.

Cc: Julia Lawall <***@lip6.fr>
Cc: Gilles Muller <***@lip6.fr>
Cc: Nicolas Palix <***@imag.fr>
Signed-off-by: Andi Kleen <***@linux.intel.com>
---
scripts/coccinelle/api/ethtool_ops.cocci | 9 +++++++++
1 file changed, 9 insertions(+)
create mode 100644 scripts/coccinelle/api/ethtool_ops.cocci

diff --git a/scripts/coccinelle/api/ethtool_ops.cocci b/scripts/coccinelle/api/ethtool_ops.cocci
new file mode 100644
index 0000000..a7ee6ed
--- /dev/null
+++ b/scripts/coccinelle/api/ethtool_ops.cocci
@@ -0,0 +1,9 @@
+// Convert network drivers to use the SET_ETHTOOL_OPS macro
+// This allows to compile out the ethtool code when not needed.
+//
+@@
+struct ethtool_ops *ops;
+struct net_device *dev;
+@@
+- dev->ethtool_ops = ops;
++ SET_ETHTOOL_OPS(dev, ops);

--
1.9.0

Nicolas Palix

2014-05-06 09:27:27 UTC

Hi David,

I saw you rejected almost the entire patch series.

However, is there any interest in enforcing the use of SET_ETHTOOL_OPS?
Are you likely to merge the patch 24/24 [1] ?

Regards,

[1] https://lkml.org/lkml/2014/5/5/679

Post by Andi Kleen
Useful to remove unneeded driver code when ethtool is disabled.
The patches doing this are submitted too, but let's add
the script too so people can re-use it.
---
scripts/coccinelle/api/ethtool_ops.cocci | 9 +++++++++
1 file changed, 9 insertions(+)
create mode 100644 scripts/coccinelle/api/ethtool_ops.cocci
diff --git a/scripts/coccinelle/api/ethtool_ops.cocci b/scripts/coccinelle/api/ethtool_ops.cocci
new file mode 100644
index 0000000..a7ee6ed
--- /dev/null
+++ b/scripts/coccinelle/api/ethtool_ops.cocci
@@ -0,0 +1,9 @@
+// Convert network drivers to use the SET_ETHTOOL_OPS macro
+// This allows to compile out the ethtool code when not needed.
+//
+struct ethtool_ops *ops;
+struct net_device *dev;
+- dev->ethtool_ops = ops;
++ SET_ETHTOOL_OPS(dev, ops);
--
1.9.0

--
Nicolas Palix
Tel: +33 4 76 51 46 27
http://lig-membres.imag.fr/palix/

David Miller

2014-05-06 15:05:04 UTC

From: Nicolas Palix <***@imag.fr>
Date: Tue, 6 May 2014 11:27:27 +0200

Post by Nicolas Palix
I saw you rejected almost the entire patch series.
However, is there any interest to enforce the use of SET_ETHTOOL_OPS ?
Are you likely to merge the patch 24/24 [1] ?

SET_ETHTOOL_OPS only exists so that drivers could be compiled
on "older" kernels.

But the usefulness of that is long gone, we've had netdev->ethtool_ops
since way before we started using GIT.

I'd rather see a patch which removes it, to be honest.

Andi Kleen

2014-05-05 22:26:03 UTC

From: Andi Kleen <***@linux.intel.com>

When PROC_FS is not compiled in we don't need the statistics
gathering code, as the only way to see the output is through
/proc. Saves about 5k text, likely more in dynamic memory.

text data bss dec hex filename
386302 8993 12564 407859 63933 net/built-in.o-with-mib
381542 8933 12564 403039 6265f net/built-in.o-wo-mib

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/net/ip.h | 9 ++++++++
include/net/ipv6.h | 8 ++++++-
include/net/snmp.h | 60 ++++++++++++++++++++++++++++++++++++++++--------
net/ipv4/af_inet.c | 12 ++++++++++
net/ipv6/addrconf.c | 9 ++++++++
net/ipv6/addrconf_core.c | 2 ++
net/ipv6/af_inet6.c | 5 ++++
7 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 3ec2b0f..6764e30 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -196,6 +196,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
#define NET_ADD_STATS_BH(net, field, adnd) SNMP_ADD_STATS_BH((net)->mib.net_statistics, field, adnd)
#define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd)

+#ifdef CONFIG_PROC_FS
unsigned long snmp_fold_field(void __percpu *mib[], int offt);
#if BITS_PER_LONG==32
u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off);
@@ -217,6 +218,12 @@ static inline void snmp_mib_free(void __percpu *ptr[SNMP_ARRAY_SZ])
ptr[i] = NULL;
}
}
+#else
+#define snmp_mib_init(a,b,c) ({ 0; })
+#define snmp_mib_free(x) do {} while (0)
+#define snmp_fold_field(a, b) ({ 0; })
+#define snmp_fold_field64(a, b, c) ({ 0; })
+#endif

void inet_get_local_port_range(struct net *net, int *low, int *high);

@@ -523,6 +530,8 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,

#ifdef CONFIG_PROC_FS
int ip_misc_proc_init(void);
+#else
+static inline int ip_misc_proc_init(void) { return 0; }
#endif

#endif /* _IP_H */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index d640925..3c4c041 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -830,11 +830,17 @@ int udp6_proc_init(struct net *net);
void udp6_proc_exit(struct net *net);
int udplite6_proc_init(void);
void udplite6_proc_exit(void);
+#ifdef CONFIG_PROC_FS
int ipv6_misc_proc_init(void);
void ipv6_misc_proc_exit(void);
int snmp6_register_dev(struct inet6_dev *idev);
int snmp6_unregister_dev(struct inet6_dev *idev);
-
+#else
+static inline int ipv6_misc_proc_init(void) { return 0; }
+static inline void ipv6_misc_proc_exit(void) {}
+static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; }
+static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; }
+#endif
#else
static inline int ac6_proc_init(struct net *net) { return 0; }
static inline void ac6_proc_exit(struct net *net) { }
diff --git a/include/net/snmp.h b/include/net/snmp.h
index 7159626..83815f9 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -23,6 +23,8 @@
#include <linux/snmp.h>
#include <linux/smp.h>

+#define MIB_SIZE(x) (IS_ENABLED(CONFIG_PROC_FS) ? (x) : (0))
+
/*
* Mibs are stored in array of unsigned long.
*/
@@ -31,6 +33,7 @@
* - list of entries for particular API (such as /proc/net/snmp)
* - name of entries.
*/
+#ifdef CONFIG_PROC_FS
struct snmp_mib {
const char *name;
int entry;
@@ -45,6 +48,11 @@ struct snmp_mib {
.name = NULL, \
.entry = 0, \
}
+#else
+struct snmp_mib {};
+#define SNMP_MIB_ITEM(_name,_entry) {}
+#define SNMP_MIB_SENTINEL
+#endif

/*
* We use unsigned longs for most mibs but u64 for ipstats.
@@ -52,7 +60,7 @@ struct snmp_mib {
#include <linux/u64_stats_sync.h>

/* IPstats */
-#define IPSTATS_MIB_MAX __IPSTATS_MIB_MAX
+#define IPSTATS_MIB_MAX MIB_SIZE(__IPSTATS_MIB_MAX)
struct ipstats_mib {
/* mibs[] must be first field of struct ipstats_mib */
u64 mibs[IPSTATS_MIB_MAX];
@@ -60,18 +68,18 @@ struct ipstats_mib {
};

/* ICMP */
-#define ICMP_MIB_MAX __ICMP_MIB_MAX
+#define ICMP_MIB_MAX MIB_SIZE(__ICMP_MIB_MAX)
struct icmp_mib {
unsigned long mibs[ICMP_MIB_MAX];
};

-#define ICMPMSG_MIB_MAX __ICMPMSG_MIB_MAX
+#define ICMPMSG_MIB_MAX MIB_SIZE(__ICMPMSG_MIB_MAX)
struct icmpmsg_mib {
atomic_long_t mibs[ICMPMSG_MIB_MAX];
};

/* ICMP6 (IPv6-ICMP) */
-#define ICMP6_MIB_MAX __ICMP6_MIB_MAX
+#define ICMP6_MIB_MAX MIB_SIZE(__ICMP6_MIB_MAX)
/* per network ns counters */
struct icmpv6_mib {
unsigned long mibs[ICMP6_MIB_MAX];
@@ -81,7 +89,7 @@ struct icmpv6_mib_device {
atomic_long_t mibs[ICMP6_MIB_MAX];
};

-#define ICMP6MSG_MIB_MAX __ICMP6MSG_MIB_MAX
+#define ICMP6MSG_MIB_MAX MIB_SIZE(__ICMP6MSG_MIB_MAX)
/* per network ns counters */
struct icmpv6msg_mib {
atomic_long_t mibs[ICMP6MSG_MIB_MAX];
@@ -93,29 +101,31 @@ struct icmpv6msg_mib_device {

/* TCP */
-#define TCP_MIB_MAX __TCP_MIB_MAX
+#define TCP_MIB_MAX MIB_SIZE(__TCP_MIB_MAX)
struct tcp_mib {
unsigned long mibs[TCP_MIB_MAX];
};

/* UDP */
-#define UDP_MIB_MAX __UDP_MIB_MAX
+#define UDP_MIB_MAX MIB_SIZE(__UDP_MIB_MAX)
struct udp_mib {
unsigned long mibs[UDP_MIB_MAX];
};

/* Linux */
-#define LINUX_MIB_MAX __LINUX_MIB_MAX
+#define LINUX_MIB_MAX MIB_SIZE(__LINUX_MIB_MAX)
struct linux_mib {
unsigned long mibs[LINUX_MIB_MAX];
};

/* Linux Xfrm */
-#define LINUX_MIB_XFRMMAX __LINUX_MIB_XFRMMAX
+#define LINUX_MIB_XFRMMAX MIB_SIZE(__LINUX_MIB_XFRMMAX)
struct linux_xfrm_mib {
unsigned long mibs[LINUX_MIB_XFRMMAX];
};

+#ifdef CONFIG_PROC_FS
+
#define SNMP_ARRAY_SZ 1

#define DEFINE_SNMP_STAT(type, name) \
@@ -216,4 +226,36 @@ struct linux_xfrm_mib {
#define SNMP_UPD_PO_STATS64_BH(mib, basefield, addend) SNMP_UPD_PO_STATS_BH(mib, basefield, addend)
#endif

+#else
+
+/* Stub out everything */
+
+#define SNMP_ARRAY_SZ 0
+#define DEFINE_SNMP_STAT(type, name) type *name
+#define DEFINE_SNMP_STAT_ATOMIC(type, name) type *name
+#define DECLARE_SNMP_STAT(type, name) extern type *name
+#define SNMP_INC_STATS_BH(mib, field) do { (void)(mib); } while(0)
+#define SNMP_INC_STATS_USER(mib, field) do { (void)(mib); } while(0)
+#define SNMP_INC_STATS_ATOMIC_LONG(mib, field) do { (void)(mib); } while(0)
+#define SNMP_INC_STATS(mib, field) do { (void)(mib); } while(0)
+#define SNMP_DEC_STATS(mib, field) do { (void)(mib); } while(0)
+#define SNMP_ADD_STATS_BH(mib, field, addend) do { (void)(mib); } while(0)
+#define SNMP_ADD_STATS_USER(mib, field, addend) do { (void)(mib); } while(0)
+#define SNMP_ADD_STATS(mib, field, addend) do { (void)(mib); } while(0)
+#define SNMP_UPD_PO_STATS(mib, basefield, addend) do { (void)(mib); } while(0)
+#define SNMP_UPD_PO_STATS_BH(mib, basefield, addend) \
+ do { (void)(mib); } while(0)
+#define SNMP_ADD_STATS64_BH(mib, field, addend) do { (void)(mib); } while(0)
+#define SNMP_ADD_STATS64_USER(mib, field, addend) do { (void)(mib); } while(0)
+#define SNMP_ADD_STATS64(mib, field, addend) do { (void)(mib); } while(0)
+#define SNMP_INC_STATS64_BH(mib, field) do { (void)(mib); } while(0)
+#define SNMP_INC_STATS64_USER(mib, field) do { (void)(mib); } while(0)
+#define SNMP_INC_STATS64(mib, field) do { (void)(mib); } while(0)
+#define SNMP_UPD_PO_STATS64_BH(mib, basefield, addend) \
+ do { (void)(mib); } while(0)
+#define SNMP_UPD_PO_STATS64(mib, basefield, addend) \
+ do { (void)(mib); } while(0)
+
+#endif /* CONFIG_NET_SNMP */
+
#endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e65e750..46b1815 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1482,6 +1482,8 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);

+#ifdef CONFIG_PROC_FS
+
unsigned long snmp_fold_field(void __percpu *mib[], int offt)
{
unsigned long res = 0;
@@ -1541,6 +1543,8 @@ int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align)
}
EXPORT_SYMBOL_GPL(snmp_mib_init);

+#endif
+
#ifdef CONFIG_IP_MULTICAST
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
@@ -1572,6 +1576,7 @@ static const struct net_protocol icmp_protocol = {
.netns_ok = 1,
};

+#ifdef CONFIG_PROC_FS
static __net_init int ipv4_mib_init_net(struct net *net)
{
int i;
@@ -1656,6 +1661,13 @@ static int __init init_ipv4_mibs(void)
return register_pernet_subsys(&ipv4_mib_ops);
}

+#else
+static int __init init_ipv4_mibs(void)
+{
+ return 0;
+}
+#endif
+
static int ipv4_proc_init(void);

#ifdef CONFIG_IP_OFFLOAD
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6c7fa08..0d8c820 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -271,6 +271,7 @@ static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
mod_delayed_work(addrconf_wq, &ifp->dad_work, delay);
}

+#ifdef CONFIG_PROC_FS
static int snmp6_alloc_dev(struct inet6_dev *idev)
{
int i;
@@ -309,6 +310,9 @@ err_icmp:
err_ip:
return -ENOMEM;
}
+#else
+static int snmp6_alloc_dev(struct inet6_dev *idev) { return 0; }
+#endif

static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
{
@@ -4348,6 +4352,7 @@ static inline size_t inet6_if_nlmsg_size(void)
+ nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */
}

+#ifdef CONFIG_PROC_FS
static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
int items, int bytes)
{
@@ -4391,6 +4396,10 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
break;
}
}
+#else
+static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
+ int bytes) {}
+#endif

static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
{
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 4c11cbc..80ff00e 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -121,9 +121,11 @@ EXPORT_SYMBOL(in6addr_sitelocal_allrouters);

static void snmp6_free_dev(struct inet6_dev *idev)
{
+#ifdef CONFIG_PROC_FS
kfree(idev->stats.icmpv6msgdev);
kfree(idev->stats.icmpv6dev);
snmp_mib_free((void __percpu **)idev->stats.ipv6);
+#endif
}

/* Nobody refers to this device, we may destroy it. */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 68ac214..9ff80ad 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -711,6 +711,7 @@ static void ipv6_packet_cleanup(void)
dev_remove_pack(&ipv6_packet_type);
}

+#ifdef CONFIG_PROC_FS
static int __net_init ipv6_init_mibs(struct net *net)
{
int i;
@@ -768,6 +769,10 @@ static void ipv6_cleanup_mibs(struct net *net)
snmp_mib_free((void __percpu **)net->mib.icmpv6_statistics);
kfree(net->mib.icmpv6msg_statistics);
}
+#else
+static inline int __net_init ipv6_init_mibs(struct net *net) { return 0; }
+static inline void ipv6_cleanup_mibs(struct net *net) {}
+#endif

static int __net_init inet6_net_init(struct net *net)
{

--
1.9.0

David Miller

2014-05-06 03:05:58 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:26:03 -0700

Post by Andi Kleen
When PROC_FS is not compiled in we don't need the statistics
gathering code, as the only way to see the output is through
/proc. Saves about 5k text, likely more in dynamic memory.
text data bss dec hex filename
386302 8993 12564 407859 63933 net/built-in.o-with-mib
381542 8933 12564 403039 6265f net/built-in.o-wo-mib

Congratulations, you just broke ipv6 device address netlink
dumps amongst other things.

Andi Kleen

2014-05-05 22:25:50 UTC

From: Andi Kleen <***@linux.intel.com>

Small systems typically only have two network devices, so use
hash shift 2.

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/net/net_namespace.h | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 5f9eb26..d2b8c6c 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -37,7 +37,11 @@ struct sock;
struct netns_ipvs;

+#ifdef CONFIG_BASE_SMALL
+#define NETDEV_HASHBITS 2
+#else
#define NETDEV_HASHBITS 8
+#endif
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

struct net {

--
1.9.0

David Miller

2014-05-06 03:03:47 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:25:50 -0700

Post by Andi Kleen
Small systems typically only have two network devices, so use
hash shift 2.

Maybe in terms of physical interfaces, but they may also have lots of
software interfaces.

Sorry, this doesn't make any sense, I'm not applying this.

Andi Kleen

2014-05-05 22:25:53 UTC

From: Andi Kleen <***@linux.intel.com>

Just make the FIB hash tables much smaller for small
kernel.

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/net/ip_fib.h | 5 +++++
net/ipv4/fib_semantics.c | 4 ++++
2 files changed, 9 insertions(+)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9922093..cbe60cd 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -65,7 +65,11 @@ struct fnhe_hash_bucket {
struct fib_nh_exception __rcu *chain;
};

+#ifdef CONFIG_BASE_SMALL
+#define FNHE_HASH_SIZE 16
+#else
#define FNHE_HASH_SIZE 2048
+#endif
#define FNHE_RECLAIM_DEPTH 5

struct fib_nh {
@@ -162,6 +166,7 @@ struct fib_result_nl {
#ifdef CONFIG_IP_MULTIPLE_TABLES
#define FIB_TABLE_HASHSZ 256
#else
+/* Can we use 1 for BASE_SMALL? */
#define FIB_TABLE_HASHSZ 2
#endif

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8a043f0..c3d4e4d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -51,7 +51,11 @@ static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;

+#ifdef CONFIG_BASE_SMALL
+#define DEVINDEX_HASHBITS 2
+#else
#define DEVINDEX_HASHBITS 8
+#endif
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

--
1.9.0

David Miller

2014-05-06 03:10:49 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:25:53 -0700

Post by Andi Kleen
Just make the FIB hash tables much smaller for small
kernel.

Sounds like an exploit waiting to happen.

Andi Kleen

2014-05-05 22:26:09 UTC

From: Andi Kleen <***@linux.intel.com>

Not needed on small systems. Already disabled with !SMP,
but it also makes sense to explicitly disable it on very
small multi-core SoCs.

Saves ~6k text, but increases the data segment by a
mysterious ~4k (some padding?)

text data bss dec hex filename
6893698 1392544 634880 8921122 882022 vmlinux-with-xps
6887542 1396544 634880 8918966 8817b6 vmlinux-wo-xps

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/Kconfig b/net/Kconfig
index cc0264a..2437348 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -259,7 +259,7 @@ config RFS_ACCEL
default y

config XPS
- boolean
+ bool "Support multiqueue transmit queues (XPS)"
depends on SMP
default y

--
1.9.0

Andi Kleen

2014-05-05 22:25:54 UTC

From: Andi Kleen <***@linux.intel.com>

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/ipv4/tcp.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4bd6d52..3fd48421 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3127,7 +3127,11 @@ EXPORT_SYMBOL_GPL(tcp_done);

extern struct tcp_congestion_ops tcp_reno;

+#ifdef CONFIG_BASE_SMALL
+static __initdata unsigned long thash_entries = 16;
+#else
static __initdata unsigned long thash_entries;
+#endif
static int __init set_thash_entries(char *str)
{
ssize_t ret;

--
1.9.0

David Miller

2014-05-06 03:12:06 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:25:54 -0700
Same comment as per other hash table reductions, this doesn't
make any sense even for the claimed goal.

Andi Kleen

2014-05-05 22:26:08 UTC

From: Andi Kleen <***@linux.intel.com>

Multiple receive queues are not needed on small systems,
so allow disabling them.

This is already disabled with !SMP, but it also makes
sense to disable on very small SMP systems (like
dual core SOC)

Can be worth ~30k code and 6k data

text data bss dec hex filename
6923914 1398176 634880 8956970 88ac2a vmlinux-with-rps
6893698 1392544 634880 8921122 882022 vmlinux-wo-rps

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/Kconfig b/net/Kconfig
index fe6e856..cc0264a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -248,7 +248,7 @@ source "net/mpls/Kconfig"
source "net/hsr/Kconfig"

config RPS
- boolean
+ bool "Support multiqueue input queues (RPS)"
depends on SMP && SYSFS
default y

--
1.9.0

Andi Kleen

2014-05-06 03:16:21 UTC

Date: Mon, 5 May 2014 15:26:08 -0700

Post by Andi Kleen
Multiple receive queues are not needed on small systems,
so allow disabling them.

It's only enabled with SMP...

You didn't read the whole description, did you?

-Andi

David Miller

2014-05-06 03:14:33 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:26:08 -0700

Post by Andi Kleen
Multiple receive queues are not needed on small systems,
so allow disabling them.

It's only enabled with SMP...

I'm really totally, and completely, convinced that you wrote this
entire patch series with your brain turned off.

I'm not applying any patch in this series Andi, come back when
you have something specific and small and reasonable to discuss,
rather than this huge patch bomb of total and complete crap.

Bjørn Mork

2014-05-06 08:32:03 UTC

Post by Andi Kleen
Multiple receive queues are not needed on small systems,
so allow disabling them.
This is already disabled with !SMP, but it also makes
sense to disable on very small SMP systems (like
dual core SOC)

Such a pointless design still doesn't mean that you *have* to enable
SMP, does it? How much would you save all over the kernel by simply
disabling SMP?

As for the system design: It seems to me that it would be more useful to
replace one of the CPU cores with some RAM...

Bjørn

Andi Kleen

2014-05-05 22:25:59 UTC

From: Andi Kleen <***@linux.intel.com>

Small systems don't need the LPF filter, so make it all
optional

Saves about 4K text

text data bss dec hex filename
483545 19371 13480 516396 7e12c net/built-in.o-wo-filter
487675 19275 13480 520430 7f0ee net/built-in.o-with-filter

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
drivers/net/team/Kconfig | 1 +
include/linux/filter.h | 28 +++++++++++++++++++++++++++-
init/Kconfig | 7 +++++++
net/Kconfig | 8 ++++++++
net/core/Makefile | 3 ++-
net/netfilter/Kconfig | 1 +
net/sched/Kconfig | 1 +
7 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/net/team/Kconfig b/drivers/net/team/Kconfig
index c853d84..7c5a373 100644
--- a/drivers/net/team/Kconfig
+++ b/drivers/net/team/Kconfig
@@ -64,6 +64,7 @@ config NET_TEAM_MODE_ACTIVEBACKUP

config NET_TEAM_MODE_LOADBALANCE
tristate "Load-balance mode support"
+ select LPF_FILTER
depends on NET_TEAM
---help---
This mode provides load balancing functionality. Tx port selection
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 024fd03..ec1db56 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -97,9 +97,9 @@ static inline unsigned int sk_filter_size(unsigned int proglen)
#define sk_filter_proglen(fprog) \
(fprog->len * sizeof(fprog->filter[0]))

+#ifdef CONFIG_LPF_FILTER
#define SK_RUN_FILTER(filter, ctx) \
(*filter->bpf_func)(ctx, filter->insnsi)
-
int sk_filter(struct sock *sk, struct sk_buff *skb);

u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx,
@@ -124,6 +124,32 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);

void sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
+#else
+#define SK_RUN_FILTER(filter, ctx) 0
+static inline int
+sk_filter(struct sock *sk, struct sk_buff *skb) { return 0; }
+static inline u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx,
+ const struct sock_filter_int *insni)
+{ return 0; }
+static inline u32 sk_run_filter_int_skb(const struct sk_buff *ctx,
+ const struct sock_filter_int *insni)
+{ return 0; }
+static inline int sk_unattached_filter_create(struct sk_filter **pfp,
+ struct sock_fprog *fprog)
+{ return -EINVAL; }
+static inline void sk_unattached_filter_destroy(struct sk_filter *fp) {}
+static inline int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{ return -EINVAL; }
+static inline int sk_detach_filter(struct sock *sk) { return -EINVAL; }
+static inline int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
+{ return 0; }
+static inline int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned len)
+{ return -EINVAL; }
+static inline void
+sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) {}
+static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) {}
+static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) {}
+#endif

#ifdef CONFIG_BPF_JIT
#include <stdarg.h>
diff --git a/init/Kconfig b/init/Kconfig
index 9d3585b..31eccd6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1885,4 +1885,11 @@ config ASN1
inform it as to what tags are to be expected in a stream and what
functions to call on what tags.

+# Automatically enable LPF_FILTER when any architecture sets SECCOMP
+config SECCOMP_ENABLE_LPF
+ bool
+ depends on SECCOMP
+ default y
+ select LPF_FILTER
+
source "kernel/Kconfig.locks"
diff --git a/net/Kconfig b/net/Kconfig
index 281d172..82a5764 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -88,6 +88,13 @@ config NET_ETHTOOL
help
Support changing ethernet driver parameters from user tools.

+# XXX seccomp and other users should auto enable?
+config LPF_FILTER
+ bool "LPF filter"
+ default y
+ help
+ Enable BPF/LPF (Linux Packet Filter) filtering on sockets.
+
config NETWORK_SECMARK
bool "Security Marking"
help
@@ -275,6 +282,7 @@ config BQL
config BPF_JIT
bool "enable BPF Just In Time compiler"
depends on HAVE_BPF_JIT
+ depends on LPF_FILTER
depends on MODULES
---help---
Berkeley Packet Filter filtering capabilities are normally handled
diff --git a/net/core/Makefile b/net/core/Makefile
index bfd28b1..7db2fff 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -8,7 +8,7 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o

obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
- neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+ neighbour.o rtnetlink.o utils.o link_watch.o \
sock_diag.o dev_ioctl.o

obj-$(CONFIG_XFRM) += flow.o
@@ -16,6 +16,7 @@ obj-y += net-sysfs.o
obj-$(CONFIG_NET_ETHTOOL) += ethtool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
+obj-$(CONFIG_LPF_FILTER) += filter.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_NET_DMA) += user_dma.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e9410d1..5bc1ef2 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -921,6 +921,7 @@ config NETFILTER_XT_MATCH_ADDRTYPE
config NETFILTER_XT_MATCH_BPF
tristate '"bpf" match support'
depends on NETFILTER_ADVANCED
+ select LPF_FILTER
help
BPF matching applies a linux socket filter to each packet and
accepts those for which the filter returns non-zero.
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a1a8e29..a0a4f96 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -469,6 +469,7 @@ config NET_CLS_CGROUP
config NET_CLS_BPF
tristate "BPF-based classifier"
select NET_CLS
+ select LPF_FILTER
---help---
If you say Y here, you will be able to classify packets based on
programmable BPF (JIT'ed) filters as an alternative to ematches.

--
1.9.0

David Miller

2014-05-06 03:10:15 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:25:59 -0700

Post by Andi Kleen
Small systems don't need the LPF filter, so make it all
optional
Saves about 4K text
text data bss dec hex filename
483545 19371 13480 516396 7e12c net/built-in.o-wo-filter
487675 19275 13480 520430 7f0ee net/built-in.o-with-filter

I think you highly underestimate how much "small systems" use
packet capturing and thus BPF.

Andi Kleen

2014-05-05 22:25:55 UTC

From: Andi Kleen <***@linux.intel.com>

UDP has two hash tables, for UDP and UDP lite. Default
them to 16 entries each on small kernels. This can
still be overridden on the command line.

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/ipv4/udp.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4468e1a..90f967b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2415,7 +2415,12 @@ void udp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */

+#ifdef CONFIG_BASE_SMALL
+static __initdata unsigned long uhash_entries = 16;
+#else
static __initdata unsigned long uhash_entries;
+#endif
+
static int __init set_uhash_entries(char *str)
{
ssize_t ret;

--
1.9.0

David Miller

2014-05-06 03:11:21 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:25:55 -0700

Post by Andi Kleen
UDP has two hash tables, for UDP and UDP lite. Default
them to 16 entries each on small kernels. This can
still be overridden on the command line.

Likewise, if you make it too small, overrunning the machine
is way too easy. Maybe it's even more important in a lower
powered machine than a high powered one.

Eric Dumazet

2014-05-06 14:26:42 UTC

Post by Andi Kleen
UDP has two hash tables, for UDP and UDP lite. Default
them to 16 entries each on small kernels. This can
still be overridden on the command line.
---
net/ipv4/udp.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4468e1a..90f967b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2415,7 +2415,12 @@ void udp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+#ifdef CONFIG_BASE_SMALL
+static __initdata unsigned long uhash_entries = 16;
+#else
static __initdata unsigned long uhash_entries;
+#endif
+
static int __init set_uhash_entries(char *str)
{
ssize_t ret;

It's changed to UDP_HTABLE_SIZE_MIN later in alloc_large_system_hash()

The reason there is a minimum UDP hash size is PORTS_PER_CHAIN,
or DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN) on stack.

This patch has no effect on a small machine, because we already
dynamically size this hash table.

(The factor is one slot per 2MB of low memory)

Eric Dumazet

2014-05-06 19:25:54 UTC

Post by Andi Kleen
Ok I need to fix that, because a single slot on a 2MB system
is likely not enough.

You have nothing to do.

It will use a 128 slots hash table with current code.

Andi Kleen

2014-05-06 18:23:44 UTC

Post by Eric Dumazet
This patch has no effect on a small machine, because we already
dynamically size this hash table.

Good point.

Post by Eric Dumazet
(The factor is one slot per 2MB of low memory)

Ok I need to fix that, because a single slot on a 2MB system
is likely not enough.

-Andi

Andi Kleen

2014-05-05 22:26:00 UTC

From: Andi Kleen <***@linux.intel.com>

rtnl_lock is used in a lot of places independently from rtnetlink.
Move it to an own file.

Only ugly part is that the rtnetlink init still needs to access
the mutex. do this through a private backdoor, to avoid putting
it into the headers for everyone.

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/core/Makefile | 3 ++-
net/core/rtnetlink.c | 42 ++----------------------------------------
net/core/rtnl_lock.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 48 insertions(+), 41 deletions(-)
create mode 100644 net/core/rtnl_lock.c

diff --git a/net/core/Makefile b/net/core/Makefile
index 7db2fff..e05bd9c 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,7 +3,8 @@
#

obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
- gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o \
+ rtnl_lock.o

obj-$(CONFIG_SYSCTL) += sysctl_net_core.o

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d4ff417..0e802f0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -60,46 +60,6 @@ struct rtnl_link {
rtnl_calcit_func calcit;
};

-static DEFINE_MUTEX(rtnl_mutex);
-
-void rtnl_lock(void)
-{
- mutex_lock(&rtnl_mutex);
-}
-EXPORT_SYMBOL(rtnl_lock);
-
-void __rtnl_unlock(void)
-{
- mutex_unlock(&rtnl_mutex);
-}
-
-void rtnl_unlock(void)
-{
- /* This fellow will unlock it for us. */
- netdev_run_todo();
-}
-EXPORT_SYMBOL(rtnl_unlock);
-
-int rtnl_trylock(void)
-{
- return mutex_trylock(&rtnl_mutex);
-}
-EXPORT_SYMBOL(rtnl_trylock);
-
-int rtnl_is_locked(void)
-{
- return mutex_is_locked(&rtnl_mutex);
-}
-EXPORT_SYMBOL(rtnl_is_locked);
-
-#ifdef CONFIG_PROVE_LOCKING
-int lockdep_rtnl_is_held(void)
-{
- return lockdep_is_held(&rtnl_mutex);
-}
-EXPORT_SYMBOL(lockdep_rtnl_is_held);
-#endif /* #ifdef CONFIG_PROVE_LOCKING */
-
static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];

static inline int rtm_msgindex(int msgtype)
@@ -2845,6 +2805,8 @@ static struct notifier_block rtnetlink_dev_notifier = {
.notifier_call = rtnetlink_event,
};

+/* Don't put this into the header, private backdoor. */
+extern struct mutex rtnl_mutex;

static int __net_init rtnetlink_net_init(struct net *net)
{
diff --git a/net/core/rtnl_lock.c b/net/core/rtnl_lock.c
new file mode 100644
index 0000000..f0dfd19
--- /dev/null
+++ b/net/core/rtnl_lock.c
@@ -0,0 +1,44 @@
+#include <linux/rtnetlink.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+
+DEFINE_MUTEX(rtnl_mutex);
+
+void rtnl_lock(void)
+{
+ mutex_lock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock);
+
+void __rtnl_unlock(void)
+{
+ mutex_unlock(&rtnl_mutex);
+}
+
+void rtnl_unlock(void)
+{
+ /* This fellow will unlock it for us. */
+ netdev_run_todo();
+}
+EXPORT_SYMBOL(rtnl_unlock);
+
+int rtnl_trylock(void)
+{
+ return mutex_trylock(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_trylock);
+
+int rtnl_is_locked(void)
+{
+ return mutex_is_locked(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_is_locked);
+
+#ifdef CONFIG_PROVE_LOCKING
+int lockdep_rtnl_is_held(void)
+{
+ return lockdep_is_held(&rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_is_held);
+#endif /* #ifdef CONFIG_PROVE_LOCKING */

--
1.9.0

Andi Kleen

2014-05-05 22:26:01 UTC

From: Andi Kleen <***@linux.intel.com>

Small systems can use ioctl/ifconfig for routing and
interface configuration. Make rtnetlink optional
This saves ~29k without LTO, more with LTO.

text data bss dec hex filename
483545 19371 13480 516396 7e12c net/built-in.o-with-rtnetlink
454365 19275 12936 486576 76cb0 net/built-in.o-wo-rtnetlink

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/linux/rtnetlink.h | 56 +++++++++++++++++++++++++++++++++++++++++------
include/net/rtnetlink.h | 35 +++++++++++++++++++++++++++++
net/Kconfig | 8 +++++++
net/core/Makefile | 3 ++-
net/ipv4/fib_frontend.c | 7 ++++++
net/ipv4/fib_lookup.h | 12 ++++++++++
net/ipv4/fib_semantics.c | 4 ++++
7 files changed, 117 insertions(+), 8 deletions(-)

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 8e3e66a..e876aa2 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -7,15 +7,32 @@
#include <uapi/linux/rtnetlink.h>

extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
-extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
-extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
- u32 group, struct nlmsghdr *nlh, gfp_t flags);
-extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
-extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
u32 id, long expires, u32 error);

+#ifdef CONFIG_RTNETLINK
+void rtnl_set_sk_err(struct net *net, u32 group, int error);
+int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
+void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
+ u32 group, struct nlmsghdr *nlh, gfp_t flags);
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
+int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
+#else
+static inline int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
+{ return -EIO; }
+
+static inline void
+rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags) {}
+
+static inline void
+rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
+ u32 group, struct nlmsghdr *nlh, gfp_t flags) {}
+
+static inline int
+rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { return -EINVAL; }
+
+static inline void rtnl_set_sk_err(struct net *net, u32 group, int error) {}
+#endif

/* RTNL is used as a global lock for all changes to network configuration */
extern void rtnl_lock(void);
@@ -59,7 +76,12 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)

extern struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);

+#ifdef CONFIG_RTNETLINK
extern void rtnetlink_init(void);
+#else
+static inline void rtnetlink_init(void) {}
+#endif
+
extern void __rtnl_unlock(void);

#define ASSERT_RTNL() do { \
@@ -70,6 +92,7 @@ extern void __rtnl_unlock(void);
} \
} while(0)

+#ifdef CONFIG_RTNETLINK
extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
@@ -79,11 +102,30 @@ extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
struct net_device *dev,
const unsigned char *addr,
u16 flags);
+extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u16 mode);
extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr);

-extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
- struct net_device *dev, u16 mode);
+#else
+static inline int ndo_dflt_fdb_dump(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ struct net_device *dev,
+ int idx) { return -EINVAL; }
+static inline int ndo_dflt_fdb_add(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr,
+ u16 flags) { return -EINVAL; }
+static inline int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
+ struct net_device *dev, u16 mode)
+{ return -EINVAL; }
+static inline int ndo_dflt_fdb_del(struct ndmsg *ndm,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr)
+{ return -EINVAL; }
+#endif /* !CONFIG_RTNETLINK */
#endif /* __LINUX_RTNETLINK_H */
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 72240e5..859078b 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -8,12 +8,23 @@ typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *);
typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *);

+#ifdef CONFIG_RTNETLINK
int __rtnl_register(int protocol, int msgtype,
rtnl_doit_func, rtnl_dumpit_func, rtnl_calcit_func);
void rtnl_register(int protocol, int msgtype,
rtnl_doit_func, rtnl_dumpit_func, rtnl_calcit_func);
int rtnl_unregister(int protocol, int msgtype);
void rtnl_unregister_all(int protocol);
+#else
+static inline int __rtnl_register(int protocol, int msgtype,
+ rtnl_doit_func d, rtnl_dumpit_func du, rtnl_calcit_func c)
+{ return -EINVAL; }
+static inline void rtnl_register(int protocol, int msgtype,
+ rtnl_doit_func d, rtnl_dumpit_func du, rtnl_calcit_func c)
+{ }
+static inline int rtnl_unregister(int protocol, int msgtype) { return 0; }
+static inline void rtnl_unregister_all(int protocol) {}
+#endif

static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
{
@@ -95,11 +106,25 @@ struct rtnl_link_ops {
const struct net_device *slave_dev);
};

+#ifdef CONFIG_RTNETLINK
int __rtnl_link_register(struct rtnl_link_ops *ops);
void __rtnl_link_unregister(struct rtnl_link_ops *ops);

int rtnl_link_register(struct rtnl_link_ops *ops);
void rtnl_link_unregister(struct rtnl_link_ops *ops);
+#else
+/* Return 0 to make the respective init functions not error out.
+ * We assume the subsystems are still somewhat useful even without
+ * rtnetlink.
+ */
+static inline int __rtnl_link_register(struct rtnl_link_ops *ops)
+{ return 0; }
+static inline void __rtnl_link_unregister(struct rtnl_link_ops *ops) {}
+
+static inline int rtnl_link_register(struct rtnl_link_ops *ops)
+{ return 0; }
+static inline void rtnl_link_unregister(struct rtnl_link_ops *ops) {}
+#endif

/**
* struct rtnl_af_ops - rtnetlink address family operations
@@ -129,10 +154,20 @@ struct rtnl_af_ops {
const struct nlattr *attr);
};

+#ifdef CONFIG_RTNETLINK
+int __rtnl_af_register(struct rtnl_af_ops *ops);
void __rtnl_af_unregister(struct rtnl_af_ops *ops);

void rtnl_af_register(struct rtnl_af_ops *ops);
void rtnl_af_unregister(struct rtnl_af_ops *ops);
+#else
+static inline int __rtnl_af_register(struct rtnl_af_ops *ops)
+{ return -EINVAL; }
+static inline void __rtnl_af_unregister(struct rtnl_af_ops *ops) {}
+
+static inline int rtnl_af_register(struct rtnl_af_ops *ops) { return -EINVAL; }
+static inline void rtnl_af_unregister(struct rtnl_af_ops *ops) {}
+#endif

struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]);
struct net_device *rtnl_create_link(struct net *net, char *ifname,
diff --git a/net/Kconfig b/net/Kconfig
index 82a5764..f5196ba 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -24,6 +24,14 @@ menuconfig NET

if NET

+config RTNETLINK
+ bool "rtnetlink"
+ default y
+ help
+ Enable rtnetlink to configure routing and related setups.
+ This is needed for most modern configuration
+ tools, but old ifconfig can do without it.
+
config WANT_COMPAT_NETLINK_MESSAGES
bool
help
diff --git a/net/core/Makefile b/net/core/Makefile
index e05bd9c..50d4850 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,11 +9,12 @@ obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o

obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
- neighbour.o rtnetlink.o utils.o link_watch.o \
+ neighbour.o utils.o link_watch.o \
sock_diag.o dev_ioctl.o

obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
+obj-$(CONFIG_RTNETLINK) += rtnetlink.o
obj-$(CONFIG_NET_ETHTOOL) += ethtool.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 255aa99..3221b0e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -917,6 +917,9 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
#undef BRD1_OK
}

+#ifdef CONFIG_RTNETLINK
+/* Isn't really rtnetlink, but close enough for this CONFIG. */
+
static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
{

@@ -994,6 +997,10 @@ static void nl_fib_lookup_exit(struct net *net)
netlink_kernel_release(net->ipv4.fibnl);
net->ipv4.fibnl = NULL;
}
+#else
+static inline void nl_fib_lookup_exit(struct net *net) {}
+static inline int nl_fib_lookup_init(struct net *net) { return 0; }
+#endif

static void fib_disable_ip(struct net_device *dev, int force)
{
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 1e4f660..ec29c81 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -27,11 +27,23 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg);
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+#ifdef CONFIG_RTNETLINK
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
unsigned int);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+#else
+static inline int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq,
+ int event, u32 tb_id,
+ u8 type, __be32 dst, int dst_len, u8 tos,
+ struct fib_info *fi,
+ unsigned int f) { return -EINVAL; }
+static inline void
+rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
+ u32 tb_id, const struct nl_info *info, unsigned int nlm_flags)
+{}
+#endif
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);

static inline void fib_result_assign(struct fib_result *res,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c3d4e4d..75be44d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -356,6 +356,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
return -1;
}

+#ifdef CONFIG_RTNETLINK
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
@@ -411,6 +412,7 @@ errout:
if (err < 0)
rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
+#endif

/* Return the first fib alias matching TOS with
* priority less than or equal to PRIO.
@@ -998,6 +1000,7 @@ failure:
return ERR_PTR(err);
}

+#ifdef CONFIG_RTNETLINK
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
struct fib_info *fi, unsigned int flags)
@@ -1089,6 +1092,7 @@ nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
+#endif

/*
* Update FIB if:

--
1.9.0

David Miller

2014-05-06 03:08:30 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:26:01 -0700

Post by Andi Kleen
Small systems can use ioctl/ifconfig for routing and
interface configuration. Make rtnetlink optional
This saves ~29k without LTO, more with LTO.
text data bss dec hex filename
483545 19371 13480 516396 7e12c net/built-in.o-with-rtnetlink
454365 19275 12936 486576 76cb0 net/built-in.o-wo-rtnetlink

Andi, I've had about enough with this patch series.

We have moved several tools to only use netlink because it's
the only extensible facility, and we are not looking back.

The moment one of these "small" systems, or whatever you want to call
it, tries to use any feature added to device configuration in the last
20 years ioctl doesn't cut it.

Andi Kleen

2014-05-06 03:11:27 UTC

Post by David Miller
The moment one of these "small" systems, or whatever you want to call
it, tries to use any feature added to device configuration in the last
20 years ioctl doesn't cut it.

They will simply not use all that complexity ever, 400k is simply prohibitive
with 2MB.

If you want to keep netlink what would you remove instead?

-Andi

--
***@linux.intel.com -- Speaking for myself only.

Andi Kleen

2014-05-05 22:26:04 UTC

From: Andi Kleen <***@linux.intel.com>

There was already a CONFIG_IP_MULTICAST, but it didn't control
the multicasting code (like igmp) in the IP stack, only some
driver code. Change this.

This disables some multicast filters in TCP/UDP too, but
that should be ok because we never join multicast groups
without this option, so the packets should never arrive up-stack.

Worth ~20k when disabled

text data bss dec hex filename
420363 17509 11624 449496 6dbd8 net/built-in.o-with-mcast
399649 17381 11624 428654 68a6e net/built-in.o-wo-mcast

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/linux/igmp.h | 31 +++++++++++++++++++++++--------
net/ipv4/Kconfig | 8 +++-----
net/ipv4/Makefile | 3 ++-
net/ipv4/devinet.c | 2 ++
net/ipv4/ip_sockglue.c | 4 ++++
net/ipv4/sysctl_net_ipv4.c | 2 ++
6 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index f47550d..1a1a044 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -108,11 +108,7 @@ struct ip_mc_list {
#define IGMPV3_QQIC(value) IGMPV3_EXP(0x80, 4, 3, value)
#define IGMPV3_MRC(value) IGMPV3_EXP(0x80, 4, 3, value)

-extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u16 proto);
extern int igmp_rcv(struct sk_buff *);
-extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
-extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
-extern void ip_mc_drop_socket(struct sock *sk);
extern int ip_mc_source(int add, int omode, struct sock *sk,
struct ip_mreq_source *mreqs, int ifindex);
extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex);
@@ -120,14 +116,33 @@ extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
struct ip_msfilter __user *optval, int __user *optlen);
extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
struct group_filter __user *optval, int __user *optlen);
-extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif);
-extern void ip_mc_init_dev(struct in_device *);
-extern void ip_mc_destroy_dev(struct in_device *);
-extern void ip_mc_up(struct in_device *);
extern void ip_mc_down(struct in_device *);
extern void ip_mc_unmap(struct in_device *);
extern void ip_mc_remap(struct in_device *);
extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr);
extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr);

+#ifdef CONFIG_IP_MULTICAST
+extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif);
+extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u16 proto);
+extern void ip_mc_up(struct in_device *);
+extern void ip_mc_destroy_dev(struct in_device *);
+extern void ip_mc_init_dev(struct in_device *);
+extern void ip_mc_drop_socket(struct sock *sk);
+extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr);
+extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr);
+#else
+static inline int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif)
+{ return 0; }
+static inline int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u16 proto)
+{ return 0; }
+static inline void ip_mc_up(struct in_device *d) {}
+static inline void ip_mc_destroy_dev(struct in_device *d) {}
+static inline void ip_mc_init_dev(struct in_device *d) {}
+static inline void ip_mc_drop_socket(struct sock *sk) {}
+static inline int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{ return -EINVAL; }
+static inline int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{ return -EINVAL; }
+#endif
#endif
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 00a7f76..c1f9899 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -5,11 +5,9 @@ config IP_MULTICAST
bool "IP: multicasting"
help
This is code for addressing several networked computers at once,
- enlarging your kernel by about 2 KB. You need multicasting if you
- intend to participate in the MBONE, a high bandwidth network on top
- of the Internet which carries audio and video broadcasts. More
- information about the MBONE is on the WWW at
- <http://www.savetz.com/mbone/>. For most people, it's safe to say N.
+ enlarging your kernel by about 20 KB. This is needed for many
+ modern networking services on the local network, so you should
+ probably say Y.

config IP_ADVANCED_ROUTER
bool "IP: advanced router"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 784a782..eb129a4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_fastopen.o \
datagram.o raw.o udp.o udplite.o \
- arp.o icmp.o devinet.o af_inet.o igmp.o \
+ arp.o icmp.o devinet.o af_inet.o \
fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o ip_tunnel_core.o gre_offload.o

@@ -37,6 +37,7 @@ obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_IP_MULTICAST) += igmp.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index bdbf68b..c19266d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1402,6 +1402,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
/* Send gratuitous ARP to notify of link change */
inetdev_send_gratuitous_arp(dev, in_dev);
break;
+#ifdef CONFIG_IP_MULTICAST
case NETDEV_DOWN:
ip_mc_down(in_dev);
break;
@@ -1411,6 +1412,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
case NETDEV_POST_TYPE_CHANGE:
ip_mc_remap(in_dev);
break;
+#endif
case NETDEV_CHANGEMTU:
if (inetdev_valid_mtu(dev->mtu))
break;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64741b9..ed5c7bd 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -652,6 +652,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
if (!val)
skb_queue_purge(&sk->sk_error_queue);
break;
+#ifdef CONFIG_IP_MULTICAST
case IP_MULTICAST_TTL:
if (sk->sk_type == SOCK_STREAM)
goto e_inval;
@@ -1010,6 +1011,7 @@ mc_msf_out:
goto e_inval;
inet->mc_all = val;
break;
+#endif
case IP_ROUTER_ALERT:
err = ip_ra_control(sk, val ? 1 : 0, NULL);
break;
@@ -1248,6 +1250,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_RECVERR:
val = inet->recverr;
break;
+#ifdef CONFIG_IP_MULTICAST
case IP_MULTICAST_TTL:
val = inet->mc_ttl;
break;
@@ -1310,6 +1313,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MULTICAST_ALL:
val = inet->mc_all;
break;
+#endif
case IP_PKTOPTIONS:
{
struct msghdr msg;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 2110d2e..fe5823a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -443,6 +443,7 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_do_large_bitmap,
},
+#ifdef CONFIG_IP_MULTICAST
{
.procname = "igmp_max_memberships",
.data = &sysctl_igmp_max_memberships,
@@ -457,6 +458,7 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+#endif
{
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,

--
1.9.0

Andi Kleen

2014-05-05 22:26:07 UTC

From: Andi Kleen <***@linux.intel.com>

Not really needed on small embedded systems. Saves about 5k text,
more with IPv6.

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/net/ip.h | 5 +++++
include/net/ipv6.h | 5 +++++
include/net/raw.h | 15 +++++++++------
include/net/rawv6.h | 8 ++++++++
include/net/transp_v6.h | 5 +++++
net/ipv4/Kconfig | 5 +++++
net/ipv4/Makefile | 3 ++-
net/ipv4/af_inet.c | 8 ++++++++
net/ipv4/ip_input.c | 2 ++
net/ipv4/proc.c | 2 ++
net/ipv6/Kconfig | 1 +
net/ipv6/Makefile | 3 ++-
net/ipv6/af_inet6.c | 4 ++++
net/ipv6/ip6_output.c | 4 ++++
net/ipv6/proc.c | 2 ++
15 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 6764e30..bc878f3 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -445,7 +445,12 @@ static inline int sk_mc_loop(struct sock *sk)
return 1;
}

+#ifdef CONFIG_INET_RAW
bool ip_call_ra_chain(struct sk_buff *skb);
+#else
+static inline bool ip_call_ra_chain(struct sk_buff *skb)
+{ return false; }
+#endif

/*
* Functions provided by ip_fragment.c
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 3c4c041..f4dae3a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -822,8 +822,13 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
#ifdef CONFIG_PROC_FS
int ac6_proc_init(struct net *net);
void ac6_proc_exit(struct net *net);
+#ifdef CONFIG_INET_RAW
int raw6_proc_init(void);
void raw6_proc_exit(void);
+#else
+static inline int raw6_proc_init(void) { return 0; }
+static inline void raw6_proc_exit(void) {}
+#endif
int tcp6_proc_init(struct net *net);
void tcp6_proc_exit(struct net *net);
int udp6_proc_init(struct net *net);
diff --git a/include/net/raw.h b/include/net/raw.h
index 6a40c65..5ece765 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -23,8 +23,17 @@

extern struct proto raw_prot;

+#ifdef CONFIG_INET_RAW
void raw_icmp_error(struct sk_buff *, int, u32);
int raw_local_deliver(struct sk_buff *, int);
+int raw_proc_init(void);
+void raw_proc_exit(void);
+#else
+static inline void raw_icmp_error(struct sk_buff *skb, int a, u32 b) {}
+static inline int raw_local_deliver(struct sk_buff *skb, int s) { return 0; }
+static inline int raw_proc_init(void) { return 0; }
+static inline void raw_proc_exit(void) {}
+#endif

int raw_rcv(struct sock *, struct sk_buff *);

@@ -35,10 +44,6 @@ struct raw_hashinfo {
struct hlist_head ht[RAW_HTABLE_SIZE];
};

-#ifdef CONFIG_PROC_FS
-int raw_proc_init(void);
-void raw_proc_exit(void);
-
struct raw_iter_state {
struct seq_net_private p;
int bucket;
@@ -55,8 +60,6 @@ void raw_seq_stop(struct seq_file *seq, void *v);
int raw_seq_open(struct inode *ino, struct file *file,
struct raw_hashinfo *h, const struct seq_operations *ops);

-#endif
-
void raw_hash_sk(struct sock *sk);
void raw_unhash_sk(struct sock *sk);

diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index 87783de..acb81dc 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -3,9 +3,17 @@

#include <net/protocol.h>

+
+#ifdef CONFIG_INET_RAW
void raw6_icmp_error(struct sk_buff *, int nexthdr,
u8 type, u8 code, int inner_offset, __be32);
bool raw6_local_deliver(struct sk_buff *, int);
+#else
+static inline void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
+ u8 type, u8 code, int inner_offset, __be32 x) {}
+static inline bool raw6_local_deliver(struct sk_buff *skb, int x)
+{ return false; }
+#endif

int rawv6_rcv(struct sock *sk, struct sk_buff *skb);

diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index 7a0e229..801267c 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -19,8 +19,13 @@ int ipv6_frag_init(void);
void ipv6_frag_exit(void);

/* transport protocols */
+#ifdef CONFIG_INET_RAW
int rawv6_init(void);
void rawv6_exit(void);
+#else
+static inline int rawv6_init(void) { return 0; }
+static inline void rawv6_exit(void) {}
+#endif
int udpv6_init(void);
void udpv6_exit(void);
int udplitev6_init(void);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index df5c569..cdb4f57 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -62,6 +62,10 @@ config IP_FIB_TRIE_STATS
Keep track of statistics on structure of FIB TRIE table.
Useful for testing and measuring TRIE performance.

+config INET_RAW
+ bool "IP: Support raw sockets"
+ default y
+
config IP_MULTIPLE_TABLES
bool "IP: policy routing"
depends on IP_ADVANCED_ROUTER
@@ -218,6 +222,7 @@ config NET_IPGRE_BROADCAST
config IP_MROUTE
bool "IP: multicast routing"
depends on IP_MULTICAST
+ select INET_RAW
help
This is used if you want your machine to act as a router for IP
packets that have several destination addresses. It is needed on the
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index addecef..9353beb 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o \
- datagram.o raw.o udp.o udplite.o \
+ datagram.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o \
fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o ip_tunnel_core.o gre_offload.o
@@ -58,6 +58,7 @@ obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
obj-$(CONFIG_TCP_FASTOPEN) += tcp_fastopen.o
obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_INET_RAW) += raw.o

obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 46b1815..cdcf1e8 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -956,6 +956,7 @@ const struct proto_ops inet_dgram_ops = {
};
EXPORT_SYMBOL(inet_dgram_ops);

+#ifdef CONFIG_INET_RAW
/*
* For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
* udp_poll
@@ -985,6 +986,7 @@ static const struct proto_ops inet_sockraw_ops = {
.compat_ioctl = inet_compat_ioctl,
#endif
};
+#endif

static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
@@ -1026,6 +1028,7 @@ static struct inet_protosw inetsw_array[] =
},
#endif

+#ifdef CONFIG_INET_RAW
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
@@ -1034,6 +1037,7 @@ static struct inet_protosw inetsw_array[] =
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
+#endif
};

#define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)
@@ -1737,9 +1741,11 @@ static int __init inet_init(void)
if (rc)
goto out_unregister_tcp_proto;

+#ifdef CONFIG_INET_RAW
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
+#endif

#ifdef CONFIG_IP_PING
rc = proto_register(&ping_prot, 1);
@@ -1835,8 +1841,10 @@ static int __init inet_init(void)
out:
return rc;
out_unregister_raw_proto:
+#ifdef CONFIG_INET_RAW
proto_unregister(&raw_prot);
out_unregister_udp_proto:
+#endif
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3d4da2c..8497bc1 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -147,6 +147,7 @@
#include <linux/mroute.h>
#include <linux/netlink.h>

+#ifdef CONFIG_INET_RAW
/*
* Process Router Attention IP option (RFC 2113)
*/
@@ -186,6 +187,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
}
return false;
}
+#endif

static int ip_local_deliver_finish(struct sk_buff *skb)
{
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ad737fa..5f040eb 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -69,8 +69,10 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
proto_memory_allocated(&udp_prot));
seq_printf(seq, "UDPLITE: inuse %d\n",
sock_prot_inuse_get(net, &udplite_prot));
+#ifdef CONFIG_INET_RAW
seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot));
+#endif
seq_printf(seq, "FRAG: inuse %d memory %d\n",
ip_frag_nqueues(net), ip_frag_mem(net));
return 0;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 438a73a..bb3ffec 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -88,6 +88,7 @@ config INET6_IPCOMP
config IPV6_MIP6
tristate "IPv6: Mobility"
select XFRM
+ select INET_RAW
---help---
Support for IPv6 Mobility described in RFC 3775.

diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 7ce7aa0..19f0e00 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -7,13 +7,14 @@ obj-$(CONFIG_IPV6) += ipv6.o
ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
addrlabel.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
- raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
+ icmp.o mcast.o reassembly.o tcp_ipv6.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o

ipv6-offload-$(CONFIG_IP_OFFLOAD) := ip6_offload.o tcpv6_offload.o \
udp_offload.o exthdrs_offload.o

ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
+ipv6-$(CONFIG_INET_RAW) += raw.o
ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
ipv6-$(CONFIG_IP_PING) += ping.o

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 327042a..fc36df6 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -865,9 +865,11 @@ static int __init inet6_init(void)
if (err)
goto out_unregister_udp_proto;

+#ifdef CONFIG_INET_RAW
err = proto_register(&rawv6_prot, 1);
if (err)
goto out_unregister_udplite_proto;
+#endif

#ifdef CONFIG_IP_PING
err = proto_register(&pingv6_prot, 1);
@@ -1036,8 +1038,10 @@ out_unregister_ping_proto:
proto_unregister(&pingv6_prot);
out_unregister_raw_proto:
#endif
+#ifdef CONFIG_INET_RAW
proto_unregister(&rawv6_prot);
out_unregister_udplite_proto:
+#endif
proto_unregister(&udplitev6_prot);
out_unregister_udp_proto:
proto_unregister(&udpv6_prot);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 40e7581..fb21dde 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -235,6 +235,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,

EXPORT_SYMBOL(ip6_xmit);

+#ifdef CONFIG_INET_RAW
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
struct ip6_ra_chain *ra;
@@ -263,6 +264,7 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
read_unlock(&ip6_ra_lock);
return 0;
}
+#endif

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
@@ -394,10 +396,12 @@ int ip6_forward(struct sk_buff *skb)
* cannot be fragmented, because there is no warranty
* that different fragments will go along one path. --ANK
*/
+#ifdef CONFIG_INET_RAW
if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
return 0;
}
+#endif

/*
* check and decrement ttl
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 091d066..af0d18a 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -40,8 +40,10 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udpv6_prot));
seq_printf(seq, "UDPLITE6: inuse %d\n",
sock_prot_inuse_get(net, &udplitev6_prot));
+#ifdef CONFIG_INET_RAW
seq_printf(seq, "RAW6: inuse %d\n",
sock_prot_inuse_get(net, &rawv6_prot));
+#endif
seq_printf(seq, "FRAG6: inuse %d memory %d\n",
ip6_frag_nqueues(net), ip6_frag_mem(net));
return 0;

--
1.9.0

David Miller

2014-05-06 03:12:51 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:26:07 -0700

Post by Andi Kleen
Not really needed on small embedded systems. Saves about 5k text,
more with IPv6.

Sorry, you can't have half a functioning ipv4 stack.

I'm not applying this.

Andi Kleen

2014-05-05 22:26:13 UTC

From: Andi Kleen <***@linux.intel.com>

Convert all drivers to use SET_ETHTOOL_OPS. This allows
the compiler to throw the code away when CONFIG_ETHTOOL
is disabled, saving text size.

This is a purely mechanical patch, done with the following
coccinelle script (also available in
scripts/coccinelle/api/ethtool_ops.cocci)

// Convert network drivers to use the SET_ETHTOOL_OPS macro
// This allows to compile out the ethtool code when not needed.
//
@@
struct ethtool_ops *ops;
struct net_device *dev;
@@
- dev->ethtool_ops = ops;
+ SET_ETHTOOL_OPS(dev, ops);

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
arch/um/drivers/net_kern.c | 2 +-
drivers/firewire/net.c | 2 +-
drivers/infiniband/hw/nes/nes_nic.c | 2 +-
drivers/net/bonding/bond_main.c | 2 +-
drivers/net/cris/eth_v10.c | 2 +-
drivers/net/ethernet/3com/3c515.c | 2 +-
drivers/net/ethernet/3com/3c59x.c | 2 +-
drivers/net/ethernet/8390/ax88796.c | 2 +-
drivers/net/ethernet/8390/etherh.c | 2 +-
drivers/net/ethernet/8390/ne2k-pci.c | 2 +-
drivers/net/ethernet/adi/bfin_mac.c | 2 +-
drivers/net/ethernet/aeroflex/greth.c | 2 +-
drivers/net/ethernet/allwinner/sun4i-emac.c | 2 +-
drivers/net/ethernet/amd/pcnet32.c | 2 +-
drivers/net/ethernet/amd/sunlance.c | 2 +-
drivers/net/ethernet/apple/bmac.c | 2 +-
drivers/net/ethernet/arc/emac_main.c | 2 +-
drivers/net/ethernet/atheros/atlx/atl1.c | 2 +-
drivers/net/ethernet/broadcom/bnx2.c | 2 +-
drivers/net/ethernet/broadcom/tg3.c | 2 +-
drivers/net/ethernet/cadence/at91_ether.c | 2 +-
drivers/net/ethernet/cirrus/ep93xx_eth.c | 2 +-
drivers/net/ethernet/davicom/dm9000.c | 2 +-
drivers/net/ethernet/dec/tulip/de2104x.c | 2 +-
drivers/net/ethernet/dec/tulip/dmfe.c | 2 +-
drivers/net/ethernet/dec/tulip/uli526x.c | 2 +-
drivers/net/ethernet/dec/tulip/winbond-840.c | 2 +-
drivers/net/ethernet/dnet.c | 2 +-
drivers/net/ethernet/ethoc.c | 2 +-
drivers/net/ethernet/fealnx.c | 2 +-
drivers/net/ethernet/freescale/fec_main.c | 2 +-
drivers/net/ethernet/freescale/fec_mpc52xx.c | 2 +-
drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c | 2 +-
drivers/net/ethernet/freescale/gianfar.c | 2 +-
drivers/net/ethernet/ibm/ibmveth.c | 2 +-
drivers/net/ethernet/jme.c | 2 +-
drivers/net/ethernet/korina.c | 2 +-
drivers/net/ethernet/lantiq_etop.c | 2 +-
drivers/net/ethernet/marvell/skge.c | 2 +-
drivers/net/ethernet/micrel/ks8842.c | 2 +-
drivers/net/ethernet/micrel/ks8851_mll.c | 2 +-
drivers/net/ethernet/nuvoton/w90p910_ether.c | 2 +-
drivers/net/ethernet/nxp/lpc_eth.c | 2 +-
drivers/net/ethernet/octeon/octeon_mgmt.c | 2 +-
drivers/net/ethernet/pasemi/pasemi_mac.c | 2 +-
drivers/net/ethernet/rdc/r6040.c | 2 +-
drivers/net/ethernet/realtek/8139cp.c | 2 +-
drivers/net/ethernet/realtek/8139too.c | 2 +-
drivers/net/ethernet/sfc/efx.c | 2 ++
drivers/net/ethernet/sgi/ioc3-eth.c | 2 +-
drivers/net/ethernet/silan/sc92031.c | 2 +-
drivers/net/ethernet/sis/sis900.c | 2 +-
drivers/net/ethernet/smsc/epic100.c | 2 +-
drivers/net/ethernet/smsc/smc911x.c | 2 +-
drivers/net/ethernet/smsc/smc91x.c | 2 +-
drivers/net/ethernet/smsc/smsc911x.c | 2 +-
drivers/net/ethernet/smsc/smsc9420.c | 2 +-
drivers/net/ethernet/sun/cassini.c | 2 +-
drivers/net/ethernet/sun/niu.c | 2 +-
drivers/net/ethernet/sun/sunbmac.c | 2 +-
drivers/net/ethernet/sun/sungem.c | 2 +-
drivers/net/ethernet/sun/sunhme.c | 4 ++--
drivers/net/ethernet/sun/sunqe.c | 2 +-
drivers/net/ethernet/sun/sunvnet.c | 2 +-
drivers/net/ethernet/ti/cpmac.c | 2 +-
drivers/net/ethernet/toshiba/ps3_gelic_net.c | 2 +-
drivers/net/ethernet/toshiba/ps3_gelic_wireless.c | 2 +-
drivers/net/ethernet/toshiba/spider_net.c | 2 +-
drivers/net/ethernet/toshiba/tc35815.c | 2 +-
drivers/net/ethernet/tundra/tsi108_eth.c | 2 +-
drivers/net/ethernet/via/via-rhine.c | 2 +-
drivers/net/ethernet/via/via-velocity.c | 2 +-
drivers/net/ethernet/wiznet/w5100.c | 2 +-
drivers/net/ethernet/wiznet/w5300.c | 2 +-
drivers/net/ethernet/xilinx/ll_temac_main.c | 2 +-
drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +-
drivers/net/ethernet/xircom/xirc2ps_cs.c | 2 +-
drivers/net/ethernet/xscale/ixp4xx_eth.c | 2 +-
drivers/net/loopback.c | 2 +-
drivers/net/macvlan.c | 2 +-
drivers/net/nlmon.c | 2 +-
drivers/net/team/team.c | 2 +-
drivers/net/tun.c | 2 +-
drivers/net/usb/mcs7830.c | 2 +-
drivers/net/usb/sr9700.c | 2 +-
drivers/net/usb/usbnet.c | 2 +-
drivers/net/veth.c | 2 +-
drivers/net/wimax/i2400m/netdev.c | 2 +-
drivers/net/wimax/i2400m/usb.c | 2 +-
drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c | 2 +-
drivers/net/wireless/ipw2x00/ipw2100.c | 2 +-
drivers/net/wireless/ipw2x00/ipw2200.c | 2 +-
drivers/net/wireless/libertas/main.c | 2 +-
drivers/net/wireless/libertas/mesh.c | 2 +-
drivers/net/wireless/mwifiex/cfg80211.c | 2 +-
drivers/net/wireless/prism54/islpci_dev.c | 2 +-
drivers/staging/bcm/Bcmnet.c | 2 +-
drivers/staging/rtl8192e/rtl8192e/rtl_core.c | 2 +-
drivers/staging/wlags49_h2/wl_netdev.c | 3 +--
net/8021q/vlan_dev.c | 2 +-
100 files changed, 102 insertions(+), 101 deletions(-)

diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 7d26d9c..1a8125f 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -477,7 +477,7 @@ static void eth_configure(int n, void *init, char *mac,

dev->mtu = transport->user->mtu;
dev->netdev_ops = &uml_netdev_ops;
- dev->ethtool_ops = &uml_net_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &uml_net_ethtool_ops);
dev->watchdog_timeo = (HZ >> 1);
dev->irq = UM_ETH_IRQ;

diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index 4af0a7b..f0bc9bc 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -1393,7 +1393,7 @@ static void fwnet_init_dev(struct net_device *net)
net->hard_header_len = FWNET_HLEN;
net->type = ARPHRD_IEEE1394;
net->tx_queue_len = FWNET_TX_QUEUE_LEN;
- net->ethtool_ops = &fwnet_ethtool_ops;
+ SET_ETHTOOL_OPS(net, &fwnet_ethtool_ops);
}

/* caller must hold fwnet_device_mutex */
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
index 49eb511..316b888 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -1678,7 +1678,7 @@ struct net_device *nes_netdev_init(struct nes_device *nesdev,
netdev->addr_len = ETH_ALEN;
netdev->type = ARPHRD_ETHER;
netdev->netdev_ops = &nes_netdev_ops;
- netdev->ethtool_ops = &nes_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &nes_ethtool_ops);
netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128);
nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n");

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 69aff72..ef89255 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3910,7 +3910,7 @@ void bond_setup(struct net_device *bond_dev)
/* Initialize the device entry points */
ether_setup(bond_dev);
bond_dev->netdev_ops = &bond_netdev_ops;
- bond_dev->ethtool_ops = &bond_ethtool_ops;
+ SET_ETHTOOL_OPS(bond_dev, &bond_ethtool_ops);

bond_dev->destructor = bond_destructor;

diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c
index 29e272c..fc8ca1f 100644
--- a/drivers/net/cris/eth_v10.c
+++ b/drivers/net/cris/eth_v10.c
@@ -314,7 +314,7 @@ etrax_ethernet_init(void)

/* fill in our handlers so the network layer can talk to us in the future */

- dev->ethtool_ops = &e100_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &e100_ethtool_ops);
dev->netdev_ops = &e100_netdev_ops;

spin_lock_init(&np->lock);
diff --git a/drivers/net/ethernet/3com/3c515.c b/drivers/net/ethernet/3com/3c515.c
index 94c656f..9fc8d80 100644
--- a/drivers/net/ethernet/3com/3c515.c
+++ b/drivers/net/ethernet/3com/3c515.c
@@ -698,7 +698,7 @@ static int corkscrew_setup(struct net_device *dev, int ioaddr,
/* The 3c51x-specific entries in the device structure. */
dev->netdev_ops = &netdev_ops;
dev->watchdog_timeo = (400 * HZ) / 1000;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);

return register_netdev(dev);
}
diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c
index 61477b8..eb0d3b6 100644
--- a/drivers/net/ethernet/3com/3c59x.c
+++ b/drivers/net/ethernet/3com/3c59x.c
@@ -1467,7 +1467,7 @@ static int vortex_probe1(struct device *gendev, void __iomem *ioaddr, int irq,
(dev->features & NETIF_F_IP_CSUM) ? "en":"dis");
}

- dev->ethtool_ops = &vortex_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &vortex_ethtool_ops);
dev->watchdog_timeo = (watchdog * HZ) / 1000;

if (pdev) {
diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c
index 455d4c3..5650712 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -782,7 +782,7 @@ static int ax_init_dev(struct net_device *dev)
ei_local->msg_enable = ax_msg_enable;

dev->netdev_ops = &ax_netdev_ops;
- dev->ethtool_ops = &ax_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &ax_ethtool_ops);

ret = ax_mii_init(dev);
if (ret)
diff --git a/drivers/net/ethernet/8390/etherh.c b/drivers/net/ethernet/8390/etherh.c
index b36ee9e..c0846b8 100644
--- a/drivers/net/ethernet/8390/etherh.c
+++ b/drivers/net/ethernet/8390/etherh.c
@@ -688,7 +688,7 @@ etherh_probe(struct expansion_card *ec, const struct ecard_id *id)

dev->netdev_ops = &etherh_netdev_ops;
dev->irq = ec->irq;
- dev->ethtool_ops = &etherh_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &etherh_ethtool_ops);

if (data->supported & SUPPORTED_Autoneg)
dev->flags |= IFF_AUTOMEDIA;
diff --git a/drivers/net/ethernet/8390/ne2k-pci.c b/drivers/net/ethernet/8390/ne2k-pci.c
index f395c96..67c91ff 100644
--- a/drivers/net/ethernet/8390/ne2k-pci.c
+++ b/drivers/net/ethernet/8390/ne2k-pci.c
@@ -373,7 +373,7 @@ static int ne2k_pci_init_one(struct pci_dev *pdev,
ei_status.get_8390_hdr = &ne2k_pci_get_8390_hdr;
ei_status.priv = (unsigned long) pdev;

- dev->ethtool_ops = &ne2k_pci_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &ne2k_pci_ethtool_ops);
NS8390_init(dev, 0);

memcpy(dev->dev_addr, SA_prom, dev->addr_len);
diff --git a/drivers/net/ethernet/adi/bfin_mac.c b/drivers/net/ethernet/adi/bfin_mac.c
index 7ae74d4..826016bc 100644
--- a/drivers/net/ethernet/adi/bfin_mac.c
+++ b/drivers/net/ethernet/adi/bfin_mac.c
@@ -1683,7 +1683,7 @@ static int bfin_mac_probe(struct platform_device *pdev)
ether_setup(ndev);

ndev->netdev_ops = &bfin_mac_netdev_ops;
- ndev->ethtool_ops = &bfin_mac_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &bfin_mac_ethtool_ops);

init_timer(&lp->tx_reclaim_timer);
lp->tx_reclaim_timer.data = (unsigned long)lp;
diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c
index 23578df..3608b97 100644
--- a/drivers/net/ethernet/aeroflex/greth.c
+++ b/drivers/net/ethernet/aeroflex/greth.c
@@ -1529,7 +1529,7 @@ static int greth_of_probe(struct platform_device *ofdev)
}

dev->netdev_ops = &greth_netdev_ops;
- dev->ethtool_ops = &greth_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &greth_ethtool_ops);

err = register_netdev(dev);
if (err) {
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c
index 2846067..3a6c75a 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -879,7 +879,7 @@ static int emac_probe(struct platform_device *pdev)

ndev->netdev_ops = &emac_netdev_ops;
ndev->watchdog_timeo = msecs_to_jiffies(watchdog);
- ndev->ethtool_ops = &emac_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &emac_ethtool_ops);

platform_set_drvdata(pdev, ndev);

diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c
index e7cc917..4a9b2cf 100644
--- a/drivers/net/ethernet/amd/pcnet32.c
+++ b/drivers/net/ethernet/amd/pcnet32.c
@@ -1900,7 +1900,7 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)

/* The PCNET32-specific entries in the device structure. */
dev->netdev_ops = &pcnet32_netdev_ops;
- dev->ethtool_ops = &pcnet32_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &pcnet32_ethtool_ops);
dev->watchdog_timeo = (5 * HZ);

/* Fill in the generic fields of the device structure. */
diff --git a/drivers/net/ethernet/amd/sunlance.c b/drivers/net/ethernet/amd/sunlance.c
index 5e4273b..ac937c3 100644
--- a/drivers/net/ethernet/amd/sunlance.c
+++ b/drivers/net/ethernet/amd/sunlance.c
@@ -1450,7 +1450,7 @@ no_link_test:
lp->dev = dev;
SET_NETDEV_DEV(dev, &op->dev);
dev->watchdog_timeo = 5*HZ;
- dev->ethtool_ops = &sparc_lance_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &sparc_lance_ethtool_ops);
dev->netdev_ops = &sparc_lance_ops;

dev->irq = op->archdata.irqs[0];
diff --git a/drivers/net/ethernet/apple/bmac.c b/drivers/net/ethernet/apple/bmac.c
index daae0e0..a2c6c2a 100644
--- a/drivers/net/ethernet/apple/bmac.c
+++ b/drivers/net/ethernet/apple/bmac.c
@@ -1301,7 +1301,7 @@ static int bmac_probe(struct macio_dev *mdev, const struct of_device_id *match)
bmwrite(dev, INTDISABLE, DisableAll);

dev->netdev_ops = &bmac_netdev_ops;
- dev->ethtool_ops = &bmac_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &bmac_ethtool_ops);

bmac_get_station_address(dev, addr);
if (bmac_verify_checksum(dev) != 0)
diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c
index eeecc29..75956dc 100644
--- a/drivers/net/ethernet/arc/emac_main.c
+++ b/drivers/net/ethernet/arc/emac_main.c
@@ -665,7 +665,7 @@ static int arc_emac_probe(struct platform_device *pdev)
SET_NETDEV_DEV(ndev, &pdev->dev);

ndev->netdev_ops = &arc_emac_netdev_ops;
- ndev->ethtool_ops = &arc_emac_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &arc_emac_ethtool_ops);
ndev->watchdog_timeo = TX_TIMEOUT;
/* FIXME :: no multicast support yet */
ndev->flags &= ~IFF_MULTICAST;
diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c
index dfd0e91..a50b729 100644
--- a/drivers/net/ethernet/atheros/atlx/atl1.c
+++ b/drivers/net/ethernet/atheros/atlx/atl1.c
@@ -3014,7 +3014,7 @@ static int atl1_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
netdev->watchdog_timeo = 5 * HZ;
netif_napi_add(netdev, &adapter->napi, atl1_rings_clean, 64);

- netdev->ethtool_ops = &atl1_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &atl1_ethtool_ops);
adapter->bd_number = cards_found;

/* setup the private structure */
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 0ab8370..9fac102 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -8556,7 +8556,7 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)

dev->netdev_ops = &bnx2_netdev_ops;
dev->watchdog_timeo = TX_TIMEOUT;
- dev->ethtool_ops = &bnx2_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &bnx2_ethtool_ops);

bp = netdev_priv(dev);

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index e5d95c5..903710b 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -17590,7 +17590,7 @@ static int tg3_init_one(struct pci_dev *pdev,
tp->rx_pending = TG3_DEF_RX_RING_PENDING;
tp->rx_jumbo_pending = TG3_DEF_RX_JUMBO_RING_PENDING;

- dev->ethtool_ops = &tg3_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &tg3_ethtool_ops);
dev->watchdog_timeo = TG3_TX_TIMEOUT;
dev->netdev_ops = &tg3_netdev_ops;
dev->irq = pdev->irq;
diff --git a/drivers/net/ethernet/cadence/at91_ether.c b/drivers/net/ethernet/cadence/at91_ether.c
index 4a79eda..f71c2b6 100644
--- a/drivers/net/ethernet/cadence/at91_ether.c
+++ b/drivers/net/ethernet/cadence/at91_ether.c
@@ -353,7 +353,7 @@ static int __init at91ether_probe(struct platform_device *pdev)

ether_setup(dev);
dev->netdev_ops = &at91ether_netdev_ops;
- dev->ethtool_ops = &macb_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &macb_ethtool_ops);
platform_set_drvdata(pdev, dev);
SET_NETDEV_DEV(dev, &pdev->dev);

diff --git a/drivers/net/ethernet/cirrus/ep93xx_eth.c b/drivers/net/ethernet/cirrus/ep93xx_eth.c
index 2be2a99..948e2e8 100644
--- a/drivers/net/ethernet/cirrus/ep93xx_eth.c
+++ b/drivers/net/ethernet/cirrus/ep93xx_eth.c
@@ -765,7 +765,7 @@ static struct net_device *ep93xx_dev_alloc(struct ep93xx_eth_data *data)

memcpy(dev->dev_addr, data->dev_addr, ETH_ALEN);

- dev->ethtool_ops = &ep93xx_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &ep93xx_ethtool_ops);
dev->netdev_ops = &ep93xx_netdev_ops;

dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c
index 8c4b93b..73f76a8 100644
--- a/drivers/net/ethernet/davicom/dm9000.c
+++ b/drivers/net/ethernet/davicom/dm9000.c
@@ -1602,7 +1602,7 @@ dm9000_probe(struct platform_device *pdev)

ndev->netdev_ops = &dm9000_netdev_ops;
ndev->watchdog_timeo = msecs_to_jiffies(watchdog);
- ndev->ethtool_ops = &dm9000_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &dm9000_ethtool_ops);

db->msg_enable = NETIF_MSG_LINK;
db->mii.phy_id_mask = 0x1f;
diff --git a/drivers/net/ethernet/dec/tulip/de2104x.c b/drivers/net/ethernet/dec/tulip/de2104x.c
index 38148b0..55c80eb 100644
--- a/drivers/net/ethernet/dec/tulip/de2104x.c
+++ b/drivers/net/ethernet/dec/tulip/de2104x.c
@@ -1985,7 +1985,7 @@ static int de_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)

dev->netdev_ops = &de_netdev_ops;
SET_NETDEV_DEV(dev, &pdev->dev);
- dev->ethtool_ops = &de_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &de_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;

de = netdev_priv(dev);
diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c
index 53f0c61..14bceaa 100644
--- a/drivers/net/ethernet/dec/tulip/dmfe.c
+++ b/drivers/net/ethernet/dec/tulip/dmfe.c
@@ -477,7 +477,7 @@ static int dmfe_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)

pci_set_drvdata(pdev, dev);
dev->netdev_ops = &netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
netif_carrier_off(dev);
spin_lock_init(&db->lock);

diff --git a/drivers/net/ethernet/dec/tulip/uli526x.c b/drivers/net/ethernet/dec/tulip/uli526x.c
index aa801a6..b54b462 100644
--- a/drivers/net/ethernet/dec/tulip/uli526x.c
+++ b/drivers/net/ethernet/dec/tulip/uli526x.c
@@ -372,7 +372,7 @@ static int uli526x_init_one(struct pci_dev *pdev,

/* Register some necessary functions */
dev->netdev_ops = &netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);

spin_lock_init(&db->lock);

diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c
index 62fe512..d780dc3 100644
--- a/drivers/net/ethernet/dec/tulip/winbond-840.c
+++ b/drivers/net/ethernet/dec/tulip/winbond-840.c
@@ -431,7 +431,7 @@ static int w840_probe1(struct pci_dev *pdev, const struct pci_device_id *ent)

/* The chip-specific entries in the device structure. */
dev->netdev_ops = &netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;

i = register_netdev(dev);
diff --git a/drivers/net/ethernet/dnet.c b/drivers/net/ethernet/dnet.c
index e9b0fab..1b17b9e 100644
--- a/drivers/net/ethernet/dnet.c
+++ b/drivers/net/ethernet/dnet.c
@@ -876,7 +876,7 @@ static int dnet_probe(struct platform_device *pdev)

dev->netdev_ops = &dnet_netdev_ops;
netif_napi_add(dev, &bp->napi, dnet_poll, 64);
- dev->ethtool_ops = &dnet_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &dnet_ethtool_ops);

dev->base_addr = (unsigned long)bp->regs;

diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c
index 8b70ca7..129a83c 100644
--- a/drivers/net/ethernet/ethoc.c
+++ b/drivers/net/ethernet/ethoc.c
@@ -1234,7 +1234,7 @@ static int ethoc_probe(struct platform_device *pdev)
netdev->netdev_ops = &ethoc_netdev_ops;
netdev->watchdog_timeo = ETHOC_TIMEOUT;
netdev->features |= 0;
- netdev->ethtool_ops = &ethoc_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &ethoc_ethtool_ops);

/* setup NAPI */
netif_napi_add(netdev, &priv->napi, ethoc_poll, 64);
diff --git a/drivers/net/ethernet/fealnx.c b/drivers/net/ethernet/fealnx.c
index 4b22a95..5cfac60 100644
--- a/drivers/net/ethernet/fealnx.c
+++ b/drivers/net/ethernet/fealnx.c
@@ -657,7 +657,7 @@ static int fealnx_init_one(struct pci_dev *pdev,
}

dev->netdev_ops = &netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;

err = register_netdev(dev);
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 8d69e43..f959bad 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2019,7 +2019,7 @@ static int fec_enet_init(struct net_device *ndev)
/* The FEC Ethernet specific entries in the device structure */
ndev->watchdog_timeo = TX_TIMEOUT;
ndev->netdev_ops = &fec_netdev_ops;
- ndev->ethtool_ops = &fec_enet_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &fec_enet_ethtool_ops);

writel(FEC_RX_DISABLED_IMASK, fep->hwp + FEC_IMASK);
netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, NAPI_POLL_WEIGHT);
diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c
index 9947765..779fe56 100644
--- a/drivers/net/ethernet/freescale/fec_mpc52xx.c
+++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c
@@ -879,7 +879,7 @@ static int mpc52xx_fec_probe(struct platform_device *op)

/* Init ether ndev with what we have */
ndev->netdev_ops = &mpc52xx_fec_netdev_ops;
- ndev->ethtool_ops = &mpc52xx_fec_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &mpc52xx_fec_ethtool_ops);
ndev->watchdog_timeo = FEC_WATCHDOG_TIMEOUT;
ndev->base_addr = mem.start;
SET_NETDEV_DEV(ndev, &op->dev);
diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
index dc80db4..8179097 100644
--- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
+++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
@@ -1103,7 +1103,7 @@ static int fs_enet_probe(struct platform_device *ofdev)
netif_napi_add(ndev, &fep->napi, fs_enet_rx_napi,
fpi->napi_weight);

- ndev->ethtool_ops = &fs_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &fs_ethtool_ops);

init_timer(&fep->phy_timer_list);

diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 9125d9a..2b5541f3 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -1304,7 +1304,7 @@ static int gfar_probe(struct platform_device *ofdev)
dev->watchdog_timeo = TX_TIMEOUT;
dev->mtu = 1500;
dev->netdev_ops = &gfar_netdev_ops;
- dev->ethtool_ops = &gfar_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &gfar_ethtool_ops);

/* Register for napi ...We are registering NAPI for each grp */
for (i = 0; i < priv->num_grps; i++) {
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index c912756..81e3dff 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -1384,7 +1384,7 @@ static int ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)

netdev->irq = dev->irq;
netdev->netdev_ops = &ibmveth_netdev_ops;
- netdev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &netdev_ethtool_ops);
SET_NETDEV_DEV(netdev, &dev->dev);
netdev->hw_features = NETIF_F_SG | NETIF_F_RXCSUM |
NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c
index b0c6050..a1dbc6a 100644
--- a/drivers/net/ethernet/jme.c
+++ b/drivers/net/ethernet/jme.c
@@ -3005,7 +3005,7 @@ jme_init_one(struct pci_dev *pdev,
goto err_out_release_regions;
}
netdev->netdev_ops = &jme_netdev_ops;
- netdev->ethtool_ops = &jme_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &jme_ethtool_ops);
netdev->watchdog_timeo = TX_TIMEOUT;
netdev->hw_features = NETIF_F_IP_CSUM |
NETIF_F_IPV6_CSUM |
diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c
index d74f5f4..b855327 100644
--- a/drivers/net/ethernet/korina.c
+++ b/drivers/net/ethernet/korina.c
@@ -1160,7 +1160,7 @@ static int korina_probe(struct platform_device *pdev)
lp->dev = dev;

dev->netdev_ops = &korina_netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;
netif_napi_add(dev, &lp->napi, korina_poll, 64);

diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c
index fd4b6ae..db7a814 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -741,7 +741,7 @@ ltq_etop_probe(struct platform_device *pdev)
}
strcpy(dev->name, "eth%d");
dev->netdev_ops = &ltq_eth_netdev_ops;
- dev->ethtool_ops = &ltq_etop_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &ltq_etop_ethtool_ops);
priv = netdev_priv(dev);
priv->res = res;
priv->pdev = pdev;
diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index 7f81ae6..a8c3226 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -3854,7 +3854,7 @@ static struct net_device *skge_devinit(struct skge_hw *hw, int port,

SET_NETDEV_DEV(dev, &hw->pdev->dev);
dev->netdev_ops = &skge_netdev_ops;
- dev->ethtool_ops = &skge_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &skge_ethtool_ops);
dev->watchdog_timeo = TX_WATCHDOG;
dev->irq = hw->pdev->irq;

diff --git a/drivers/net/ethernet/micrel/ks8842.c b/drivers/net/ethernet/micrel/ks8842.c
index 822616e..f3e19e9 100644
--- a/drivers/net/ethernet/micrel/ks8842.c
+++ b/drivers/net/ethernet/micrel/ks8842.c
@@ -1192,7 +1192,7 @@ static int ks8842_probe(struct platform_device *pdev)
spin_lock_init(&adapter->lock);

netdev->netdev_ops = &ks8842_netdev_ops;
- netdev->ethtool_ops = &ks8842_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &ks8842_ethtool_ops);

/* Check if a mac address was given */
i = netdev->addr_len;
diff --git a/drivers/net/ethernet/micrel/ks8851_mll.c b/drivers/net/ethernet/micrel/ks8851_mll.c
index c83d16d..2225381 100644
--- a/drivers/net/ethernet/micrel/ks8851_mll.c
+++ b/drivers/net/ethernet/micrel/ks8851_mll.c
@@ -1583,7 +1583,7 @@ static int ks8851_probe(struct platform_device *pdev)
spin_lock_init(&ks->statelock);

netdev->netdev_ops = &ks_netdev_ops;
- netdev->ethtool_ops = &ks_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &ks_ethtool_ops);

/* setup mii state */
ks->mii.dev = netdev;
diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c b/drivers/net/ethernet/nuvoton/w90p910_ether.c
index 79645f7..c64f33f 100644
--- a/drivers/net/ethernet/nuvoton/w90p910_ether.c
+++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c
@@ -945,7 +945,7 @@ static int w90p910_ether_setup(struct net_device *dev)

ether_setup(dev);
dev->netdev_ops = &w90p910_ether_netdev_ops;
- dev->ethtool_ops = &w90p910_ether_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &w90p910_ether_ethtool_ops);

dev->tx_queue_len = 16;
dev->dma = 0x0;
diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c
index 422d9b5..f2c2411 100644
--- a/drivers/net/ethernet/nxp/lpc_eth.c
+++ b/drivers/net/ethernet/nxp/lpc_eth.c
@@ -1379,7 +1379,7 @@ static int lpc_eth_drv_probe(struct platform_device *pdev)

/* Setup driver functions */
ndev->netdev_ops = &lpc_netdev_ops;
- ndev->ethtool_ops = &lpc_eth_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &lpc_eth_ethtool_ops);
ndev->watchdog_timeo = msecs_to_jiffies(2500);

/* Get size of DMA buffers/descriptors region */
diff --git a/drivers/net/ethernet/octeon/octeon_mgmt.c b/drivers/net/ethernet/octeon/octeon_mgmt.c
index 7dc3e9b..e47d8d3 100644
--- a/drivers/net/ethernet/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/octeon/octeon_mgmt.c
@@ -1541,7 +1541,7 @@ static int octeon_mgmt_probe(struct platform_device *pdev)
netdev->priv_flags |= IFF_UNICAST_FLT;

netdev->netdev_ops = &octeon_mgmt_ops;
- netdev->ethtool_ops = &octeon_mgmt_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &octeon_mgmt_ethtool_ops);

mac = of_get_mac_address(pdev->dev.of_node);

diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c
index 9abf70d7..e92d8e5 100644
--- a/drivers/net/ethernet/pasemi/pasemi_mac.c
+++ b/drivers/net/ethernet/pasemi/pasemi_mac.c
@@ -1812,7 +1812,7 @@ pasemi_mac_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
/* 1500 MTU + ETH_HLEN + VLAN_HLEN + 2 64B cachelines */
mac->bufsz = dev->mtu + ETH_HLEN + ETH_FCS_LEN + LOCAL_SKB_ALIGN + 128;

- dev->ethtool_ops = &pasemi_mac_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &pasemi_mac_ethtool_ops);

if (err)
goto out;
diff --git a/drivers/net/ethernet/rdc/r6040.c b/drivers/net/ethernet/rdc/r6040.c
index cd045ec..96cdc6b 100644
--- a/drivers/net/ethernet/rdc/r6040.c
+++ b/drivers/net/ethernet/rdc/r6040.c
@@ -1171,7 +1171,7 @@ static int r6040_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)

/* The RDC-specific entries in the device structure. */
dev->netdev_ops = &r6040_netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;

netif_napi_add(dev, &lp->napi, r6040_poll, 64);
diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c
index 2bc728e..5f57041 100644
--- a/drivers/net/ethernet/realtek/8139cp.c
+++ b/drivers/net/ethernet/realtek/8139cp.c
@@ -1992,7 +1992,7 @@ static int cp_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)

dev->netdev_ops = &cp_netdev_ops;
netif_napi_add(dev, &cp->napi, cp_rx_poll, 16);
- dev->ethtool_ops = &cp_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &cp_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;

dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
diff --git a/drivers/net/ethernet/realtek/8139too.c b/drivers/net/ethernet/realtek/8139too.c
index 2e5df14..bfc36b4 100644
--- a/drivers/net/ethernet/realtek/8139too.c
+++ b/drivers/net/ethernet/realtek/8139too.c
@@ -996,7 +996,7 @@ static int rtl8139_init_one(struct pci_dev *pdev,

/* The Rtl8139-specific entries in the device structure. */
dev->netdev_ops = &rtl8139_netdev_ops;
- dev->ethtool_ops = &rtl8139_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &rtl8139_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;
netif_napi_add(dev, &tp->napi, rtl8139_poll, 64);

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 63d595f..26a6add 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2214,6 +2214,8 @@ static int efx_netdev_event(struct notifier_block *this,
{
struct net_device *net_dev = netdev_notifier_info_to_dev(ptr);

+ /* FIXME: this is broken now with !NETDEV_OPS */
+
if ((net_dev->netdev_ops == &efx_farch_netdev_ops ||
net_dev->netdev_ops == &efx_ef10_netdev_ops) &&
event == NETDEV_CHANGENAME)
diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c
index 7984ad0..803f129 100644
--- a/drivers/net/ethernet/sgi/ioc3-eth.c
+++ b/drivers/net/ethernet/sgi/ioc3-eth.c
@@ -1325,7 +1325,7 @@ static int ioc3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
/* The IOC3-specific entries in the device structure. */
dev->watchdog_timeo = 5 * HZ;
dev->netdev_ops = &ioc3_netdev_ops;
- dev->ethtool_ops = &ioc3_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &ioc3_ethtool_ops);
dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
dev->features = NETIF_F_IP_CSUM;

diff --git a/drivers/net/ethernet/silan/sc92031.c b/drivers/net/ethernet/silan/sc92031.c
index 7daa7d4..cad6e80 100644
--- a/drivers/net/ethernet/silan/sc92031.c
+++ b/drivers/net/ethernet/silan/sc92031.c
@@ -1442,7 +1442,7 @@ static int sc92031_probe(struct pci_dev *pdev, const struct pci_device_id *id)

dev->netdev_ops = &sc92031_netdev_ops;
dev->watchdog_timeo = TX_TIMEOUT;
- dev->ethtool_ops = &sc92031_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &sc92031_ethtool_ops);

priv = netdev_priv(dev);
spin_lock_init(&priv->lock);
diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c
index 6072f09..6aa56cd 100644
--- a/drivers/net/ethernet/sis/sis900.c
+++ b/drivers/net/ethernet/sis/sis900.c
@@ -496,7 +496,7 @@ static int sis900_probe(struct pci_dev *pci_dev,
/* The SiS900-specific entries in the device structure. */
net_dev->netdev_ops = &sis900_netdev_ops;
net_dev->watchdog_timeo = TX_TIMEOUT;
- net_dev->ethtool_ops = &sis900_ethtool_ops;
+ SET_ETHTOOL_OPS(net_dev, &sis900_ethtool_ops);

if (sis900_debug > 0)
sis_priv->msg_enable = sis900_debug;
diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c
index 8ae1f8a7..93395e6 100644
--- a/drivers/net/ethernet/smsc/epic100.c
+++ b/drivers/net/ethernet/smsc/epic100.c
@@ -480,7 +480,7 @@ static int epic_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)

/* The Epic-specific entries in the device structure. */
dev->netdev_ops = &epic_netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;
netif_napi_add(dev, &ep->napi, epic_poll, 64);

diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c
index 1c44e67..1bc3e57 100644
--- a/drivers/net/ethernet/smsc/smc911x.c
+++ b/drivers/net/ethernet/smsc/smc911x.c
@@ -1932,7 +1932,7 @@ static int smc911x_probe(struct net_device *dev)

dev->netdev_ops = &smc911x_netdev_ops;
dev->watchdog_timeo = msecs_to_jiffies(watchdog);
- dev->ethtool_ops = &smc911x_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &smc911x_ethtool_ops);

INIT_WORK(&lp->phy_configure, smc911x_phy_configure);
lp->mii.phy_id_mask = 0x1f;
diff --git a/drivers/net/ethernet/smsc/smc91x.c b/drivers/net/ethernet/smsc/smc91x.c
index d1b4dca..635e3ce 100644
--- a/drivers/net/ethernet/smsc/smc91x.c
+++ b/drivers/net/ethernet/smsc/smc91x.c
@@ -1971,7 +1971,7 @@ static int smc_probe(struct net_device *dev, void __iomem *ioaddr,

dev->watchdog_timeo = msecs_to_jiffies(watchdog);
dev->netdev_ops = &smc_netdev_ops;
- dev->ethtool_ops = &smc_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &smc_ethtool_ops);

tasklet_init(&lp->tx_task, smc_hardware_send_pkt, (unsigned long)dev);
INIT_WORK(&lp->phy_configure, smc_phy_configure);
diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index a0fc151..3cc7b0c 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -2259,7 +2259,7 @@ static int smsc911x_init(struct net_device *dev)
dev->flags |= IFF_MULTICAST;
netif_napi_add(dev, &pdata->napi, smsc911x_poll, SMSC_NAPI_WEIGHT);
dev->netdev_ops = &smsc911x_netdev_ops;
- dev->ethtool_ops = &smsc911x_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &smsc911x_ethtool_ops);

return 0;
}
diff --git a/drivers/net/ethernet/smsc/smsc9420.c b/drivers/net/ethernet/smsc/smsc9420.c
index d3b967a..df0ac39 100644
--- a/drivers/net/ethernet/smsc/smsc9420.c
+++ b/drivers/net/ethernet/smsc/smsc9420.c
@@ -1661,7 +1661,7 @@ smsc9420_probe(struct pci_dev *pdev, const struct pci_device_id *id)
smsc9420_check_mac_address(dev);

dev->netdev_ops = &smsc9420_netdev_ops;
- dev->ethtool_ops = &smsc9420_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &smsc9420_ethtool_ops);

netif_napi_add(dev, &pd->napi, smsc9420_rx_poll, NAPI_WEIGHT);

diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index df8d383..df9fa8c 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -5103,7 +5103,7 @@ static int cas_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
skb_queue_head_init(&cp->rx_flows[i]);

dev->netdev_ops = &cas_netdev_ops;
- dev->ethtool_ops = &cas_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &cas_ethtool_ops);
dev->watchdog_timeo = CAS_TX_TIMEOUT;

#ifdef USE_NAPI
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 79606f47a..a49c9e6 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -9699,7 +9699,7 @@ static const struct net_device_ops niu_netdev_ops = {
static void niu_assign_netdev_ops(struct net_device *dev)
{
dev->netdev_ops = &niu_netdev_ops;
- dev->ethtool_ops = &niu_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &niu_ethtool_ops);
dev->watchdog_timeo = NIU_TX_TIMEOUT;
}

diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c
index 206c106..b9475ed 100644
--- a/drivers/net/ethernet/sun/sunbmac.c
+++ b/drivers/net/ethernet/sun/sunbmac.c
@@ -1180,7 +1180,7 @@ static int bigmac_ether_init(struct platform_device *op,
bp->dev = dev;

/* Set links to our BigMAC open and close routines. */
- dev->ethtool_ops = &bigmac_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &bigmac_ethtool_ops);
dev->netdev_ops = &bigmac_ops;
dev->watchdog_timeo = 5*HZ;

diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c
index 102a66f..e68ab54 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -2965,7 +2965,7 @@ static int gem_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)

dev->netdev_ops = &gem_netdev_ops;
netif_napi_add(dev, &gp->napi, gem_poll, 64);
- dev->ethtool_ops = &gem_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &gem_ethtool_ops);
dev->watchdog_timeo = 5 * HZ;
dev->dma = 0;

diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c
index 0dbf46f..9e34e31 100644
--- a/drivers/net/ethernet/sun/sunhme.c
+++ b/drivers/net/ethernet/sun/sunhme.c
@@ -2767,7 +2767,7 @@ static int happy_meal_sbus_probe_one(struct platform_device *op, int is_qfe)
hp->dev = dev;
dev->netdev_ops = &hme_netdev_ops;
dev->watchdog_timeo = 5*HZ;
- dev->ethtool_ops = &hme_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &hme_ethtool_ops);

/* Happy Meal can do it all... */
dev->hw_features = NETIF_F_SG | NETIF_F_HW_CSUM;
@@ -3082,7 +3082,7 @@ static int happy_meal_pci_probe(struct pci_dev *pdev,
hp->dev = dev;
dev->netdev_ops = &hme_netdev_ops;
dev->watchdog_timeo = 5*HZ;
- dev->ethtool_ops = &hme_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &hme_ethtool_ops);

/* Happy Meal can do it all... */
dev->hw_features = NETIF_F_SG | NETIF_F_HW_CSUM;
diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c
index 5695ae2..82f5942 100644
--- a/drivers/net/ethernet/sun/sunqe.c
+++ b/drivers/net/ethernet/sun/sunqe.c
@@ -895,7 +895,7 @@ static int qec_ether_init(struct platform_device *op)
dev->watchdog_timeo = 5*HZ;
dev->irq = op->archdata.irqs[0];
dev->dma = 0;
- dev->ethtool_ops = &qe_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &qe_ethtool_ops);
dev->netdev_ops = &qec_ops;

res = register_netdev(dev);
diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c
index 1c24a8f..32a5f1b 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -1043,7 +1043,7 @@ static struct vnet *vnet_new(const u64 *local_mac)
vp->local_mac = *local_mac;

dev->netdev_ops = &vnet_ops;
- dev->ethtool_ops = &vnet_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &vnet_ethtool_ops);
dev->watchdog_timeo = VNET_TX_TIMEOUT;

err = register_netdev(dev);
diff --git a/drivers/net/ethernet/ti/cpmac.c b/drivers/net/ethernet/ti/cpmac.c
index 73f74f3..45ff516 100644
--- a/drivers/net/ethernet/ti/cpmac.c
+++ b/drivers/net/ethernet/ti/cpmac.c
@@ -1157,7 +1157,7 @@ static int cpmac_probe(struct platform_device *pdev)
dev->irq = platform_get_irq_byname(pdev, "irq");

dev->netdev_ops = &cpmac_netdev_ops;
- dev->ethtool_ops = &cpmac_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &cpmac_ethtool_ops);

netif_napi_add(dev, &priv->napi, cpmac_poll, 64);

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_net.c b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
index d899d00..7d3b6d6 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_net.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_net.c
@@ -1467,7 +1467,7 @@ static void gelic_ether_setup_netdev_ops(struct net_device *netdev,
netdev->watchdog_timeo = GELIC_NET_WATCHDOG_TIMEOUT;
/* NAPI */
netif_napi_add(netdev, napi, gelic_net_poll, NAPI_POLL_WEIGHT);
- netdev->ethtool_ops = &gelic_ether_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &gelic_ether_ethtool_ops);
netdev->netdev_ops = &gelic_netdevice_ops;
}

diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c
index d568af1..399d1a2 100644
--- a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c
+++ b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c
@@ -2589,7 +2589,7 @@ static void gelic_wl_setup_netdev_ops(struct net_device *netdev)
BUG_ON(!wl);
netdev->watchdog_timeo = GELIC_NET_WATCHDOG_TIMEOUT;

- netdev->ethtool_ops = &gelic_wl_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &gelic_wl_ethtool_ops);
netdev->netdev_ops = &gelic_wl_netdevice_ops;
netdev->wireless_data = &wl->wireless_data;
netdev->wireless_handlers = &gelic_wl_wext_handler_def;
diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c
index 0282d01..0a8fea5 100644
--- a/drivers/net/ethernet/toshiba/spider_net.c
+++ b/drivers/net/ethernet/toshiba/spider_net.c
@@ -2286,7 +2286,7 @@ spider_net_setup_netdev_ops(struct net_device *netdev)
netdev->netdev_ops = &spider_net_ops;
netdev->watchdog_timeo = SPIDER_NET_WATCHDOG_TIMEOUT;
/* ethtool ops */
- netdev->ethtool_ops = &spider_net_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &spider_net_ethtool_ops);
}

/**
diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c
index fef5573..e8889cf 100644
--- a/drivers/net/ethernet/toshiba/tc35815.c
+++ b/drivers/net/ethernet/toshiba/tc35815.c
@@ -825,7 +825,7 @@ static int tc35815_init_one(struct pci_dev *pdev,

/* Initialize the device structure. */
dev->netdev_ops = &tc35815_netdev_ops;
- dev->ethtool_ops = &tc35815_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &tc35815_ethtool_ops);
dev->watchdog_timeo = TC35815_TX_TIMEOUT;
netif_napi_add(dev, &lp->napi, tc35815_poll, NAPI_WEIGHT);

diff --git a/drivers/net/ethernet/tundra/tsi108_eth.c b/drivers/net/ethernet/tundra/tsi108_eth.c
index 47eeb3a..98d688c 100644
--- a/drivers/net/ethernet/tundra/tsi108_eth.c
+++ b/drivers/net/ethernet/tundra/tsi108_eth.c
@@ -1604,7 +1604,7 @@ tsi108_init_one(struct platform_device *pdev)
data->id = pdev->id;
netif_napi_add(dev, &data->napi, tsi108_poll, 64);
dev->netdev_ops = &tsi108_netdev_ops;
- dev->ethtool_ops = &tsi108_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &tsi108_ethtool_ops);

/* Apparently, the Linux networking code won't use scatter-gather
* if the hardware doesn't do checksums. However, it's faster
diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c
index f61dc2b..89aef77 100644
--- a/drivers/net/ethernet/via/via-rhine.c
+++ b/drivers/net/ethernet/via/via-rhine.c
@@ -1022,7 +1022,7 @@ static int rhine_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)

/* The chip-specific entries in the device structure. */
dev->netdev_ops = &rhine_netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;

netif_napi_add(dev, &rp->napi, rhine_napipoll, 64);
diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c
index de08e86..5f633b0 100644
--- a/drivers/net/ethernet/via/via-velocity.c
+++ b/drivers/net/ethernet/via/via-velocity.c
@@ -2875,7 +2875,7 @@ static int velocity_probe(struct device *dev, int irq,
vptr->phy_id = MII_GET_PHY_ID(vptr->mac_regs);

netdev->netdev_ops = &velocity_netdev_ops;
- netdev->ethtool_ops = &velocity_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &velocity_ethtool_ops);
netif_napi_add(netdev, &vptr->napi, velocity_poll,
VELOCITY_NAPI_WEIGHT);

diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c
index 104d46f..45a6bdb 100644
--- a/drivers/net/ethernet/wiznet/w5100.c
+++ b/drivers/net/ethernet/wiznet/w5100.c
@@ -710,7 +710,7 @@ static int w5100_probe(struct platform_device *pdev)

ether_setup(ndev);
ndev->netdev_ops = &w5100_netdev_ops;
- ndev->ethtool_ops = &w5100_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &w5100_ethtool_ops);
ndev->watchdog_timeo = HZ;
netif_napi_add(ndev, &priv->napi, w5100_napi_poll, 16);

diff --git a/drivers/net/ethernet/wiznet/w5300.c b/drivers/net/ethernet/wiznet/w5300.c
index 1f33c4c..ce37156 100644
--- a/drivers/net/ethernet/wiznet/w5300.c
+++ b/drivers/net/ethernet/wiznet/w5300.c
@@ -622,7 +622,7 @@ static int w5300_probe(struct platform_device *pdev)

ether_setup(ndev);
ndev->netdev_ops = &w5300_netdev_ops;
- ndev->ethtool_ops = &w5300_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &w5300_ethtool_ops);
ndev->watchdog_timeo = HZ;
netif_napi_add(ndev, &priv->napi, w5300_napi_poll, 16);

diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c
index fa193c4..a53ac0c 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -1018,7 +1018,7 @@ static int temac_of_probe(struct platform_device *op)
ndev->flags &= ~IFF_MULTICAST; /* clear multicast */
ndev->features = NETIF_F_SG;
ndev->netdev_ops = &temac_netdev_ops;
- ndev->ethtool_ops = &temac_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &temac_ethtool_ops);
#if 0
ndev->features |= NETIF_F_IP_CSUM; /* Can checksum TCP/UDP over IPv4. */
ndev->features |= NETIF_F_HW_CSUM; /* Can checksum all the packets. */
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 7b0a735..7b6b6636 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -1492,7 +1492,7 @@ static int axienet_of_probe(struct platform_device *op)
ndev->flags &= ~IFF_MULTICAST; /* clear multicast */
ndev->features = NETIF_F_SG;
ndev->netdev_ops = &axienet_netdev_ops;
- ndev->ethtool_ops = &axienet_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &axienet_ethtool_ops);

lp = netdev_priv(ndev);
lp->ndev = ndev;
diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c
index 7c81ffb..31d6516 100644
--- a/drivers/net/ethernet/xircom/xirc2ps_cs.c
+++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c
@@ -493,7 +493,7 @@ xirc2ps_probe(struct pcmcia_device *link)

/* Fill in card specific entries */
dev->netdev_ops = &netdev_ops;
- dev->ethtool_ops = &netdev_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &netdev_ethtool_ops);
dev->watchdog_timeo = TX_TIMEOUT;
INIT_WORK(&local->tx_timeout_task, xirc2ps_tx_timeout_task);

diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c
index f7e0f0f..b854b79 100644
--- a/drivers/net/ethernet/xscale/ixp4xx_eth.c
+++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c
@@ -1436,7 +1436,7 @@ static int eth_init_one(struct platform_device *pdev)
}

dev->netdev_ops = &ixp4xx_netdev_ops;
- dev->ethtool_ops = &ixp4xx_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &ixp4xx_ethtool_ops);
dev->tx_queue_len = 100;

netif_napi_add(dev, &port->napi, eth_poll, NAPI_WEIGHT);
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index bb96409..604fa6f 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -182,7 +182,7 @@ static void loopback_setup(struct net_device *dev)
| NETIF_F_NETNS_LOCAL
| NETIF_F_VLAN_CHALLENGED
| NETIF_F_LOOPBACK;
- dev->ethtool_ops = &loopback_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &loopback_ethtool_ops);
dev->header_ops = &eth_header_ops;
dev->netdev_ops = &loopback_ops;
dev->destructor = loopback_dev_free;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 753a8c2..91eabd4 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -735,7 +735,7 @@ void macvlan_common_setup(struct net_device *dev)
dev->netdev_ops = &macvlan_netdev_ops;
dev->destructor = free_netdev;
dev->header_ops = &macvlan_hard_header_ops;
- dev->ethtool_ops = &macvlan_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &macvlan_ethtool_ops);
}
EXPORT_SYMBOL_GPL(macvlan_common_setup);

diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c
index 34924df..abeca48 100644
--- a/drivers/net/nlmon.c
+++ b/drivers/net/nlmon.c
@@ -133,7 +133,7 @@ static void nlmon_setup(struct net_device *dev)
dev->tx_queue_len = 0;

dev->netdev_ops = &nlmon_ops;
- dev->ethtool_ops = &nlmon_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &nlmon_ethtool_ops);
dev->destructor = free_netdev;

dev->features = NETIF_F_SG | NETIF_F_FRAGLIST |
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 33008c1..d1ad7cf 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -2011,7 +2011,7 @@ static void team_setup(struct net_device *dev)
ether_setup(dev);

dev->netdev_ops = &team_netdev_ops;
- dev->ethtool_ops = &team_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &team_ethtool_ops);
dev->destructor = team_destructor;
dev->tx_queue_len = 0;
dev->flags |= IFF_MULTICAST;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ee328ba..c2529b5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1418,7 +1418,7 @@ static void tun_setup(struct net_device *dev)
tun->owner = INVALID_UID;
tun->group = INVALID_GID;

- dev->ethtool_ops = &tun_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &tun_ethtool_ops);
dev->destructor = tun_free_netdev;
}

diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c
index 82d844a..0f79443 100644
--- a/drivers/net/usb/mcs7830.c
+++ b/drivers/net/usb/mcs7830.c
@@ -503,7 +503,7 @@ static int mcs7830_bind(struct usbnet *dev, struct usb_interface *udev)
if (ret)
goto out;

- net->ethtool_ops = &mcs7830_ethtool_ops;
+ SET_ETHTOOL_OPS(net, &mcs7830_ethtool_ops);
net->netdev_ops = &mcs7830_netdev_ops;

/* reserve space for the status byte on rx */
diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c
index 99b69af..4789e1a 100644
--- a/drivers/net/usb/sr9700.c
+++ b/drivers/net/usb/sr9700.c
@@ -327,7 +327,7 @@ static int sr9700_bind(struct usbnet *dev, struct usb_interface *intf)
netdev = dev->net;

netdev->netdev_ops = &sr9700_netdev_ops;
- netdev->ethtool_ops = &sr9700_ethtool_ops;
+ SET_ETHTOOL_OPS(netdev, &sr9700_ethtool_ops);
netdev->hard_header_len += SR_TX_OVERHEAD;
dev->hard_mtu = netdev->mtu + netdev->hard_header_len;
/* bulkin buffer is preferably not less than 3K */
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index f9e96c4..3bea2c0 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -1616,7 +1616,7 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod)

net->netdev_ops = &usbnet_netdev_ops;
net->watchdog_timeo = TX_TIMEOUT_JIFFIES;
- net->ethtool_ops = &usbnet_ethtool_ops;
+ SET_ETHTOOL_OPS(net, &usbnet_ethtool_ops);

// allow device-specific bind/init procedures
// NOTE net->name still not usable ...
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index b4a10bc..77fe496 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -274,7 +274,7 @@ static void veth_setup(struct net_device *dev)
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

dev->netdev_ops = &veth_netdev_ops;
- dev->ethtool_ops = &veth_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &veth_ethtool_ops);
dev->features |= NETIF_F_LLTX;
dev->features |= VETH_FEATURES;
dev->vlan_features = dev->features &
diff --git a/drivers/net/wimax/i2400m/netdev.c b/drivers/net/wimax/i2400m/netdev.c
index a9970f1..50f25f0 100644
--- a/drivers/net/wimax/i2400m/netdev.c
+++ b/drivers/net/wimax/i2400m/netdev.c
@@ -631,7 +631,7 @@ void i2400m_netdev_setup(struct net_device *net_dev)
& ~IFF_MULTICAST);
net_dev->watchdog_timeo = I2400M_TX_TIMEOUT;
net_dev->netdev_ops = &i2400m_netdev_ops;
- net_dev->ethtool_ops = &i2400m_ethtool_ops;
+ SET_ETHTOOL_OPS(net_dev, &i2400m_ethtool_ops);
d_fnend(3, NULL, "(net_dev %p) = void\n", net_dev);
}
EXPORT_SYMBOL_GPL(i2400m_netdev_setup);
diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c
index cd15a93..fe1b6f3 100644
--- a/drivers/net/wimax/i2400m/usb.c
+++ b/drivers/net/wimax/i2400m/usb.c
@@ -364,7 +364,7 @@ void i2400mu_netdev_setup(struct net_device *net_dev)
struct i2400mu *i2400mu = container_of(i2400m, struct i2400mu, i2400m);
i2400mu_init(i2400mu);
i2400m_netdev_setup(net_dev);
- net_dev->ethtool_ops = &i2400mu_ethtool_ops;
+ SET_ETHTOOL_OPS(net_dev, &i2400mu_ethtool_ops);
}

diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
index 7d28cd3..d04ad4e 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
@@ -680,7 +680,7 @@ int brcmf_net_attach(struct brcmf_if *ifp, bool rtnl_locked)
ndev->netdev_ops = &brcmf_netdev_ops_pri;

ndev->hard_header_len += drvr->hdrlen;
- ndev->ethtool_ops = &brcmf_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &brcmf_ethtool_ops);

drvr->rxsz = ndev->mtu + ndev->hard_header_len +
drvr->hdrlen;
diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
index dfc6dfc..4a7e1e9 100644
--- a/drivers/net/wireless/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/ipw2x00/ipw2100.c
@@ -6069,7 +6069,7 @@ static struct net_device *ipw2100_alloc_device(struct pci_dev *pci_dev,
priv->ieee->worst_rssi = -85;

dev->netdev_ops = &ipw2100_netdev_ops;
- dev->ethtool_ops = &ipw2100_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &ipw2100_ethtool_ops);
dev->wireless_handlers = &ipw2100_wx_handler_def;
priv->wireless_data.libipw = priv->ieee;
dev->wireless_data = &priv->wireless_data;
diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index c5aa404..661d3cc 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -11823,7 +11823,7 @@ static int ipw_pci_probe(struct pci_dev *pdev,
priv->wireless_data.spy_data = &priv->ieee->spy_data;
net_dev->wireless_data = &priv->wireless_data;
net_dev->wireless_handlers = &ipw_wx_handler_def;
- net_dev->ethtool_ops = &ipw_ethtool_ops;
+ SET_ETHTOOL_OPS(net_dev, &ipw_ethtool_ops);

err = sysfs_create_group(&pdev->dev.kobj, &ipw_attribute_group);
if (err) {
diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c
index 0c02f04..7ebf413 100644
--- a/drivers/net/wireless/libertas/main.c
+++ b/drivers/net/wireless/libertas/main.c
@@ -995,7 +995,7 @@ struct lbs_private *lbs_add_card(void *card, struct device *dmdev)

dev->netdev_ops = &lbs_netdev_ops;
dev->watchdog_timeo = 5 * HZ;
- dev->ethtool_ops = &lbs_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &lbs_ethtool_ops);
dev->flags |= IFF_BROADCAST | IFF_MULTICAST;

priv->card = card;
diff --git a/drivers/net/wireless/libertas/mesh.c b/drivers/net/wireless/libertas/mesh.c
index 6fef746..546b92a 100644
--- a/drivers/net/wireless/libertas/mesh.c
+++ b/drivers/net/wireless/libertas/mesh.c
@@ -1016,7 +1016,7 @@ static int lbs_add_mesh(struct lbs_private *priv)
priv->mesh_dev = mesh_dev;

mesh_dev->netdev_ops = &mesh_netdev_ops;
- mesh_dev->ethtool_ops = &lbs_ethtool_ops;
+ SET_ETHTOOL_OPS(mesh_dev, &lbs_ethtool_ops);
eth_hw_addr_inherit(mesh_dev, priv->dev);

SET_NETDEV_DEV(priv->mesh_dev, priv->dev->dev.parent);
diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c
index 21ee27a..69f3ab2 100644
--- a/drivers/net/wireless/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/mwifiex/cfg80211.c
@@ -2296,7 +2296,7 @@ struct wireless_dev *mwifiex_add_virtual_intf(struct wiphy *wiphy,
dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
dev->watchdog_timeo = MWIFIEX_DEFAULT_WATCHDOG_TIMEOUT;
dev->hard_header_len += MWIFIEX_MIN_DATA_HEADER_LEN;
- dev->ethtool_ops = &mwifiex_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &mwifiex_ethtool_ops);

mdev_priv = netdev_priv(dev);
*((unsigned long *) mdev_priv) = (unsigned long) priv;
diff --git a/drivers/net/wireless/prism54/islpci_dev.c b/drivers/net/wireless/prism54/islpci_dev.c
index 931cf44..44bb341 100644
--- a/drivers/net/wireless/prism54/islpci_dev.c
+++ b/drivers/net/wireless/prism54/islpci_dev.c
@@ -834,7 +834,7 @@ islpci_setup(struct pci_dev *pdev)
/* initialize the function pointers */
ndev->netdev_ops = &islpci_netdev_ops;
ndev->wireless_handlers = &prism54_handler_def;
- ndev->ethtool_ops = &islpci_ethtool_ops;
+ SET_ETHTOOL_OPS(ndev, &islpci_ethtool_ops);

/* ndev->set_multicast_list = &islpci_set_multicast_list; */
ndev->addr_len = ETH_ALEN;
diff --git a/drivers/staging/bcm/Bcmnet.c b/drivers/staging/bcm/Bcmnet.c
index 95a2358..c40e613 100644
--- a/drivers/staging/bcm/Bcmnet.c
+++ b/drivers/staging/bcm/Bcmnet.c
@@ -195,7 +195,7 @@ int register_networkdev(struct bcm_mini_adapter *Adapter)
int result;

net->netdev_ops = &bcmNetDevOps;
- net->ethtool_ops = &bcm_ethtool_ops;
+ SET_ETHTOOL_OPS(net, &bcm_ethtool_ops);
net->mtu = MTU_SIZE; /* 1400 Bytes */
net->tx_queue_len = TX_QLEN;
net->flags |= IFF_NOARP;
diff --git a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c
index c01abc2..1f282cf 100644
--- a/drivers/staging/rtl8192e/rtl8192e/rtl_core.c
+++ b/drivers/staging/rtl8192e/rtl8192e/rtl_core.c
@@ -2928,7 +2928,7 @@ static int rtl8192_pci_probe(struct pci_dev *pdev,

dev->wireless_handlers = (struct iw_handler_def *)
&r8192_wx_handlers_def;
- dev->ethtool_ops = &rtl819x_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &rtl819x_ethtool_ops);

dev->type = ARPHRD_ETHER;
dev->watchdog_timeo = HZ * 3;
diff --git a/drivers/staging/wlags49_h2/wl_netdev.c b/drivers/staging/wlags49_h2/wl_netdev.c
index a10d014..a24379f 100644
--- a/drivers/staging/wlags49_h2/wl_netdev.c
+++ b/drivers/staging/wlags49_h2/wl_netdev.c
@@ -1183,8 +1183,7 @@ struct net_device *wl_device_alloc(void)

dev->watchdog_timeo = TX_TIMEOUT;

- dev->ethtool_ops = &wl_ethtool_ops;
-
+ SET_ETHTOOL_OPS(dev, &wl_ethtool_ops);
netif_stop_queue(dev);

/* Allocate virtual devices for WDS support if needed */
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 733ec28..b286646 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -831,7 +831,7 @@ void vlan_setup(struct net_device *dev)

dev->netdev_ops = &vlan_netdev_ops;
dev->destructor = free_netdev;
- dev->ethtool_ops = &vlan_ethtool_ops;
+ SET_ETHTOOL_OPS(dev, &vlan_ethtool_ops);

memset(dev->broadcast, 0, ETH_ALEN);
}

--
1.9.0

Andi Kleen

2014-05-05 22:25:57 UTC

From: Andi Kleen <***@linux.intel.com>

This is all the code that saves connection information
between different sockets. Not really essential for
small systems.

Saves about 5.5k text

text data bss dec hex filename
492952 19571 13480 526003 806b3 net/built-in.o-with-metrics
487675 19275 13480 520430 7f0ee net/built-in.o-without-metrics

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/net/tcp.h | 25 +++++++++++++++++++++++++
net/ipv4/Kconfig | 6 ++++++
net/ipv4/Makefile | 3 ++-
net/ipv4/sysctl_net_ipv4.c | 2 ++
4 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 87d8774..d741d2f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -419,14 +419,29 @@ int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk, int how);
void tcp_clear_retrans(struct tcp_sock *tp);
+#ifdef CONFIG_TCP_METRICS
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
+
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst,
bool paws_check);
bool tcp_remember_stamp(struct sock *sk);
bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw);
void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst);
+#else
+static inline void tcp_update_metrics(struct sock *sk) {}
+static inline void tcp_init_metrics(struct sock *sk) {}
+static inline void tcp_metrics_init(void) {}
+static inline bool tcp_peer_is_proven(struct request_sock *req,
+ struct dst_entry *dst,
+ bool paws_check) { return false; }
+static inline bool tcp_remember_stamp(struct sock *sk) { return false; }
+static inline bool
+tcp_tw_remember_stamp(struct inet_timewait_sock *tw) { return false; }
+static inline void
+tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) {}
+#endif
void tcp_disable_fack(struct tcp_sock *tp);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
@@ -1296,11 +1311,21 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
const struct tcp_md5sig_key *key);

/* From tcp_fastopen.c */
+#ifdef CONFIG_TCP_METRICS
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie, int *syn_loss,
unsigned long *last_syn_loss);
void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
struct tcp_fastopen_cookie *cookie, bool syn_lost);
+#else
+static inline void
+tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
+ struct tcp_fastopen_cookie *cookie, int *syn_loss,
+ unsigned long *last_syn_loss) {}
+static inline void
+tcp_fastopen_cache_set(struct sock *sk, u16 mss,
+ struct tcp_fastopen_cookie *cookie, bool syn_lost) {}
+#endif
struct tcp_fastopen_request {
/* Fast Open cookie. Size 0 means a cookie request */
struct tcp_fastopen_cookie cookie;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 6146b1b..db2dada 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -264,6 +264,12 @@ config IP_PIMSM_V2
gated-5). This routing protocol is not used widely, so say N unless
you want to play with it.

+config TCP_METRICS
+ bool "Report TCP metrics over netlink"
+ ---help---
+ Enable support in TCP to save host information between different
+ connections.
+
config SYN_COOKIES
bool "IP: TCP syncookie support"
---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 756855c..8b17b83 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -7,7 +7,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
- tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_minisocks.o tcp_cong.o tcp_fastopen.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -17,6 +17,7 @@ obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_IP_PING) += ping.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_TCP_METRICS) += tcp_metrics.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_NET_IPIP) += ipip.o
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 44eba05..2110d2e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -573,6 +573,7 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+#ifdef CONFIG_TCP_METRICS
{
.procname = "tcp_no_metrics_save",
.data = &sysctl_tcp_nometrics_save,
@@ -580,6 +581,7 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#endif
{
.procname = "tcp_moderate_rcvbuf",
.data = &sysctl_tcp_moderate_rcvbuf,

--
1.9.0

Tom Zanussi

2014-05-06 04:39:08 UTC

Date: Tue, 6 May 2014 05:21:14 +0200

What parts would you remove to get the footprint down for a 2MB
single-purpose machine?

I wouldn't use Linux, end of story.
Maybe two decades ago, but not now, those days are over.

These 'new' systems we're talking about pretty much are the same as
systems from 20 years ago e.g. Quark, which is essentially a 486, with
tiny amounts of RAM, say less than 2 MB, so I hope those days aren't
over forever for Linux...

Anyway, Andi's net-diet and LTO patches allowed me to go from a kernel
text size of above 1100k down to about 750k with the kernel network
stack running an unmodified small web server (nostromo) and the ability
to drop in a dedicated single-purpose application that this type of
system would typically run, e.g. something based on UDP.

I was able to get that down to about 620k removing INET completely and
replacing it with LWIP, but LWIP either bloated the application back up
to similar memory usage numbers as with INET configured in, or
alternatively required extensive changes to the application in order to
use the not-bloating low-level interface.

But why go to all that trouble when there's a perfectly good networking
stack in the kernel? Even if most of these options aren't things that
would be useful to most systems, being able to turn them off and save
1/3 of the kernel text size for tiny systems like this does make a big
difference...

Tom

David Miller

2014-05-06 03:12:29 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:25:57 -0700

This is all the code that saves connection information
between different sockets. Not really essential for
small systems.

It is absolutely essential unless you want poor performance
of TCP connections.

I'm not applying this.

j***@joshtriplett.org

2014-05-06 17:30:15 UTC

Sounds like we have some optimization to do, then; there's no
fundamental unfixable reason for that delta.

I think you have little idea of the reasons for this delta.

I have a rather good idea, actually.

Some servers handle 10 million TCP flows, using as little as 1KB per
connection in user space, all included.
Do you have any idea how much memory is needed for 10 million TCP
sockets in the kernel?

Too much. That's potentially fixable, but not if we start with the
premise that it's impossible.

- Josh Triplett

Eric Dumazet

2014-05-06 17:03:24 UTC

Sounds like we have some optimization to do, then; there's no
fundamental unfixable reason for that delta.

I think you have little idea of the reasons for this delta.

Some servers handle 10 million TCP flows, using as little as 1KB per
connection in user space, all included.

Do you have any idea how much memory is needed for 10 million TCP
sockets in the kernel?

j***@joshtriplett.org

2014-05-06 15:57:03 UTC

Date: Tue, 6 May 2014 05:21:14 +0200

What parts would you remove to get the foot print down for a 2MB
single purpose machine?

I wouldn't use Linux, end of story.
Maybe two decades ago, but not now, those days are over.

That's a self-fulfilling prophecy: if you and others assume that Linux
should not run on such machines, then size regressions will continue to
happen, and patches to make Linux continue running on such systems
will not make it into the kernel.

There are real people and products intending to use Linux on incredibly
tiny embedded systems; Tom already posted about one in this thread.
Personally, I'd much rather see Linux on such systems rather than some
crazy embedded (often proprietary) OS, and so would many other people.
A NAK isn't going to cut it, here; tiny Linux systems are going to
exist, and they shouldn't have to maintain a long-term out-of-tree fork
or use crazy things like LWIP.

I understand that you want to reduce maintenance effort and Kconfig
option proliferation; that's a very real concern. It's likely possible
to address those concerns while still producing a usable minimal version
of the networking stack, if you'd be willing to provide feedback and
support iteration of patches like these.

Would you be interested in discussing this at Kernel Summit, perhaps?
Would that help to hammer out a plan for this?

- Josh Triplett

Eric Dumazet

2014-05-06 16:39:19 UTC

Post by j***@joshtriplett.org
A NAK isn't going to cut it, here; tiny Linux systems are going to
exist, and they shouldn't have to maintain a long-term out-of-tree fork
or use crazy things like LWIP.

What's wrong with user space implementations of networking stacks ?

For many usages, it can bring 10 times the performance of having user
application and kernel sockets.

In any case, we do not model kernel implementations to 'compete' with
user space.

We simply can not compete with user space, as a programmer is free to
keep what he really wants/needs.

I started using Linux on 386/486 PCs which had more than 2MB of
memory; it makes me sad that we want linux-3.16 to run on this kind of
hardware, and are spending time to save a few KB here and there.

j***@joshtriplett.org

2014-05-06 16:45:46 UTC

Post by Eric Dumazet

Post by j***@joshtriplett.org
A NAK isn't going to cut it, here; tiny Linux systems are going to
exist, and they shouldn't have to maintain a long-term out-of-tree fork
or use crazy things like LWIP.

What's wrong with user space implementations of networking stacks ?

What's wrong with the kernel implementation?

Post by Eric Dumazet
For many usages, it can bring 10 times the performance of having user
application and kernel sockets.

Sounds like we have some optimization to do, then; there's no
fundamental unfixable reason for that delta.

Post by Eric Dumazet
In any cases, we do not model kernel implementations to 'compete' with
user space.
We simply can not compete with user space, as a programmer is free to
keep what he really wants/needs.

The kernel can do the same. Consider the idea of analyzing a set of
userspace programs, determining what kernel functionality they do and
don't need, feeding that information into the kernel build process, and
automatically dropping unused bits of the kernel.

Ideally, that kind of process would support eliminating kernel config
options that just select userspace-visible interfaces, leaving only the
kernel config options that change how those interfaces behave
(size/performance/feature tradeoffs).

- Josh Triplett

David Miller

2014-05-06 17:17:52 UTC

From: ***@joshtriplett.org
Date: Tue, 6 May 2014 09:45:46 -0700

The kernel can do the same. Consider the idea of analyzing a set of
userspace programs, determining what kernel functionality they do and
don't need, feeding that information into the kernel build process, and
automatically dropping unused bits of the kernel.

Please make sure I'm not on the list of people who see reports for
bugs reported in that setup.

Thanks :-)

Tom Herbert

2014-05-06 18:58:38 UTC

Post by Eric Dumazet
We simply can not compete with user space, as a programmer is free to
keep what he really wants/needs.

Not true.
With my patches and LTO Linux can be competitive with LWIP+socket layer.
(about 60K more text). And it's easier to use because it's just
the standard interface.

Post by Eric Dumazet
I have started using linux on 386/486 pcs which had more than 2MB of
memory, it makes me sad we want linux-3.16 to run on this kind of
hardware, and consuming time to save few KB here and here.

Linux has always been a system from very small to big.
That's been one of its strengths. It is very adaptable.
Many subsystems are very configurable for this.
For example that is why we have both SLOB and SLUB.
That is why we have NOMMU MM and lots of other tuning
knobs for small systems.
So if the other subsystems can do this, why should it be
impossible for networking?

Can this at least be done without the combinatorial explosion in
number of configurations? As Yuchung pointed out these patches
introduce at least one unresolved configuration dependency. CONFIG_SMP
works quite well since with a single parameter we can enable/disable a
whole bunch of functionality in bulk, and it's quite clear that new
development cannot break smp or non-smp configurations. Maybe you want
something similar like CONFIG_NETWORK_SMALL?

Tom

-Andi
--
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
More majordomo info at http://vger.kernel.org/majordomo-info.html

j***@joshtriplett.org

2014-05-06 19:37:50 UTC

Post by Tom Herbert

Post by Eric Dumazet
We simply can not compete with user space, as a programmer is free to
keep what he really wants/needs.

Not true.
With my patches and LTO Linux can be competitive with LWIP+socket layer.
(about 60K more text). And it's easier to use because it's just
the standard interface.

Post by Eric Dumazet
I have started using linux on 386/486 pcs which had more than 2MB of
memory, it makes me sad we want linux-3.16 to run on this kind of
hardware, and consuming time to save few KB here and here.

Linux has always been a system from very small to big.
That's been one of its strengths. It is very adaptable.
Many subsystems are very configurable for this.
For example that is why we have both SLOB and SLUB.
That is why we have NOMMU MM and lots of other tuning
knobs for small systems.
So if the other subsystems can do this, why should it be
impossible for networking?

Can this at least be done without the combinatorial explosion in
number of configurations? As Yuchung pointed out these patches
introduce at least one unresolved configuration dependency. CONFIG_SMP
works quite well since with a single parameter we can enable/disable a
whole bunch of functionality in bulk, and it's quite clear that new
development cannot break smp or non-smp configurations. Maybe you want
something similar like CONFIG_NETWORK_SMALL?

That seems completely reasonable. Likewise, for infrastructure that
scales by CPU, keying off of CONFIG_NR_CPUS might make sense.

I'd suggest inverting it, so that 'n' means "small" and 'y' means fully
featured. Here's a rough description for a CONFIG_NETWORK_FULL:

config NETWORK_FULL
default y
bool "Full-featured networking stack" if EMBEDDED
--help--
Leave this option enabled for a full-featured networking
stack, including features used by the vast majority of
systems. Saying N here results in a minimal embedded
networking stack, suitable only for the most
memory-constrained and storage-constrained systems; the
minimal stack removes many features, and optimizes for code
and data size rather than performance.

If in doubt, say Y here.

Andi Kleen

2014-05-06 19:57:04 UTC

Post by Tom Herbert
Can this at least be done without the combinatorial explosion in
number of configurations? As Yuchung pointed out these patches
introduce at least one unresolved configuration dependency. CONFIG_SMP
works quite well since with a single parameter we can enable/disable a
whole bunch of functionality in bulk, and it's quite clear that new
development cannot break smp or non-smp configurations. Maybe you want
something similar like CONFIG_NETWORK_SMALL?

Yes I've considered this. I'm not sure SMP is good enough though,
at some point we'll get tiny dual core systems.
Right now I'm using a separate Kconfig option for every removed feature. I
realize this somewhat increases the compile test matrix. It would be possible
to hide some of the options and select them using higher level
configurations like the ones listed above. I haven't done this
in this version.

-Andi

Andi Kleen

2014-05-06 18:32:16 UTC

Post by Eric Dumazet
We simply can not compete with user space, as a programmer is free to
keep what he really wants/needs.

Not true.

With my patches and LTO Linux can be competitive with LWIP+socket layer.
(about 60K more text). And it's easier to use because it's just
the standard interface.

Post by Eric Dumazet
I have started using linux on 386/486 pcs which had more than 2MB of
memory, it makes me sad we want linux-3.16 to run on this kind of
hardware, and consuming time to save few KB here and here.

Linux has always been a system from very small to big.
That's been one of its strengths. It is very adaptable.

Many subsystems are very configurable for this.
For example that is why we have both SLOB and SLUB.
That is why we have NOMMU MM and lots of other tuning
knobs for small systems.

So if the other subsystems can do this, why should it be
impossible for networking?

-Andi

--
***@linux.intel.com -- Speaking for myself only

Eric Dumazet

2014-05-06 20:17:58 UTC

Post by Eric Dumazet
We simply can not compete with user space, as a programmer is free to
keep what he really wants/needs.

Not true.

You can shake the kernel as much as you want, you won't make:
- a TCP socket
- a dentry
- an inode
- a file structure
- eventpoll structures (assuming epoll use)
- 2 dst per flow.

In 1024 bytes of memory, and keep an efficient kernel to handle
arbitrary number of sockets using the venerable and slow BSD socket api.

I was objecting to the "crazy things like LWIP" comment from Josh, not
to your patches in general.

I actually took a look at them but stopped at patch 22

Adding ~1000 lines of code to save few KB was the point I gave up.

j***@joshtriplett.org

2014-05-06 20:27:19 UTC

Post by Eric Dumazet

Post by Eric Dumazet
We simply can not compete with user space, as a programmer is free to
keep what he really wants/needs.

Not true.

- a TCP socket
- a dentry
- an inode
- a file structure
- eventpoll structures (assuming epoll use)
- 2 dst per flow.
In 1024 bytes of memory, and keep an efficient kernel to handle
arbitrary number of sockets using the venerable and slow BSD socket api.
I was objecting to the "crazy things like LWIP" comment from Josh, not
to your patches in general.

My primary statement was that it's crazy to use something like LWIP just
because you want a *tiny* system. We could argue about using LWIP
because you want a massively scalable system, or one that more closely
couples userspace and the kernel, but that's not the current goal in any
case. So let's drop that branch of the thread. :)

Post by Eric Dumazet
I actually took a look at them but stopped at patch 22
Adding ~1000 lines of code to save few KB was the point I gave up.

Please consider ignoring that one and reading the rest; we could always
handle the routing table issue separately.

- Josh Triplett

Andi Kleen

2014-05-06 20:37:43 UTC

Post by Eric Dumazet
In 1024 bytes of memory, and keep an efficient kernel to handle
arbitrary number of sockets using the venerable and slow BSD socket api.

I agree running in 1024 bytes would be challenging.

Post by Eric Dumazet
Adding ~1000 lines of code to save few KB was the point I gave up.

You're referring to fib_list? It currently has some duplicated code
(this could/should be fixed). Or we could drop it, I suppose, if really
everyone hates it.

(I thought it was a cute idea, but I'm biased :-)

Total it is saving 350k, about 30% of the total text size of the mini kernel
(plus some dynamic savings) with networking.

I would be happy to fix any reasonable objection. But fundamental
"I don't care about anything smaller than my smart phone" type arguments
are not particularly constructive.

-Andi

--
***@linux.intel.com -- Speaking for myself only

David Miller

2014-05-06 20:48:29 UTC

From: Eric Dumazet <***@gmail.com>
Date: Tue, 06 May 2014 13:17:58 -0700

Post by Eric Dumazet
Adding ~1000 lines of code to save few KB was the point I gave up.

+1

David Miller

2014-05-06 03:23:27 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Tue, 6 May 2014 05:21:14 +0200

What parts would you remove to get the foot print down for a 2MB
single purpose machine?

I wouldn't use Linux, end of story.

Maybe two decades ago, but not now, those days are over.

Andi Kleen

2014-05-06 03:21:14 UTC

Post by David Miller
Date: Mon, 5 May 2014 15:25:57 -0700

This is all the code that saves connection information
between different sockets. Not really essential for
small systems.

It is absolutely essential unless you want poor performance
of TCP connections.

Ok so every code in the network stack is essential?

What parts would you remove to get the foot print down for a 2MB
single purpose machine?

-Andi

--
***@linux.intel.com -- Speaking for myself only.

Andi Kleen

2014-05-05 22:26:10 UTC

From: Andi Kleen <***@linux.intel.com>

Many DHCP clients need basic packet sockets, but they don't need
the fancy zero copy packet capture code, like tpacket, mmap, rings,
fanouts. This is quite substantial code, so it's worthwhile to
make it optional

Worth nearly 10k of code.

text data bss dec hex filename
952827 71874 25352 1050053 1005c5 net/built-in.o-with-packet-mmap
943211 71810 25352 1040373 fdff5 net/built-in.o-wo-packet-mmap

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/packet/Kconfig | 8 +++++
net/packet/af_packet.c | 82 +++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/net/packet/Kconfig b/net/packet/Kconfig
index cc55b35..c215d31 100644
--- a/net/packet/Kconfig
+++ b/net/packet/Kconfig
@@ -22,3 +22,11 @@ config PACKET_DIAG
---help---
Support for PF_PACKET sockets monitoring interface used by the ss tool.
If unsure, say Y.
+
+config PACKET_MMAP
+ bool "Enable packet mmap/ring support"
+ depends on PACKET
+ default y
+ ---help---
+ Enable support to mmap the packet data zero copy. This is useful for
+ highspeed packet interceptors.
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b85c67c..723f57f 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -158,6 +158,8 @@ struct packet_mreq_max {
unsigned char mr_address[MAX_ADDR_LEN];
};

+#ifdef CONFIG_PACKET_MMAP
+
union tpacket_uhdr {
struct tpacket_hdr *h1;
struct tpacket2_hdr *h2;
@@ -165,8 +167,6 @@ union tpacket_uhdr {
void *raw;
};

-static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
- int closing, int tx_ring);

#define V3_ALIGNMENT (8)

@@ -213,6 +213,9 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *,
struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
struct tpacket3_hdr *);
+
+#endif
+
static void packet_flush_mclist(struct sock *sk);

struct packet_skb_cb {
@@ -384,6 +387,8 @@ static void unregister_prot_hook(struct sock *sk, bool sync)
__unregister_prot_hook(sk, sync);
}

+#ifdef CONFIG_PACKET_MMAP
+
static inline __pure struct page *pgv_to_page(void *addr)
{
if (is_vmalloc_addr(addr))
@@ -1210,6 +1215,8 @@ static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
return refcnt;
}

+#endif
+
static int packet_alloc_pending(struct packet_sock *po)
{
po->rx_ring.pending_refcnt = NULL;
@@ -1226,6 +1233,7 @@ static void packet_free_pending(struct packet_sock *po)
free_percpu(po->tx_ring.pending_refcnt);
}

+#ifdef CONFIG_PACKET_MMAP
static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
struct sock *sk = &po->sk;
@@ -1249,6 +1257,8 @@ static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
return has_room;
}

+#endif
+
static void packet_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_error_queue);
@@ -1264,6 +1274,8 @@ static void packet_sock_destruct(struct sock *sk)
sk_refcnt_debug_dec(sk);
}

+#ifdef CONFIG_PACKET_MMAP
+
static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
{
int x = atomic_read(&f->rr_cur) + 1;
@@ -1530,6 +1542,12 @@ static void fanout_release(struct sock *sk)
mutex_unlock(&fanout_mutex);
}

+#else
+static void __fanout_unlink(struct sock *sk, struct packet_sock *po) {}
+static void __fanout_link(struct sock *sk, struct packet_sock *po) {}
+static void fanout_release(struct sock *sk) {}
+#endif
+
static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;
@@ -1867,6 +1885,11 @@ drop:
return 0;
}

+#ifdef CONFIG_PACKET_MMAP
+
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
+ int closing, int tx_ring);
+
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
@@ -2357,6 +2380,35 @@ out:
return err;
}

+static inline bool use_tpacket(struct packet_sock *po)
+{
+ return po->tx_ring.pg_vec;
+}
+
+static void tpacket_release(struct sock *sk, struct packet_sock *po)
+{
+ union tpacket_req_u req_u;
+
+ if (po->rx_ring.pg_vec) {
+ memset(&req_u, 0, sizeof(req_u));
+ packet_set_ring(sk, &req_u, 1, 0);
+ }
+
+ if (po->tx_ring.pg_vec) {
+ memset(&req_u, 0, sizeof(req_u));
+ packet_set_ring(sk, &req_u, 1, 1);
+ }
+}
+
+#else
+static inline bool use_tpacket(struct packet_sock *po) { return false; }
+static inline void tpacket_release(struct sock *sk, struct packet_sock *po) {}
+static inline int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { return 0; }
+static inline int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{ return 0; }
+#endif
+
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
size_t reserve, size_t len,
size_t linear, int noblock,
@@ -2576,7 +2628,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
struct sock *sk = sock->sk;
struct packet_sock *po = pkt_sk(sk);

- if (po->tx_ring.pg_vec)
+ if (use_tpacket(po))
return tpacket_snd(po, msg);
else
return packet_snd(sock, msg, len);
@@ -2592,7 +2644,6 @@ static int packet_release(struct socket *sock)
struct sock *sk = sock->sk;
struct packet_sock *po;
struct net *net;
- union tpacket_req_u req_u;

if (!sk)
return 0;
@@ -2620,15 +2671,7 @@ static int packet_release(struct socket *sock)

packet_flush_mclist(sk);

- if (po->rx_ring.pg_vec) {
- memset(&req_u, 0, sizeof(req_u));
- packet_set_ring(sk, &req_u, 1, 0);
- }
-
- if (po->tx_ring.pg_vec) {
- memset(&req_u, 0, sizeof(req_u));
- packet_set_ring(sk, &req_u, 1, 1);
- }
+ tpacket_release(sk, po);

fanout_release(sk);

@@ -3203,7 +3246,7 @@ static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
- struct packet_sock *po = pkt_sk(sk);
+ struct packet_sock *po __maybe_unused = pkt_sk(sk);
int ret;

if (level != SOL_PACKET)
@@ -3231,6 +3274,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
return ret;
}

+#ifdef CONFIG_PACKET_MMAP
case PACKET_RX_RING:
case PACKET_TX_RING:
{
@@ -3314,6 +3358,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
po->tp_loss = !!val;
return 0;
}
+#endif
case PACKET_AUXDATA:
{
int val;
@@ -3366,6 +3411,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
po->tp_tstamp = val;
return 0;
}
+#ifdef CONFIG_PACKET_MMAP
case PACKET_FANOUT:
{
int val;
@@ -3390,6 +3436,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
po->tp_tx_has_off = !!val;
return 0;
}
+#endif
case PACKET_QDISC_BYPASS:
{
int val;
@@ -3615,6 +3662,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
return 0;
}

+#ifdef CONFIG_PACKET_MMAP
static unsigned int packet_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
@@ -3855,7 +3903,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
swap(rb->pg_vec_len, req->tp_block_nr);

rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
- po->prot_hook.func = (po->rx_ring.pg_vec) ?
+ po->prot_hook.func = use_tpacket(po) ?
tpacket_rcv : packet_rcv;
skb_queue_purge(rb_queue);
if (atomic_read(&po->mapped))
@@ -3944,6 +3992,10 @@ out:
mutex_unlock(&po->pg_vec_lock);
return err;
}
+#else
+#define packet_mmap sock_no_mmap
+#define packet_poll datagram_poll
+#endif

static const struct proto_ops packet_ops_spkt = {
.family = PF_PACKET,

--
1.9.0

David Miller

2014-05-06 03:09:25 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:26:10 -0700

Post by Andi Kleen
Many DHCP clients need basic packet sockets, but they don't need
the fancy zero copy packet capture code, like tpacket, mmap, rings,
fanouts. This is quite substantial code, so it's worthwhile to
make it optional
Worth nearly 10k code.
text data bss dec hex filename
952827 71874 25352 1050053 1005c5 net/built-in.o-with-packet-mmap
943211 71810 25352 1040373 fdff5 net/built-in.o-wo-packet-mmap

Sorry, I'm not applying this either.

Small systems use packet generation and reception tools commonly, and
the only sane way to do so is with tpacket.

Andi Kleen

2014-05-05 22:26:11 UTC

From: Andi Kleen <***@linux.intel.com>

Add an optional fib_list that uses a simple list to store routes.
This is suitable for single homed client system which typically
have only a handful of routes.

The new file is a complete 1:1 replacement for fib_trie and
implements the same API for other files.

The code still uses RCU, so should have similar multi-core scalability,
as long as the number of routes is small. It also has
all the functionality of the fib_trie, so can be used
with advanced routing (except for a few missing proc files)

The code started as a copy of fib_trie, with all the trie
code replaced with a simple list. There's some very similar
code with trie that could be factored out in a common file.

Worth about 8.5k of code. In addition it uses kmalloc
instead of explicit slabs, which gives better memory efficiency
for very small routing table (all data <4K, so sharing with
other kmalloc users is possible)

text data bss dec hex filename
943211 71810 25352 1040373 fdff5 net/built-in.o-fib-list
951716 72066 25352 1049134 10022e net/built-in.o-fib-trie

XXX experimental, needs more testing, review

Cc: ***@its.uu.se
Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/ipv4/Kconfig | 18 +-
net/ipv4/Makefile | 4 +-
net/ipv4/fib_list.c | 985 ++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 1005 insertions(+), 2 deletions(-)
create mode 100644 net/ipv4/fib_list.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index cdb4f57..578e332 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,9 +55,25 @@ config IP_PING
---help---
Enable ping sockets to enable suid-less ping.

+config FIB_TRIE
+ bool "Use trie based routing table"
+ default y
+ ---help---
+ This is an advanced routing table implementation that scales to a larger
+ number of routes. When in doubt say y.
+
+config FIB_LIST
+ bool
+ default y
+ depends on !FIB_TRIE
+ ---help---
+ This is a very simple linear routing table implementation that is only
+ suitable for clients with a very small number of routes (<10). The only
+ advantage over FIB_TRIE is smaller code size.
+
config IP_FIB_TRIE_STATS
bool "FIB TRIE statistics"
- depends on IP_ADVANCED_ROUTER
+ depends on IP_ADVANCED_ROUTER && FIB_TRIE
---help---
Keep track of statistics on structure of FIB TRIE table.
Useful for testing and measuring TRIE performance.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 9353beb..8c2a08d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,9 +10,11 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_minisocks.o tcp_cong.o \
datagram.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o \
- fib_frontend.o fib_semantics.o fib_trie.o \
+ fib_frontend.o fib_semantics.o \
inet_fragment.o ip_tunnel_core.o gre_offload.o

+obj-$(CONFIG_FIB_TRIE) += fib_trie.o
+obj-$(CONFIG_FIB_LIST) += fib_list.o
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_IP_PING) += ping.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
diff --git a/net/ipv4/fib_list.c b/net/ipv4/fib_list.c
new file mode 100644
index 0000000..969e54d
--- /dev/null
+++ b/net/ipv4/fib_list.c
@@ -0,0 +1,985 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version
+ * 2 of the License.
+ *
+ * This is based on the fib_trie code (which was based on fib_hash),
+ * but all the trie code removed and replaced with a simple
+ * list. In addition it uses kmalloc instead of slab directly
+ * to save some dynamic memory.
+ *
+ * This is for systems with very small routing tables, like
+ * your typical single-homed client. You shouldn't be using this for
+ * more than a handful of routes. The main motivation is smaller
+ * code size. Apart from being less scalable and some missing
+ * proc output the functionality should be the same as fib_trie.
+ *
+ * Some code is very similar with fib_trie and could be later
+ * factored out into a shared file.
+ *
+ * List code by Andi Kleen, original trie file was:
+ *
+ * Robert Olsson <***@its.uu.se> Uppsala Universitet
+ * & Swedish University of Agricultural Sciences.
+ *
+ * Jens Laas <***@data.slu.se> Swedish University of
+ * Agricultural Sciences.
+ *
+ * Hans Liss <***@its.uu.se> Uppsala Universitet
+ *
+ * Code from fib_hash has been reused which includes the following header:
+ *
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IPv4 FIB: lookup engine and maintenance routines.
+ *
+ *
+ * Authors: Alexey Kuznetsov, <***@ms2.inr.ac.ru>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Substantial contributions to this work comes from:
+ *
+ * David S. Miller, <***@davemloft.net>
+ * Stephen Hemminger <***@osdl.org>
+ * Paul E. McKenney <***@us.ibm.com>
+ * Patrick McHardy <***@trash.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <net/net_namespace.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include "fib_lookup.h"
+
+/* One entry with the same key/mask */
+struct entry {
+ struct hlist_node nd;
+ u32 key;
+ u32 mask;
+ int plen;
+ struct hlist_head list;
+ struct rcu_head rcu;
+};
+
+struct entry_info {
+ struct hlist_node hlist;
+ int plen;
+ u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
+ struct list_head falh;
+ struct rcu_head rcu;
+};
+
+/* The list is ordered by the prefix length. Larger prefixes come
+ * earlier (so the default routes should be always at the end)
+ * Protected by the RTNL for writing, reading is handled with standard
+ * list RCU.
+ */
+struct rlist {
+ struct hlist_head list;
+};
+
+static void __alias_free_mem(struct rcu_head *head)
+{
+ struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+ kfree(fa);
+}
+
+static inline void alias_free_mem_rcu(struct fib_alias *fa)
+{
+ call_rcu(&fa->rcu, __alias_free_mem);
+}
+
+static void __entry_free_rcu(struct rcu_head *head)
+{
+ struct entry *l = container_of(head, struct entry, rcu);
+ kfree(l);
+}
+
+static inline void free_entry(struct entry *entry)
+{
+ call_rcu(&entry->rcu, __entry_free_rcu);
+}
+
+static inline void free_entry_info(struct entry_info *entry)
+{
+ kfree_rcu(entry, rcu);
+}
+
+static struct entry *entry_new(void)
+{
+ return kzalloc(sizeof(struct entry), GFP_KERNEL);
+}
+
+static struct entry_info *entry_info_new(int plen)
+{
+ struct entry_info *ei = kzalloc(sizeof(struct entry_info), GFP_KERNEL);
+ if (ei) {
+ ei->plen = plen;
+ ei->mask_plen = ntohl(inet_make_mask(plen));
+ INIT_LIST_HEAD(&ei->falh);
+ }
+ return ei;
+}
+
+/* readside must use rcu_read_lock currently dump routines
+ via get_fa_head and dump */
+
+static struct entry_info *find_entry_info(struct entry *e, int plen)
+{
+ struct hlist_head *head = &e->list;
+ struct entry_info *ei;
+
+ hlist_for_each_entry_rcu(ei, head, hlist)
+ if (ei->plen == plen)
+ return ei;
+
+ return NULL;
+}
+
+static inline struct list_head *get_fa_head(struct entry *l, int plen)
+{
+ struct entry_info *li = find_entry_info(l, plen);
+
+ if (!li)
+ return NULL;
+
+ return &li->falh;
+}
+
+static void insert_entry_info(struct hlist_head *head, struct entry_info *new)
+{
+ struct entry_info *li = NULL, *last = NULL;
+
+ if (hlist_empty(head)) {
+ hlist_add_head_rcu(&new->hlist, head);
+ } else {
+ hlist_for_each_entry(li, head, hlist) {
+ if (new->plen > li->plen)
+ break;
+
+ last = li;
+ }
+ if (last)
+ hlist_add_after_rcu(&last->hlist, &new->hlist);
+ else
+ hlist_add_before_rcu(&new->hlist, &li->hlist);
+ }
+}
+
+/* rcu_read_lock needs to be hold by caller from readside */
+static struct entry *
+fib_find_node(struct rlist *rl, u32 key)
+{
+ struct entry *entry;
+
+ hlist_for_each_entry_rcu (entry, &rl->list, nd) {
+ if (key == entry->key)
+ return entry;
+ }
+ return NULL;
+}
+
+/* only used from updater-side */
+
+/*
+ * Create a new entry for @key/@plen, link it into @rl and return the head
+ * of its (empty) alias list.
+ *
+ * The table list is kept ordered by decreasing prefix length: the new
+ * entry is placed in front of the first entry with a shorter prefix.
+ *
+ * Both allocations are made, and the entry is fully populated, before it
+ * is published.  On allocation failure nothing is linked into the table
+ * and NULL is returned.
+ */
+static struct list_head *fib_insert_node(struct rlist *rl, u32 key, int plen)
+{
+	struct entry *entry, *pos;
+	struct entry_info *ei;
+	struct hlist_node *prev = NULL;
+
+	entry = entry_new();
+	if (!entry)
+		return NULL;
+
+	ei = entry_info_new(plen);
+	if (!ei) {
+		/* Never published, so no grace period is needed. */
+		kfree(entry);
+		return NULL;
+	}
+
+	entry->key = key;
+	entry->mask = ntohl(inet_make_mask(plen));
+	entry->plen = plen;
+	insert_entry_info(&entry->list, ei);
+
+	hlist_for_each_entry (pos, &rl->list, nd) {
+		if (pos->plen < plen)
+			break;
+		prev = &pos->nd;
+	}
+
+	if (prev)
+		hlist_add_after_rcu(prev, &entry->nd);
+	else
+		hlist_add_head_rcu(&entry->nd, &rl->list);
+
+	return &ei->falh;
+}
+
+/*
+ * Add the route described by @cfg to @tb.
+ *
+ * Returns 0 on success, or a negative errno:
+ *   -EINVAL   prefix length above 32, or host bits set in the destination
+ *   -EEXIST   a matching route exists and NLM_F_EXCL was given, or an
+ *             identical alias is already present
+ *   -ENOENT   no matching route and NLM_F_CREATE was not given
+ *   -ENOBUFS  the new fib_alias could not be allocated
+ *   -ENOMEM   the new table entry could not be allocated
+ * or the error encoded in the fib_create_info() result.
+ *
+ * The existing entry is looked up by key AND prefix length.
+ * fib_insert_node() creates one entry per (key, plen) pair, so matching
+ * on the key alone can pick an entry for a different prefix length and
+ * skip the duplicate/replace handling below.
+ */
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+{
+	struct rlist *rl = (struct rlist *) tb->tb_data;
+	struct fib_alias *fa, *new_fa;
+	struct list_head *fa_head = NULL;
+	struct fib_info *fi;
+	int plen = cfg->fc_dst_len;
+	u8 tos = cfg->fc_tos;
+	u32 key, mask;
+	int err;
+	struct entry *l, *pos;
+
+	if (plen > 32)
+		return -EINVAL;
+
+	key = ntohl(cfg->fc_dst);
+
+	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
+
+	mask = ntohl(inet_make_mask(plen));
+
+	if (key & ~mask)
+		return -EINVAL;
+
+	fi = fib_create_info(cfg);
+	if (IS_ERR(fi)) {
+		err = PTR_ERR(fi);
+		goto err;
+	}
+
+	l = NULL;
+	hlist_for_each_entry (pos, &rl->list, nd) {
+		if (pos->key == key && pos->plen == plen) {
+			l = pos;
+			break;
+		}
+	}
+
+	fa = NULL;
+	if (l) {
+		fa_head = get_fa_head(l, plen);
+		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
+	}
+
+	/* Now fa, if non-NULL, points to the first fib alias
+	 * with the same keys [prefix,tos,priority], if such key already
+	 * exists or to the node before which we will insert new one.
+	 *
+	 * If fa is NULL, we will need to allocate a new one and
+	 * insert it at the tail of fa_head.
+	 *
+	 * If fa_head is NULL, no entry matched the destination key and
+	 * prefix length, and we need to allocate a new one of those as well.
+	 */
+
+	if (fa && fa->fa_tos == tos &&
+	    fa->fa_info->fib_priority == fi->fib_priority) {
+		struct fib_alias *fa_first, *fa_match;
+		int iter = 0;
+
+		err = -EEXIST;
+		if (cfg->fc_nlflags & NLM_F_EXCL)
+			goto out;
+
+		/* We have 2 goals:
+		 * 1. Find exact match for type, scope, fib_info to avoid
+		 * duplicate routes
+		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
+		 */
+		fa_match = NULL;
+		fa_first = fa;
+		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+		list_for_each_entry_continue(fa, fa_head, fa_list) {
+			if (fa->fa_tos != tos)
+				break;
+			if (fa->fa_info->fib_priority != fi->fib_priority)
+				break;
+			if (fa->fa_type == cfg->fc_type &&
+			    fa->fa_info == fi) {
+				fa_match = fa;
+				break;
+			}
+			iter++;
+		}
+		if (cfg->fc_nlflags & NLM_F_REPLACE) {
+			struct fib_info *fi_drop;
+			u8 state;
+
+			fa = fa_first;
+			if (fa_match) {
+				/* Replacing an alias with itself is a no-op. */
+				if (fa == fa_match)
+					err = 0;
+				goto out;
+			}
+			err = -ENOBUFS;
+			new_fa = kmalloc(sizeof(struct fib_alias), GFP_KERNEL);
+			if (new_fa == NULL)
+				goto out;
+
+			fi_drop = fa->fa_info;
+			new_fa->fa_tos = fa->fa_tos;
+			new_fa->fa_info = fi;
+			new_fa->fa_type = cfg->fc_type;
+			state = fa->fa_state;
+			new_fa->fa_state = state & ~FA_S_ACCESSED;
+
+			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+			alias_free_mem_rcu(fa);
+
+			fib_release_info(fi_drop);
+			if (state & FA_S_ACCESSED)
+				rt_cache_flush(cfg->fc_nlinfo.nl_net);
+			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
+				  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+
+			goto succeeded;
+		}
+		/* Error if we find a perfect match which
+		 * uses the same scope, type, and nexthop
+		 * information.
+		 */
+		if (fa_match) {
+			goto out;
+		}
+
+		if (!(cfg->fc_nlflags & NLM_F_APPEND))
+			fa = fa_first;
+	}
+	err = -ENOENT;
+	if (!(cfg->fc_nlflags & NLM_F_CREATE))
+		goto out;
+
+	err = -ENOBUFS;
+	new_fa = kmalloc(sizeof(struct fib_alias), GFP_KERNEL);
+	if (new_fa == NULL)
+		goto out;
+
+	new_fa->fa_info = fi;
+	new_fa->fa_tos = tos;
+	new_fa->fa_type = cfg->fc_type;
+	new_fa->fa_state = 0;
+
+	/*
+	 * Insert new entry to the list.
+	 */
+
+	if (!fa_head) {
+		fa_head = fib_insert_node(rl, key, plen);
+		if (unlikely(!fa_head)) {
+			err = -ENOMEM;
+			goto out_free_new_fa;
+		}
+	}
+	if (!plen)
+		tb->tb_num_default++;
+
+	list_add_tail_rcu(&new_fa->fa_list,
+			  (fa ? &fa->fa_list : fa_head));
+
+	rt_cache_flush(cfg->fc_nlinfo.nl_net);
+	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
+		  &cfg->fc_nlinfo, 0);
+succeeded:
+	return 0;
+
+out_free_new_fa:
+	kfree(new_fa);
+out:
+	fib_release_info(fi);
+err:
+	pr_debug("insert err %d\n", err);
+	return err;
+}
+
+/*
+ * Try to resolve the flow @flp against the aliases stored in entry @l.
+ *
+ * Walks every entry_info of @l and every alias on it, in list order, and
+ * stops at the first usable next hop.  Returns:
+ *   0    a next hop was found and @res has been filled in
+ *   1    nothing in this entry matched
+ *   else the non-zero fib_props[] error of the first alias that passed
+ *        the tos, fib_dead and scope filters
+ *
+ * Unless FIB_LOOKUP_NOREF is set in @fib_flags, fib_clntref of the
+ * returned fib_info is incremented.  @key is not used in this function.
+ */
+static int check_entry(struct fib_table *tb, struct entry *l,
+ u32 key, const struct flowi4 *flp,
+ struct fib_result *res, int fib_flags)
+{
+ struct entry_info *li;
+ struct hlist_head *hhead = &l->list;
+
+ hlist_for_each_entry_rcu(li, hhead, hlist) {
+ struct fib_alias *fa;
+
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
+
+ /* An alias with a non-zero tos only matches that exact tos. */
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ /* A route type with an error code ends the lookup here. */
+ err = fib_props[fa->fa_type].error;
+ if (err) {
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ /* Pick the first live next hop that fits the requested oif. */
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ continue;
+
+ res->prefixlen = li->plen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fa->fa_info->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &li->falh;
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
+ return 0;
+ }
+ }
+ }
+
+ return 1;
+}
+
+/*
+ * Look up the destination of @flp in @tb.
+ *
+ * Walks the table list in order under rcu_read_lock() and hands every
+ * entry whose masked key covers the destination to check_entry().  The
+ * walk stops at the first entry for which check_entry() returns 0
+ * (match, @res filled in) or a negative error.  Returns that value, or
+ * 1 when no entry produced a result.
+ */
+int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
+		     struct fib_result *res, int fib_flags)
+{
+	struct rlist *rl = (struct rlist *)tb->tb_data;
+	u32 key = ntohl(flp->daddr);
+	struct entry *cur;
+	int ret = 1;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu (cur, &rl->list, nd) {
+		if ((key & cur->mask) != cur->key)
+			continue;
+
+		ret = check_entry(tb, cur, key, flp, res, fib_flags);
+		if (ret <= 0)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(fib_table_lookup);
+
+/* Unlink @entry from the table list and hand it to free_entry(). */
+static void entry_remove(struct entry *entry)
+{
+ hlist_del_rcu(&entry->nd);
+ free_entry(entry);
+}
+
+/*
+ * Remove the route described by @cfg from @tb.
+ *
+ * Caller must hold RTNL.
+ *
+ * Returns 0 on success, -EINVAL for a prefix length above 32 or a
+ * destination with host bits set, and -ESRCH when no matching route
+ * exists.
+ *
+ * The entry is looked up by key AND prefix length.  fib_insert_node()
+ * creates one entry per (key, plen) pair, so a key-only search can stop
+ * at an entry for another prefix length and wrongly report -ESRCH.
+ */
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+{
+	struct rlist *rl = (struct rlist *) tb->tb_data;
+	u32 key, mask;
+	int plen = cfg->fc_dst_len;
+	u8 tos = cfg->fc_tos;
+	struct fib_alias *fa, *fa_to_delete;
+	struct list_head *fa_head;
+	struct entry *l, *pos;
+	struct entry_info *li;
+
+	if (plen > 32)
+		return -EINVAL;
+
+	key = ntohl(cfg->fc_dst);
+	mask = ntohl(inet_make_mask(plen));
+
+	if (key & ~mask)
+		return -EINVAL;
+
+	l = NULL;
+	hlist_for_each_entry (pos, &rl->list, nd) {
+		if (pos->key == key && pos->plen == plen) {
+			l = pos;
+			break;
+		}
+	}
+
+	if (!l)
+		return -ESRCH;
+
+	li = find_entry_info(l, plen);
+
+	if (!li)
+		return -ESRCH;
+
+	fa_head = &li->falh;
+	fa = fib_find_alias(fa_head, tos, 0);
+
+	if (!fa)
+		return -ESRCH;
+
+	pr_debug("Deleting %08x/%d tos=%d\n", key, plen, tos);
+
+	/* Scan the aliases with this tos for one matching every given selector. */
+	fa_to_delete = NULL;
+	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+	list_for_each_entry_continue(fa, fa_head, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (fa->fa_tos != tos)
+			break;
+
+		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
+		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
+		     fa->fa_info->fib_scope == cfg->fc_scope) &&
+		    (!cfg->fc_prefsrc ||
+		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
+		    (!cfg->fc_protocol ||
+		     fi->fib_protocol == cfg->fc_protocol) &&
+		    fib_nh_match(cfg, fi) == 0) {
+			fa_to_delete = fa;
+			break;
+		}
+	}
+
+	if (!fa_to_delete)
+		return -ESRCH;
+
+	fa = fa_to_delete;
+	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
+		  &cfg->fc_nlinfo, 0);
+
+	list_del_rcu(&fa->fa_list);
+
+	if (!plen)
+		tb->tb_num_default--;
+
+	/* Drop containers that became empty: first the info, then the entry. */
+	if (list_empty(fa_head)) {
+		hlist_del_rcu(&li->hlist);
+		free_entry_info(li);
+	}
+
+	if (hlist_empty(&l->list))
+		entry_remove(l);
+
+	if (fa->fa_state & FA_S_ACCESSED)
+		rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+	fib_release_info(fa->fa_info);
+	alias_free_mem_rcu(fa);
+	return 0;
+}
+
+/*
+ * Remove from @head every alias whose fib_info is flagged RTNH_F_DEAD,
+ * releasing the fib_info and freeing the alias via alias_free_mem_rcu().
+ * Returns the number of aliases removed.
+ */
+static int flush_list(struct list_head *head)
+{
+	struct fib_alias *cur, *nxt;
+	int removed = 0;
+
+	list_for_each_entry_safe(cur, nxt, head, fa_list) {
+		struct fib_info *info = cur->fa_info;
+
+		if (!info || !(info->fib_flags & RTNH_F_DEAD))
+			continue;
+
+		list_del_rcu(&cur->fa_list);
+		fib_release_info(info);
+		alias_free_mem_rcu(cur);
+		removed++;
+	}
+	return removed;
+}
+
+/*
+ * Flush dead aliases from every entry_info of @e and free each
+ * entry_info whose alias list ends up empty.  Returns the number of
+ * aliases removed.
+ */
+static int flush_entry(struct entry *e)
+{
+	struct entry_info *info;
+	struct hlist_node *nxt;
+	int removed = 0;
+
+	hlist_for_each_entry_safe(info, nxt, &e->list, hlist) {
+		removed += flush_list(&info->falh);
+
+		if (!list_empty(&info->falh))
+			continue;
+
+		hlist_del_rcu(&info->hlist);
+		free_entry_info(info);
+	}
+	return removed;
+}
+
+/* Return the first entry of @rl, or NULL when the list is empty. */
+static struct entry *first_entry(struct rlist *rl)
+{
+	struct hlist_node *node = rcu_dereference(hlist_first_rcu(&rl->list));
+
+	if (!node)
+		return NULL;
+
+	return hlist_entry(node, struct entry, nd);
+}
+
+/*
+ * Return the entry following @e in the table list, or NULL at the end.
+ *
+ * The next pointer is read through rcu_dereference(), as first_entry()
+ * does for the list head; the callers in this file (dump and seq_file
+ * walkers) run under rcu_read_lock().
+ */
+static struct entry *next_entry(struct entry *e)
+{
+	struct hlist_node *next = rcu_dereference(hlist_next_rcu(&e->nd));
+
+	return next ? hlist_entry(next, struct entry, nd) : NULL;
+}
+
+/*
+ * Return the entry at position @index (0-based) in @rl; an @index of
+ * zero or less yields the first entry.  When the list is shorter than
+ * @index the iterator cursor is returned as the walk left it.
+ * NOTE(review): that is presumably NULL with the current hlist
+ * iterator; confirm.
+ */
+static struct entry *entry_index(struct rlist *rl, int index)
+{
+	struct entry *cur;
+
+	hlist_for_each_entry_rcu (cur, &rl->list, nd) {
+		if (index <= 0)
+			break;
+		index--;
+	}
+	return cur;
+}
+
+/*
+ * Flush dead routes from every entry of @tb and drop entries left
+ * without any entry_info.  Returns the number of aliases removed.
+ *
+ * Caller must hold RTNL.
+ */
+int fib_table_flush(struct fib_table *tb)
+{
+	struct rlist *rl = (struct rlist *) tb->tb_data;
+	struct hlist_node *nxt;
+	struct entry *cur;
+	int flushed = 0;
+
+	hlist_for_each_entry_safe (cur, nxt, &rl->list, nd) {
+		flushed += flush_entry(cur);
+
+		if (!hlist_empty(&cur->list))
+			continue;
+
+		entry_remove(cur);
+	}
+	pr_debug("flush found=%d\n", flushed);
+	return flushed;
+}
+
+/*
+ * Free the table structure itself.
+ * NOTE(review): entries still linked into the table are not released
+ * here; presumably callers flush the table first, so confirm.
+ */
+void fib_free_table(struct fib_table *tb)
+{
+ kfree(tb);
+}
+
+/*
+ * Dump the aliases on @fah (route @key/@plen in table @tb) into @skb.
+ *
+ * cb->args[5] is the resume cursor: aliases with an index below it are
+ * skipped.  When fib_dump_info() fails, the index of the alias that
+ * failed is stored there and -1 is returned.  Otherwise the cursor is
+ * set to the number of aliases walked and skb->len is returned.
+ */
+static int fn_list_dump_fa(u32 key, int plen, struct list_head *fah,
+ struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ int i, s_i;
+ struct fib_alias *fa;
+ __be32 xkey = htonl(key);
+
+ s_i = cb->args[5];
+ i = 0;
+
+ /* rcu_read_lock is held by caller */
+
+ list_for_each_entry_rcu(fa, fah, fa_list) {
+ if (i < s_i) {
+ i++;
+ continue;
+ }
+
+ if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE,
+ tb->tb_id,
+ fa->fa_type,
+ xkey,
+ plen,
+ fa->fa_tos,
+ fa->fa_info, NLM_F_MULTI) < 0) {
+ cb->args[5] = i;
+ return -1;
+ }
+ i++;
+ }
+ cb->args[5] = i;
+ return skb->len;
+}
+
+/*
+ * Dump every entry_info of entry @l into @skb.
+ *
+ * cb->args[4] is the resume cursor over the entry_info list.  When
+ * moving past the entry_info the previous dump stopped in, the alias
+ * cursor cb->args[5] is reset to 0.  Returns -1 (with cb->args[4] saved)
+ * when fn_list_dump_fa() fails, otherwise skb->len.
+ */
+static int fn_dump_entry(struct entry *l, struct fib_table *tb,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct entry_info *li;
+ int i, s_i;
+
+ s_i = cb->args[4];
+ i = 0;
+
+ /* rcu_read_lock is held by caller */
+ hlist_for_each_entry_rcu(li, &l->list, hlist) {
+ if (i < s_i) {
+ i++;
+ continue;
+ }
+
+ if (i > s_i)
+ cb->args[5] = 0;
+
+ /* An empty alias list is skipped without advancing the index. */
+ if (list_empty(&li->falh))
+ continue;
+
+ if (fn_list_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
+ cb->args[4] = i;
+ return -1;
+ }
+ i++;
+ }
+
+ cb->args[4] = i;
+ return skb->len;
+}
+
+/*
+ * Dump all routes of @tb into @skb, resuming from the state in cb->args:
+ *   args[2]  key of the entry being dumped
+ *   args[3]  number of entries completely dumped so far
+ *   args[4+] per-entry cursors owned by fn_dump_entry() and
+ *            fn_list_dump_fa(); cleared after each finished entry
+ *
+ * Returns -1 when fn_dump_entry() fails (state is saved for the next
+ * call), otherwise skb->len.  The walk runs under rcu_read_lock().
+ */
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct entry *l;
+ struct rlist *rl = (struct rlist *) tb->tb_data;
+ u32 key = cb->args[2];
+ int count = cb->args[3];
+
+ rcu_read_lock();
+ if (count == 0)
+ l = first_entry(rl);
+ else {
+ /* Normally, continue from last key, but if that is missing
+ * fallback to using slow rescan
+ */
+ l = fib_find_node(rl, key);
+ if (!l)
+ l = entry_index(rl, count);
+ }
+
+ while (l) {
+ cb->args[2] = l->key;
+ if (fn_dump_entry(l, tb, skb, cb) < 0) {
+ cb->args[3] = count;
+ rcu_read_unlock();
+ return -1;
+ }
+
+ ++count;
+ l = next_entry(l);
+ /* Entry finished: reset the cursors from args[4] onwards. */
+ memset(&cb->args[4], 0,
+ sizeof(cb->args) - 4*sizeof(cb->args[0]));
+ }
+ cb->args[3] = count;
+ rcu_read_unlock();
+
+ return skb->len;
+}
+
+/* new name? */
+/*
+ * Allocate and initialise a routing table with id @id.
+ *
+ * The private area following struct fib_table is sized for struct rlist,
+ * the type every accessor in this file casts tb->tb_data to.  kzalloc()
+ * zeroes it, which leaves the route list empty.  Returns NULL when the
+ * allocation fails.
+ */
+struct fib_table *fib_trie_table(u32 id)
+{
+	struct fib_table *tb;
+
+	tb = kzalloc(sizeof(struct fib_table) + sizeof(struct rlist),
+		     GFP_KERNEL);
+	if (tb == NULL)
+		return NULL;
+
+	tb->tb_id = id;
+	tb->tb_default = -1;
+	tb->tb_num_default = 0;
+
+	return tb;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/* Per-open iterator state for the /proc/net/route seq_file. */
+struct fib_route_iter {
+ /* NOTE(review): presumably must stay first for seq_open_net(); confirm. */
+ struct seq_net_private p;
+ /* Route list of the main table; set in fib_route_seq_start(). */
+ struct rlist *rl;
+ /* Position of the cached entry; 0 means nothing is cached. */
+ loff_t pos;
+ /* Cached key used to resume a walk. */
+ u32 key;
+};
+
+/*
+ * Return the entry at position @pos (0-based) in the iterator's list, or
+ * NULL when the list is shorter than that.
+ *
+ * If the iterator has a cached position at or before @pos and the cached
+ * key can still be found, the walk resumes from that entry; otherwise it
+ * restarts from the first entry.  On success the key of the entry that
+ * was found is cached for the next call.  The previous code stored the
+ * exhausted loop counter in iter->key, so the cache never held a real key.
+ */
+static struct entry *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
+{
+	struct entry *e = NULL;
+
+	/* use cache location of last found key */
+	if (iter->pos > 0 && pos >= iter->pos)
+		e = fib_find_node(iter->rl, iter->key);
+
+	if (e) {
+		pos -= iter->pos;
+	} else {
+		iter->pos = 0;
+		e = first_entry(iter->rl);
+	}
+
+	while (e && pos-- > 0) {
+		iter->pos++;
+		e = next_entry(e);
+	}
+
+	if (e)
+		iter->key = e->key;	/* remember it */
+	else
+		iter->pos = 0;		/* forget it */
+
+	return e;
+}
+
+/*
+ * seq_file ->start: take rcu_read_lock() and position the iterator.
+ *
+ * Returns NULL when the main table does not exist, SEQ_START_TOKEN for
+ * position 0 (the header line), and otherwise the entry at *pos - 1.
+ * The RCU read lock is still held on return in every case.
+ */
+static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(RCU)
+{
+	struct fib_route_iter *iter = seq->private;
+	struct fib_table *tb;
+
+	rcu_read_lock();
+	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
+	if (!tb)
+		return NULL;
+
+	iter->rl = (struct rlist *) tb->tb_data;
+	if (*pos != 0)
+		return fib_route_get_idx(iter, *pos - 1);
+
+	return SEQ_START_TOKEN;
+}
+
+/*
+ * seq_file ->next: advance *pos and return the entry after @v, or the
+ * first entry when @v is the header token.  The iterator's cached
+ * position and key are updated, and the position is reset to 0 at the
+ * end of the list.
+ */
+static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct fib_route_iter *iter = seq->private;
+	struct entry *next;
+
+	++*pos;
+	if (v != SEQ_START_TOKEN) {
+		iter->pos++;
+		next = next_entry(v);
+	} else {
+		iter->pos = 0;
+		next = first_entry(iter->rl);
+	}
+
+	if (!next) {
+		iter->pos = 0;
+		return NULL;
+	}
+
+	iter->key = next->key;
+	return next;
+}
+
+/* seq_file ->stop: drop the RCU read lock taken in fib_route_seq_start(). */
+static void fib_route_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+/*
+ * Build the RTF_* flag word shown in /proc/net/route.
+ *
+ * RTF_UP is always set; RTF_REJECT for unreachable/prohibit routes,
+ * RTF_GATEWAY when @fi has a gateway on its first next hop, and
+ * RTF_HOST when @mask is all ones.
+ */
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+{
+	unsigned int flags = RTF_UP;
+
+	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+		flags |= RTF_REJECT;
+	if (fi && fi->fib_nh->nh_gw)
+		flags |= RTF_GATEWAY;
+	if (mask == htonl(0xFFFFFFFF))
+		flags |= RTF_HOST;
+
+	return flags;
+}
+
+/* This outputs /proc/net/route.
+ * The format of the file is not supposed to be changed
+ * and needs to be same as fib_hash output to avoid breaking
+ * legacy utilities.
+ *
+ * For the start token the header line is printed.  For an entry, one
+ * line padded to 127 characters is printed per alias, skipping
+ * broadcast and multicast routes.  Always returns 0.
+ */
+static int fib_route_seq_show(struct seq_file *seq, void *v)
+{
+ struct entry *entry = v;
+ struct entry_info *ei;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+ "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+ "\tWindow\tIRTT");
+ return 0;
+ }
+
+ hlist_for_each_entry_rcu(ei, &entry->list, hlist) {
+ struct fib_alias *fa;
+ __be32 mask, prefix;
+
+ mask = inet_make_mask(ei->plen);
+ prefix = htonl(entry->key);
+
+ list_for_each_entry_rcu(fa, &ei->falh, fa_list) {
+ const struct fib_info *fi = fa->fa_info;
+ unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
+
+ if (fa->fa_type == RTN_BROADCAST
+ || fa->fa_type == RTN_MULTICAST)
+ continue;
+
+ seq_setwidth(seq, 127);
+
+ /* RefCnt and Use are always printed as 0. */
+ if (fi)
+ seq_printf(seq,
+ "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ fi->fib_dev ? fi->fib_dev->name : "*",
+ prefix,
+ fi->fib_nh->nh_gw, flags, 0, 0,
+ fi->fib_priority,
+ mask,
+ (fi->fib_advmss ?
+ fi->fib_advmss + 40 : 0),
+ fi->fib_window,
+ fi->fib_rtt >> 3);
+ else
+ seq_printf(seq,
+ "*\t%08X\t%08X\t%04X\t%d\t%u\t"
+ "%d\t%08X\t%d\t%u\t%u",
+ prefix, 0, flags, 0, 0, 0,
+ mask, 0, 0, 0);
+
+ seq_pad(seq, '\n');
+ }
+ }
+
+ return 0;
+}
+
+/* seq_file callbacks implementing the /proc/net/route walk. */
+static const struct seq_operations fib_route_seq_ops = {
+ .start = fib_route_seq_start,
+ .next = fib_route_seq_next,
+ .stop = fib_route_seq_stop,
+ .show = fib_route_seq_show,
+};
+
+/*
+ * ->open for /proc/net/route: set up a seq_file that uses
+ * fib_route_seq_ops, with private data sized for struct fib_route_iter.
+ */
+static int fib_route_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &fib_route_seq_ops,
+ sizeof(struct fib_route_iter));
+}
+
+/* File operations for the /proc/net/route entry (seq_file based). */
+static const struct file_operations fib_route_fops = {
+ .owner = THIS_MODULE,
+ .open = fib_route_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+/*
+ * Create the read-only "route" file under @net's proc directory.
+ * Returns 0 on success or -ENOMEM when the proc entry cannot be created.
+ */
+int __net_init fib_proc_init(struct net *net)
+{
+	return proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops) ?
+		0 : -ENOMEM;
+}
+
+/* Remove the "route" file from @net's proc directory. */
+void __net_exit fib_proc_exit(struct net *net)
+{
+ remove_proc_entry("route", net->proc_net);
+}
+
+#endif /* CONFIG_PROC_FS */
+
+/* Intentionally empty: this implementation has nothing to set up here. */
+void __init fib_trie_init(void) {}

--
1.9.0

David Miller

2014-05-06 03:21:05 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Tue, 6 May 2014 05:08:33 +0200

FWIW it seems to me the trie code is very stable, there doesn't
seem to be any big changes in a long time.

But it was BUGGY AS HELL in the corner cases because it wasn't
the default while it sat next to fib_hash.

It only got totally fixed when it was the ONLY option.

That's exactly my point, parts of it never got tested and were never
verified because debugging resources for it was split with another
implementation.

David Miller

2014-05-06 03:02:59 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:26:11 -0700

Post by Andi Kleen
Add an optional fib_list that uses a simple list to store routes.
This is suitable for single homed client system which typically
have only a handful of routes.

Sorry I'm not applying this.

We had two routing table implementations for some time and it sucked,
it meant two code paths and two places for bugs to hide.

I want only one data structure which does the job well, and we have
that now.

Andi Kleen

2014-05-06 03:08:33 UTC

Post by David Miller
Date: Mon, 5 May 2014 15:26:11 -0700

Post by Andi Kleen
Add an optional fib_list that uses a simple list to store routes.
This is suitable for single homed client system which typically
have only a handful of routes.

Sorry I'm not applying this.
We had two routing table implementations for some time and it sucked,
it meant two code paths and two places for bugs to hide.

FWIW it seems to me the trie code is very stable, there doesn't
seem to be any big changes in a long time.

Post by David Miller
I want only one data structure which does the job well, and we have
that now.

I don't see any good way to slim the trie code down. Do you?

It's just inherently complex. Yes on many systems it's a good thing,
and the complexity is justified,
but on the tiny systems the bloat and code size just hurts.

If we can't get the bloat down these users will just desert us to lwip ...

-Andi

--
***@linux.intel.com -- Speaking for myself only.

Andi Kleen

2014-05-05 22:26:02 UTC

From: Andi Kleen <***@linux.intel.com>

Make the GRO offload code optional. It's not needed on small systems.
Since it's not a single file there are a couple of ifdefs.

Some code is still there (not ifdef'ed) but can be removed now with LTO.

Without LTO it saves about 3K of text.

text data bss dec hex filename
432712 18689 12616 464017 71491 net/built-in.o-with-offload
429737 17665 12616 460018 704f2 net/built-in.o-wo-offload

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
include/net/protocol.h | 10 ++++++++++
include/net/tcp.h | 5 +++++
net/ipv4/Kconfig | 4 ++++
net/ipv4/Makefile | 5 +++--
net/ipv4/af_inet.c | 9 +++++++++
net/ipv4/protocol.c | 8 +++++++-
net/ipv6/Makefile | 5 +++--
net/ipv6/protocol.c | 2 ++
8 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/include/net/protocol.h b/include/net/protocol.h
index a7e986b..63f5b0c 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -103,8 +103,18 @@ extern const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS];

int inet_add_protocol(const struct net_protocol *prot, unsigned char num);
int inet_del_protocol(const struct net_protocol *prot, unsigned char num);
+#ifdef CONFIG_IP_OFFLOAD
int inet_add_offload(const struct net_offload *prot, unsigned char num);
int inet_del_offload(const struct net_offload *prot, unsigned char num);
+#else
+static inline int
+inet_add_offload(const struct net_offload *prot, unsigned char num)
+{ return 0; }
+static inline int
+inet_del_offload(const struct net_offload *prot, unsigned char num)
+{ return 0; }
+#endif
+
void inet_register_protosw(struct inet_protosw *p);
void inet_unregister_protosw(struct inet_protosw *p);

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d741d2f..ac9f6bd 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1567,7 +1567,12 @@ void tcp_v4_destroy_sock(struct sock *sk);
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features);
struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb);
+#ifdef CONFIG_IP_OFFLOAD
int tcp_gro_complete(struct sk_buff *skb);
+#else
+/* For the benefit of one driver who really shouldn't be using this. */
+static inline int tcp_gro_complete(struct sk_buff *skb) { return -EIO; }
+#endif

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index db2dada..00a7f76 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -306,6 +306,10 @@ config SYN_COOKIES

If unsure, say N.

+config IP_OFFLOAD
+ bool "Support for IP GRO/offload"
+ default y
+
config NET_IPVTI
tristate "Virtual (secure) IP: tunneling"
select INET_TUNNEL
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 8b17b83..784a782 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,8 +8,8 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_fastopen.o \
- tcp_offload.o datagram.o raw.o udp.o udplite.o \
- udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+ datagram.o raw.o udp.o udplite.o \
+ arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o ip_tunnel_core.o gre_offload.o

@@ -26,6 +26,7 @@ obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_IP_OFFLOAD) += tcp_offload.o udp_offload.o
obj-$(CONFIG_INET_AH) += ah4.o
obj-$(CONFIG_INET_ESP) += esp4.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c275ce5..e65e750 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1209,6 +1209,9 @@ int inet_sk_rebuild_header(struct sock *sk)
}
EXPORT_SYMBOL(inet_sk_rebuild_header);

+#ifdef CONFIG_IP_OFFLOAD
+/* Should move to a new file */
+
static int inet_gso_send_check(struct sk_buff *skb)
{
const struct net_offload *ops;
@@ -1455,6 +1458,8 @@ out_unlock:
return err;
}

+#endif
+
int inet_ctl_sock_create(struct sock **sk, unsigned short family,
unsigned short type, unsigned char protocol,
struct net *net)
@@ -1653,6 +1658,9 @@ static int __init init_ipv4_mibs(void)

static int ipv4_proc_init(void);

+#ifdef CONFIG_IP_OFFLOAD
+/* Move elsewhere? */
+
/*
* IP protocol layer initialiser
*/
@@ -1690,6 +1698,7 @@ static int __init ipv4_offload_init(void)
}

fs_initcall(ipv4_offload_init);
+#endif

static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 46d6a1c..0a33a12 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -29,7 +29,6 @@
#include <net/protocol.h>

const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
-const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;

int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
@@ -44,6 +43,9 @@ int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
}
EXPORT_SYMBOL(inet_add_protocol);

+#ifdef CONFIG_IP_OFFLOAD
+const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
+
int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
{
return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
@@ -51,6 +53,8 @@ int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
}
EXPORT_SYMBOL(inet_add_offload);

+#endif
+
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
int ret;
@@ -64,6 +68,7 @@ int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
}
EXPORT_SYMBOL(inet_del_protocol);

+#ifdef CONFIG_IP_OFFLOAD
int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
{
int ret;
@@ -76,3 +81,4 @@ int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
return ret;
}
EXPORT_SYMBOL(inet_del_offload);
+#endif
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 6ff7cfd..7ce7aa0 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -10,7 +10,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o

-ipv6-offload := ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o
+ipv6-offload-$(CONFIG_IP_OFFLOAD) := ip6_offload.o tcpv6_offload.o \
+ udp_offload.o exthdrs_offload.o

ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
@@ -43,6 +44,6 @@ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
obj-$(CONFIG_IPV6_GRE) += ip6_gre.o

obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
-obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
+obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload-y)

obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index e048cf1..bae02e1 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -50,6 +50,7 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol
EXPORT_SYMBOL(inet6_del_protocol);
#endif

+#ifdef CONFIG_IP_OFFLOAD
const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS] __read_mostly;

int inet6_add_offload(const struct net_offload *prot, unsigned char protocol)
@@ -71,3 +72,4 @@ int inet6_del_offload(const struct net_offload *prot, unsigned char protocol)
return ret;
}
EXPORT_SYMBOL(inet6_del_offload);
+#endif

--
1.9.0

David Miller

2014-05-06 03:01:18 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Mon, 5 May 2014 15:26:02 -0700

Post by Andi Kleen
It's not needed on small systems.

On the contrary, I would expect the performance gains from
batching the packet receive path to be even larger on slow
small computers.

Andi Kleen

2014-05-06 03:03:28 UTC

Post by David Miller
Date: Mon, 5 May 2014 15:26:02 -0700

Post by Andi Kleen
It's not needed on small systems.

On the contrary, I would expect the performance gains from
batching the packet receive path to be even larger on slow
small computers.

Ok I will rephrase it. The issue is really only the text size.

-Andi

--
***@linux.intel.com -- Speaking for myself only.

Andi Kleen

2014-05-05 22:25:51 UTC

From: Andi Kleen <***@linux.intel.com>

For really small kernels, use only 3 bits for the hash table.

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/core/dev.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index d2c8a06..c6cbe69 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -175,7 +175,14 @@ EXPORT_SYMBOL(dev_base_lock);
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
-static DEFINE_HASHTABLE(napi_hash, 8);
+
+#ifdef CONFIG_BASE_SMALL
+#define NAPI_HASH_BITS 3
+#else
+#define NAPI_HASH_BITS 8
+#endif
+
+static DEFINE_HASHTABLE(napi_hash, NAPI_HASH_BITS);

static seqcount_t devnet_rename_seq;

--
1.9.0

Andi Kleen

2014-05-05 22:26:06 UTC

From: Andi Kleen <***@linux.intel.com>

Make the internal stack socket use UDP. This avoids a dependency
on the soon-to-be-optional RAW sockets.

Generally the users only use rudimentary socket services, mostly
for managing the socket buffer. This is the same for UDP and RAW.

Could in fact remove the arguments.

Signed-off-by: Andi Kleen <***@linux.intel.com>
---
net/ipv4/icmp.c | 9 ++++++++-
net/ipv6/af_inet6.c | 3 +++
net/ipv6/icmp.c | 2 +-
net/ipv6/mcast.c | 2 +-
net/ipv6/ndisc.c | 2 +-
net/ipv6/tcp_ipv6.c | 2 +-
6 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index df3872b..3e111d8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1092,8 +1092,15 @@ static int __net_init icmp_sk_init(struct net *net)
for_each_possible_cpu(i) {
struct sock *sk;

+ /*
+ * Use UDP here. We only use rudimentary
+ * functionality of the socket, and UDP
+ * provides it for us.
+ * This avoids a dependency on the optional
+ * RAW sockets
+ */
err = inet_ctl_sock_create(&sk, PF_INET,
- SOCK_RAW, IPPROTO_ICMP, net);
+ SOCK_DGRAM, 0, net);
if (err < 0)
goto fail;

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9ff80ad..327042a 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -854,6 +854,9 @@ static int __init inet6_init(void)
if (err)
goto out;

+ /* We MUST register UDP sockets before we create the ICMP6,
+ * IGMP6, or NDISC control sockets.
+ */
err = proto_register(&udpv6_prot, 1);
if (err)
goto out_unregister_tcp_proto;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7b32652..7effc19 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -844,7 +844,7 @@ static int __net_init icmpv6_sk_init(struct net *net)

for_each_possible_cpu(i) {
err = inet_ctl_sock_create(&sk, PF_INET6,
- SOCK_RAW, IPPROTO_ICMPV6, net);
+ SOCK_DGRAM, 0, net);
if (err < 0) {
pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
err);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 08b367c..88c0520 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2870,7 +2870,7 @@ static int __net_init igmp6_net_init(struct net *net)
int err;

err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6,
- SOCK_RAW, IPPROTO_ICMPV6, net);
+ SOCK_DGRAM, 0, net);
if (err < 0) {
pr_err("Failed to initialize the IGMP6 control socket (err %d)\n",
err);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 09a22f4..7dd16e1 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1689,7 +1689,7 @@ static int __net_init ndisc_net_init(struct net *net)
int err;

err = inet_ctl_sock_create(&sk, PF_INET6,
- SOCK_RAW, IPPROTO_ICMPV6, net);
+ SOCK_DGRAM, 0, net);
if (err < 0) {
ND_PRINTK(0, err,
"NDISC: Failed to initialize the control socket (err %d)\n",
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e289830..b17499d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1989,7 +1989,7 @@ static struct inet_protosw tcpv6_protosw = {
static int __net_init tcpv6_net_init(struct net *net)
{
return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6,
- SOCK_RAW, IPPROTO_TCP, net);
+ SOCK_DGRAM, 0, net);
}

static void __net_exit tcpv6_net_exit(struct net *net)

--
1.9.0

David Miller

2014-05-06 17:14:42 UTC

From: Eric Dumazet <***@gmail.com>
Date: Tue, 06 May 2014 09:39:19 -0700

I have started using linux on 386/486 pcs which had more than 2MB of
memory, it makes me sad we want linux-3.16 to run on this kind of
hardware, and consuming time to save a few KB here and there.

+1

Rick Jones

2014-05-06 17:12:11 UTC

Making 2MB RAM machines today makes no sense at all.
The lowest end dirt cheap smartphone, something which fits on
someone's pocket, has gigabytes of ram.

The lowest-end smartphone isn't anywhere close to "dirt cheap", and
hardly counts as "embedded" at all anymore. Smartphones cost $100+;
we're talking about systems in the low tens of dollars or less. These
systems will have no graphics, no peripherals, and only one or two
specific functions. The entirety of their functionality will likely
consist of a single userspace program; they might not even have a PID 2.
*That's* the kind of "embedded" we're talking about, not the
supercomputers we carry around in our pockets.

Would this be some sort of "Internet of Things" system?

rick jones

j***@joshtriplett.org

2014-05-06 18:09:19 UTC

Post by Rick Jones

Making 2MB RAM machines today makes no sense at all.
The lowest end dirt cheap smartphone, something which fits on
someone's pocket, has gigabytes of ram.

The lowest-end smartphone isn't anywhere close to "dirt cheap", and
hardly counts as "embedded" at all anymore. Smartphones cost $100+;
we're talking about systems in the low tens of dollars or less. These
systems will have no graphics, no peripherals, and only one or two
specific functions. The entirety of their functionality will likely
consist of a single userspace program; they might not even have a PID 2.
*That's* the kind of "embedded" we're talking about, not the
supercomputers we carry around in our pockets.

Would this be some sort of "Internet of Things" system?

That's one of many buzzwords being used for this kind of system, sure.
The "Internet of" part makes networking particularly important.

- Josh Triplett

Alexei Starovoitov

2014-05-06 15:20:50 UTC

There has been a lot of interest recently to run Linux on very small systems,
like Quark systems. These may have only 2-4MB memory. They are also limited
by flash space.
One problem on these small systems is the size of the network stack.
Currently enabling IPv4 costs about 400k in text, which is prohibitive on
a 2MB system, and very expensive with 4MB.
There were proposals to instead use LWIP in user space. LWIP with
its socket interface comes in at a bit over 100k overhead per application.
I maintain that the Linux network stack is actually not that bloated,
it just has a lot of features :-) The goal of this project was to
subset it in a sensible way so that the native kernel stack becomes
competitive with LWIP.
It turns out that the standard stack has a couple of features that
are not really needed on client systems. Luckily it is also
relatively well modularized, so it becomes possible to stub
out these features at the edge.
With removing these features we still have a powerful TCP/IP stack,
but one that fits better into small systems.
It would have been prohibitive to ifdef every optional feature.
This patchkit relies heavily on LTO to effectively remove unused
code. This allows to disable features only at the module boundaries,
and rely on the compiler to drop unreferenced code and data.
A few features have been also reimplemented in a simpler way.
And I shrank a number of data structures based on CONFIG_BASE_SMALL.
With these changes I can get a fully featured network stack down
to about 170k with LTO. Without LTO there are also benefits,
but somewhat less.
- Full featured like today.
- Client only subset, but still works with standard distribution userland.
Remove some obscure features like fastopen, make all tables smaller,
packet socket mmap code, use a simpler routing table, remove
high speed networking features like RPS, XPS, GRO offload.
Disable SNMP, TCP metrics
- Minimal subset for deeply embedded systems that can use special userland.
Remove rtnetlink (ioctl only), remove ethtool, raw sockets.
Right now I'm using own Kconfigs for every removed features. I realize
this somewhat increases the compile test matrix. It would be possible
to hide some of the options and select them using higher level
configurations like the ones listed above. I haven't done this
in this version.
At this point I'm mainly interested in review and comments.
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat
Main tree
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat-3.14
3.14 based tree.
Thanks to Tom Zanussi for contributions and testing.

What kind of userspace do you use on such a small system?
It looks like you run kernels without procfs and netlink, so not even
ps would work. :)

The microYocto 'distro' I have running with these net-diet patches
doesn't use a full procfs, but a pared-down version (CONFIG_PROCFS_MIN).
Keeping ps working is of course essential, and it does that (along with
a couple other things like /proc/filesystems and /proc/mounts I needed
to boot):
https://github.com/tzanussi/linux-yocto-micro-3.14/commit/68379432afcfa82ac695d9f02892fcf48ade5ae8
Anyway all the userspace and kernel bits are available for anyone who
wants to build it and try it out:
https://github.com/tzanussi/meta-galileo/blob/daisy/meta-galileo/README
It's very much a work-in-progress with a lot of rough edges, but it is a
fully functional system on real hardware (Galileo board/Quark processor)
with a usable shell (ps too!) and web server running on a kernel with
native networking and ~ 750k text size.

Intel Galileo datasheet says:
- 400MHz 32bit Intel
- 512 KBytes of on-die embedded SRAM
- 256 MByte DRAM, enabled by the firmware by default

where did 2-4Mbyte restriction come from?

Anyway, with all these hacks you get a half functional kernel with "a
lot of rough edges"
that is likely working only for the given very limited set of applications.
Kernel function profiling can potentially achieve the same thing.
Profile the kernel with the set of apps and then prune all cold
functions out of kernel.
config explosion and LTO is unnecessary. Just some linker hacks.
Obviously such kernel will also be half functional,
but you'll get big reduction in .text that it seems is the goal of this project.

Tom Zanussi

2014-05-06 15:34:50 UTC

Post by Alexei Starovoitov

There has been a lot of interest recently to run Linux on very small systems,
like Quark systems. These may have only 2-4MB memory. They are also limited
by flash space.
One problem on these small systems is the size of the network stack.
Currently enabling IPv4 costs about 400k in text, which is prohibitive on
a 2MB system, and very expensive with 4MB.
There were proposals to instead use LWIP in user space. LWIP with
its socket interface comes in at a bit over 100k overhead per application.
I maintain that the Linux network stack is actually not that bloated,
it just has a lot of features :-) The goal of this project was to
subset it in a sensible way so that the native kernel stack becomes
competitive with LWIP.
It turns out that the standard stack has a couple of features that
are not really needed on client systems. Luckily it is also
relatively well modularized, so it becomes possible to stub
out these features at the edge.
With removing these features we still have a powerful TCP/IP stack,
but one that fits better into small systems.
It would have been prohibitive to ifdef every optional feature.
This patchkit relies heavily on LTO to effectively remove unused
code. This allows to disable features only at the module boundaries,
and rely on the compiler to drop unreferenced code and data.
A few features have been also reimplemented in a simpler way.
And I shrank a number of data structures based on CONFIG_BASE_SMALL.
With these changes I can get a fully featured network stack down
to about 170k with LTO. Without LTO there are also benefits,
but somewhat less.
- Full featured like today.
- Client only subset, but still works with standard distribution userland.
Remove some obscure features like fastopen, make all tables smaller,
packet socket mmap code, use a simpler routing table, remove
high speed networking features like RPS, XPS, GRO offload.
Disable SNMP, TCP metrics
- Minimal subset for deeply embedded systems that can use special userland.
Remove rtnetlink (ioctl only), remove ethtool, raw sockets.
Right now I'm using own Kconfigs for every removed features. I realize
this somewhat increases the compile test matrix. It would be possible
to hide some of the options and select them using higher level
configurations like the ones listed above. I haven't done this
in this version.
At this point I'm mainly interested in review and comments.
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat
Main tree
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat-3.14
3.14 based tree.
Thanks to Tom Zanussi for contributions and testing.

What kind of userspace do you use on such a small system?
It looks like you run kernels without procfs and netlink, so not even
ps would work. :)

The microYocto 'distro' I have running with these net-diet patches
doesn't use a full procfs, but a pared-down version (CONFIG_PROCFS_MIN).
Keeping ps working is of course essential, and it does that (along with
a couple other things like /proc/filesystems and /proc/mounts I needed
to boot):
https://github.com/tzanussi/linux-yocto-micro-3.14/commit/68379432afcfa82ac695d9f02892fcf48ade5ae8
Anyway all the userspace and kernel bits are available for anyone who
wants to build it and try it out:
https://github.com/tzanussi/meta-galileo/blob/daisy/meta-galileo/README
It's very much a work-in-progress with a lot of rough edges, but it is a
fully functional system on real hardware (Galileo board/Quark processor)
with a usable shell (ps too!) and web server running on a kernel with
native networking and ~ 750k text size.

- 400MHz 32bit Intel
- 512 KBytes of on-die embedded SRAM
- 256 MByte DRAM, enabled by the firmware by default
where did 2-4Mbyte restriction come from?

General 'order-of-magnitude' difference from the typical 'tiny distro'
which typically targets about 16MB, so sort of arbitrary, but it's a
nice round goal for similar systems I'm sure are coming.

Actually, a better goal would be to run only on the 512k SRAM, but let's
start with something more achievable for a first cut.

Post by Alexei Starovoitov
Anyway, with all these hacks you get a half functional kernel with "a
lot of rough edges"

'work-in-progress' see above.

Post by Alexei Starovoitov
that is likely working only for the given very limited set of applications.
Kernel function profiling can potentially achieve the same thing.
Profile the kernel with the set of apps and then prune all cold
functions out of kernel.

Right, and are Profile-Guided-Optimization results now reproducible?
Better change it to Trace-Guided-Optimization. But yeah, for a
single-purpose system where it's known exactly what will run for the
lifetime of the system, it makes sense to get rid of all the codepaths
that will never be hit.

Post by Alexei Starovoitov
config explosion and LTO is unnecessary. Just some linker hacks.
Obviously such kernel will also be half functional,
but you'll get big reduction in .text that it seems is the goal of this project.

Alexei Starovoitov

2014-05-06 17:20:26 UTC

Post by Tom Zanussi

Post by Alexei Starovoitov

There has been a lot of interest recently to run Linux on very small systems,
like Quark systems. These may have only 2-4MB memory. They are also limited
by flash space.
One problem on these small systems is the size of the network stack.
Currently enabling IPv4 costs about 400k in text, which is prohibitive on
a 2MB system, and very expensive with 4MB.
There were proposals to instead use LWIP in user space. LWIP with
its socket interface comes in at a bit over 100k overhead per application.
I maintain that the Linux network stack is actually not that bloated,
it just has a lot of features :-) The goal of this project was to
subset it in a sensible way so that the native kernel stack becomes
competitive with LWIP.
It turns out that the standard stack has a couple of features that
are not really needed on client systems. Luckily it is also
relatively well modularized, so it becomes possible to stub
out these features at the edge.
With removing these features we still have a powerful TCP/IP stack,
but one that fits better into small systems.
It would have been prohibitive to ifdef every optional feature.
This patchkit relies heavily on LTO to effectively remove unused
code. This allows to disable features only at the module boundaries,
and rely on the compiler to drop unreferenced code and data.
A few features have been also reimplemented in a simpler way.
And I shrank a number of data structures based on CONFIG_BASE_SMALL.
With these changes I can get a fully featured network stack down
to about 170k with LTO. Without LTO there are also benefits,
but somewhat less.
- Full featured like today.
- Client only subset, but still works with standard distribution userland.
Remove some obscure features like fastopen, make all tables smaller,
packet socket mmap code, use a simpler routing table, remove
high speed networking features like RPS, XPS, GRO offload.
Disable SNMP, TCP metrics
- Minimal subset for deeply embedded systems that can use special userland.
Remove rtnetlink (ioctl only), remove ethtool, raw sockets.
Right now I'm using own Kconfigs for every removed features. I realize
this somewhat increases the compile test matrix. It would be possible
to hide some of the options and select them using higher level
configurations like the ones listed above. I haven't done this
in this version.
At this point I'm mainly interested in review and comments.
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat
Main tree
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat-3.14
3.14 based tree.
Thanks to Tom Zanussi for contributions and testing.

What kind of userspace do you use on such a small system?
It looks like you run kernels without procfs and netlink, so not even
ps would work. :)

The microYocto 'distro' I have running with these net-diet patches
doesn't use a full procfs, but a pared-down version (CONFIG_PROCFS_MIN).
Keeping ps working is of course essential, and it does that (along with
a couple other things like /proc/filesystems and /proc/mounts I needed
to boot):
https://github.com/tzanussi/linux-yocto-micro-3.14/commit/68379432afcfa82ac695d9f02892fcf48ade5ae8
Anyway all the userspace and kernel bits are available for anyone who
wants to build it and try it out:
https://github.com/tzanussi/meta-galileo/blob/daisy/meta-galileo/README
It's very much a work-in-progress with a lot of rough edges, but it is a
fully functional system on real hardware (Galileo board/Quark processor)
with a usable shell (ps too!) and web server running on a kernel with
native networking and ~ 750k text size.

- 400MHz 32bit Intel
- 512 KBytes of on-die embedded SRAM
- 256 MByte DRAM, enabled by the firmware by default
where did 2-4Mbyte restriction come from?

General 'order-of-magnitude' difference from the typical 'tiny distro'
which typically targets about 16MB, so sort of arbitrary, but it's a
nice round goal for similar systems I'm sure are coming.
Actually, a better goal would be to run only on the 512k SRAM, but let's
start with something more achievable for a first cut.

Post by Alexei Starovoitov
Anyway, with all these hacks you get a half functional kernel with "a
lot of rough edges"

'work-in-progress' see above.

Post by Alexei Starovoitov
that is likely working only for the given very limited set of applications.
Kernel function profiling can potentially achieve the same thing.
Profile the kernel with the set of apps and then prune all cold
functions out of kernel.

Right, and are Profile-Guided-Optimization results now reproducible?
Better change it to Trace-Guided-Optimization. But yeah, for a

not quite. I'm saying: no extra optimizations, no GCC changes.
Compile kernel as-is. Most functions have a stub for mcount() already.
Use it to track whether kernel function was called or not.
Collect this data in userspace (as perf already does), add few
more functions that had 'notrace' attribute on them, and feed this into
special linker that unpacks existing vmlinux, throws away cold functions,
relocates the rest and here you have tiny vmlinux without recompilation.

Post by Tom Zanussi
single-purpose system where it's known exactly what will run for the
lifetime of the system, it makes sense to get rid of all the codepaths
that will never be hit.

Post by Alexei Starovoitov
config explosion and LTO is unnecessary. Just some linker hacks.
Obviously such kernel will also be half functional,
but you'll get big reduction in .text that it seems is the goal of this project.

Andi Kleen

2014-05-06 20:00:47 UTC

Post by Alexei Starovoitov
not quite. I'm saying: no extra optimizations, no GCC changes.
Compile kernel as-is. Most functions have a stub for mcount() already.
Use it to track whether kernel function was called or not.
Collect this data in userspace (as perf already does), add few
more functions that had 'notrace' attribute on them, and feed this into
special linker that unpacks existing vmlinux, throws away cold functions,
relocates the rest and here you have tiny vmlinux without recompilation.

That's very difficult for networking code. How would you know you
exercised all the corner cases in the TCP stack? And you wouldn't
want a remotely exploitable system because some important error
handler is missing.

I agree it may work for some other subsystems.

-Andi

Tom Zanussi

2014-05-06 13:34:06 UTC

There has been a lot of interest recently to run Linux on very small systems,
like Quark systems. These may have only 2-4MB memory. They are also limited
by flash space.
One problem on these small systems is the size of the network stack.
Currently enabling IPv4 costs about 400k in text, which is prohibitive on
a 2MB system, and very expensive with 4MB.
There were proposals to instead use LWIP in user space. LWIP with
its socket interface comes in at a bit over 100k overhead per application.
I maintain that the Linux network stack is actually not that bloated,
it just has a lot of features :-) The goal of this project was to
subset it in a sensible way so that the native kernel stack becomes
competitive with LWIP.
It turns out that the standard stack has a couple of features that
are not really needed on client systems. Luckily it is also
relatively well modularized, so it becomes possible to stub
out these features at the edge.
With removing these features we still have a powerful TCP/IP stack,
but one that fits better into small systems.
It would have been prohibitive to ifdef every optional feature.
This patchkit relies heavily on LTO to effectively remove unused
code. This allows to disable features only at the module boundaries,
and rely on the compiler to drop unreferenced code and data.
A few features have been also reimplemented in a simpler way.
And I shrank a number of data structures based on CONFIG_BASE_SMALL.
With these changes I can get a fully featured network stack down
to about 170k with LTO. Without LTO there are also benefits,
but somewhat less.
- Full featured like today.
- Client only subset, but still works with standard distribution userland.
Remove some obscure features like fastopen, make all tables smaller,
packet socket mmap code, use a simpler routing table, remove
high speed networking features like RPS, XPS, GRO offload.
Disable SNMP, TCP metrics
- Minimal subset for deeply embedded systems that can use special userland.
Remove rtnetlink (ioctl only), remove ethtool, raw sockets.
Right now I'm using own Kconfigs for every removed features. I realize
this somewhat increases the compile test matrix. It would be possible
to hide some of the options and select them using higher level
configurations like the ones listed above. I haven't done this
in this version.
At this point I'm mainly interested in review and comments.
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat
Main tree
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat-3.14
3.14 based tree.
Thanks to Tom Zanussi for contributions and testing.

What kind of userspace do you use on such a small system?
It looks like you run kernels without procfs and netlink, so not even
ps would work. :)

The microYocto 'distro' I have running with these net-diet patches
doesn't use a full procfs, but a pared-down version (CONFIG_PROCFS_MIN).
Keeping ps working is of course essential, and it does that (along with
a couple other things like /proc/filesystems and /proc/mounts I needed
to boot):

https://github.com/tzanussi/linux-yocto-micro-3.14/commit/68379432afcfa82ac695d9f02892fcf48ade5ae8

Anyway all the userspace and kernel bits are available for anyone who
wants to build it and try it out:

https://github.com/tzanussi/meta-galileo/blob/daisy/meta-galileo/README

It's very much a work-in-progress with a lot of rough edges, but it is a
fully functional system on real hardware (Galileo board/Quark processor)
with a usable shell (ps too!) and web server running on a kernel with
native networking and ~ 750k text size.

Tom

j***@joshtriplett.org

2014-05-06 17:21:06 UTC

Date: Tue, 6 May 2014 09:45:46 -0700

The kernel can do the same. Consider the idea of analyzing a set of
userspace programs, determining what kernel functionality they do and
don't need, feeding that information into the kernel build process, and
automatically dropping unused bits of the kernel.

Please make sure I'm not on the list of people who see reports for
bugs reported in that setup.
Thanks :-)

Fine by me. Just please allow such a setup to exist. :)

- Josh Triplett

David Miller

2014-05-06 17:25:01 UTC

From: ***@joshtriplett.org
Date: Tue, 6 May 2014 10:21:06 -0700

Post by j***@joshtriplett.org

Date: Tue, 6 May 2014 09:45:46 -0700

The kernel can do the same. Consider the idea of analyzing a set of
userspace programs, determining what kernel functionality they do and
don't need, feeding that information into the kernel build process, and
automatically dropping unused bits of the kernel.

Please make sure I'm not on the list of people who see reports for
bugs reported in that setup.
Thanks :-)

Fine by me. Just please allow such a setup to exist. :)

You see, that's the point I'm trying to make, once it's upstream
then it's my problem.

You absolutely must consider the burden you put upon upstream
maintainers when you ask for things to be included.

Andi Kleen

2014-05-06 20:06:44 UTC

Post by David Miller
You see, that's the point I'm trying to make, once it's upstream
then it's my problem.

FWIW I don't think any of the changes I proposed would be likely
to add lots of new bugs. Nothing was really adding any significant new logic,
just doing less (modulo perhaps fib_list) Was that your main concern?

I realize the many config options are something that could
be consolidated to ease compile time testing.

-Andi

--
***@linux.intel.com -- Speaking for myself only.

David Miller

2014-05-06 20:47:42 UTC

From: Andi Kleen <***@firstfloor.org>
Date: Tue, 6 May 2014 22:06:44 +0200

Post by Andi Kleen

Post by David Miller
You see, that's the point I'm trying to make, once it's upstream
then it's my problem.

FWIW I don't think any of the changes I proposed would be likely
to add lots of new bugs.

Then you're living in a dream world, one in which the rest of us do
not live.

Every new configuration combination is a new situation that competes
testing wise with the others.

j***@joshtriplett.org

2014-05-06 20:11:45 UTC

Post by David Miller
Date: Tue, 6 May 2014 10:21:06 -0700

Post by j***@joshtriplett.org

Date: Tue, 6 May 2014 09:45:46 -0700

The kernel can do the same. Consider the idea of analyzing a set of
userspace programs, determining what kernel functionality they do and
don't need, feeding that information into the kernel build process, and
automatically dropping unused bits of the kernel.

Please make sure I'm not on the list of people who see reports for
bugs reported in that setup.
Thanks :-)

Fine by me. Just please allow such a setup to exist. :)

You see, that's the point I'm trying to make, once it's upstream
then it's my problem.
You absolutely must consider the burden you put upon upstream
maintainers when you ask for things to be included.

Absolutely. And Andi and I are both interested in working *with* you to
find a way to run on tiny embedded systems *without* necessarily
introducing excessive proliferation of configuration options or
unnecessarily increasing your support burden.

For instance, it's easy enough to key some options off of CONFIG_NR_CPUS
(such as data structure sizes), or introduce a big config switch
(CONFIG_NETWORK_FULL=n or similar) that controls all of these cases
rather than having an option for each. That would not be hard to supply in
a v2 of this patch series.

And if you're asking for someone to help pay attention to bug reports so
you don't have to, that's reasonable as well; just like you probably
have a stock response for "that's a crazy distro kernel, ask them about
it and not me", you could have a stock response for "that kernel has the
crazy embedded option turned on, ask the embedded people about it and
not me". It doesn't just have to be *your* problem alone.

There's a stigma rightfully attached to out-of-tree patches, which
roughly amounts to "people ought to submit patches upstream, we
shouldn't have to support or care about out-of-tree patches". But that
only works if the responses to patch submissions are either "No, because
you need to fix X, Y, and Z", or "No, because your use case is better
served by this existing mechanism already in the kernel", rather than
"No, your use case is not valid".

- Josh Triplett

Andi Kleen

2014-05-06 20:44:17 UTC

Post by j***@joshtriplett.org
And if you're asking for someone to help pay attention to bug reports so
you don't have to, that's reasonable as well; just like you probably
have a stock response for "that's a crazy distro kernel, ask them about
it and not me", you could have a stock response for "that kernel has the
crazy embedded option turned on, ask the embedded people about it and
not me". It doesn't just have to be *your* problem alone.

We could add a tainted flag for these configurations and a message at bootup
to make it obvious in bug reports. Would that help?

-Andi

David Miller

2014-05-06 15:59:41 UTC

From: ***@joshtriplett.org
Date: Tue, 6 May 2014 08:57:03 -0700

Date: Tue, 6 May 2014 05:21:14 +0200

What parts would you remove to get the foot print down for a 2MB
single purpose machine?

I wouldn't use Linux, end of story.
Maybe two decades ago, but not now, those days are over.

Making 2MB RAM machines today makes no sense at all.

The lowest end dirt cheap smartphone, something which fits in
someone's pocket, has gigabytes of ram.

The only entity looking backwards are the people making these
improperly provisioned systems.

David Miller

2014-05-06 17:16:43 UTC

From: ***@joshtriplett.org
Date: Tue, 6 May 2014 09:41:08 -0700

Every KB of RAM costs real money and SoC die area (for eDRAM/eSRAM).

Another poster commented that 16MB of DRAM would be cheaper than
the 2MB of ram you have on these boards, probably one that fits
your size profile is available as well.

2MB is just a ridiculous restriction.

And last time I checked Linux wasn't a special purpose operating
system, but lucky for you I hear there are lots of those around.

Andi Kleen

2014-05-06 18:48:40 UTC

So why bother with a 3.15+ Linux kernel? Why not use an old kernel, e.g. 2.4.x?
2.4.x kernel doesn't have so many new features you want to get rid of here.

Nobody wants to be stuck on an ancient kernel forever.

-Andi

--
***@linux.intel.com -- Speaking for myself only

j***@joshtriplett.org

2014-05-06 17:55:47 UTC

Post by David Miller
Date: Tue, 6 May 2014 09:41:08 -0700

Every KB of RAM costs real money and SoC die area (for eDRAM/eSRAM).

Another poster commented that 16MB of DRAM would be cheaper than
the 2MB of ram you have on these boards, probably one that fits
your size profile is available as well.
2MB is just a ridiculous restriction.

Embedded systems experts disagree with you there; there *are* systems
where the most cost-efficient approach is a few MB (or a few hundred KB)
of non-discrete memory. We're not talking about socketed memory or even
soldered-down memory; we're talking about entire systems that fit on a
small SoC die. The space not used by that extra RAM may well be better
spent on CPU optimizations, or some other integrated component.

Such boards will be built, and many of them will run Linux, despite your
incredulity. When you're building millions of a board, it's well worth
optimizing software to eliminate components from the bill of materials.

And even on a system with 4MB or 8MB or 16MB of memory, a few hundred
extra KB used by the kernel and unavailable to userspace *matters*; that
could be the difference between fitting in 8MB or 16MB.

Post by David Miller
And last time I checked Linux wasn't a special purpose operating
system

No, it's an extremely general-purpose operating system, supporting
enough customization to run on everything from supercomputers to some
embedded systems. And that's partly because people who care about those
systems submit patches to make Linux scale. You take patches to scale
*up* to absurdly huge systems; what makes it so painful to take patches
to scale *down*?

Post by David Miller
, but lucky for you I hear there are lots of those around.

Why would I want to run one of those when I can run Linux?

- Josh Triplett

Richard Cochran

2014-05-06 19:19:11 UTC

So why bother with a 3.15+ Linux kernel? Why not use an old kernel, e.g. 2.4.x?
2.4.x kernel doesn't have so many new features you want to get rid of here.

If you compare a 3.x and a 2.4.x kernel with the same minimal feature
set, you might see that the 3.x is bigger.

Thanks,
Richard

Cong Wang

2014-05-06 18:33:11 UTC

Post by j***@joshtriplett.org

Post by David Miller
Date: Tue, 6 May 2014 09:41:08 -0700

Every KB of RAM costs real money and SoC die area (for eDRAM/eSRAM).

Another poster commented that 16MB of DRAM would be cheaper than
the 2MB of ram you have on these boards, probably one that fits
your size profile is available as well.
2MB is just a ridiculous restriction.

Embedded systems experts disagree with you there; there *are* systems
where the most cost-efficient approach is a few MB (or a few hundred KB)
of non-discrete memory. We're not talking about socketed memory or even
soldered-down memory; we're talking about entire systems that fit on a
small SoC die. The space not used by that extra RAM may well be better
spent on CPU optimizations, or some other integrated component.
Such boards will be built, and many of them will run Linux, despite your
incredulity. When you're building millions of a board, it's well worth
optimizing software to eliminate components from the bill of materials.

So why bother with a 3.15+ Linux kernel? Why not use an old kernel, e.g. 2.4.x?
2.4.x kernel doesn't have so many new features you want to get rid of here.

David Miller

2014-05-06 20:44:10 UTC

From: Cong Wang <***@gmail.com>
Date: Tue, 6 May 2014 11:33:11 -0700

So why bother with a 3.15+ Linux kernel? Why not use an old kernel, e.g. 2.4.x?
2.4.x kernel doesn't have so many new features you want to get rid of here.

+1

j***@joshtriplett.org

2014-05-06 16:41:08 UTC

Post by David Miller
Date: Tue, 6 May 2014 08:57:03 -0700

Date: Tue, 6 May 2014 05:21:14 +0200

What parts would you remove to get the foot print down for a 2MB
single purpose machine?

I wouldn't use Linux, end of story.
Maybe two decades ago, but not now, those days are over.

Making 2MB RAM machines today makes no sense at all.
The lowest end dirt cheap smartphone, something which fits in
someone's pocket, has gigabytes of ram.

The lowest-end smartphone isn't anywhere close to "dirt cheap", and
hardly counts as "embedded" at all anymore. Smartphones cost $100+;
we're talking about systems in the low tens of dollars or less. These
systems will have no graphics, no peripherals, and only one or two
specific functions. The entirety of their functionality will likely
consist of a single userspace program; they might not even have a PID 2.
*That's* the kind of "embedded" we're talking about, not the
supercomputers we carry around in our pockets.

Every KB of RAM costs real money and SoC die area (for eDRAM/eSRAM).
Look at how much cache low-end processors have, and imagine running
entirely out of *that*. Let's not surrender that entire class of
devices to VxWorks, FreeRTOS, and other painfully non-Linux systems,
when we already know it's possible to run Linux on them successfully.

- Josh Triplett

Richard Cochran

2014-05-06 19:14:47 UTC

Making 2MB RAM machines today makes no sense at all.

Besides cost, one of the main reasons for designing tiny systems today
is battery life. Some devices cannot be recharged every week, like
your smart phone must.

The lowest end dirt cheap smartphone, something which fits in
someone's pocket, has gigabytes of ram.

Right, these low end smart phones are nicer than the DEC Alpha work
stations we had at university. I would not call them "small" embedded
systems.

Every KB of RAM costs real money and SoC die area (for eDRAM/eSRAM).
Look at how much cache low-end processors have, and imagine running
entirely out of *that*. Let's not surrender that entire class of
devices to VxWorks, FreeRTOS, and other painfully non-Linux systems,
when we already know it's possible to run Linux on them successfully.

I have also been working on tiny systems recently (and hope to get out
of it soon ;). This whole IoT trend might just go away, I sure hope it
does, but in general there is a growing need for tiny systems with
excellent networking.

Davem's attitude is understandable, and Linux should not be expected
to fit into every last micro controller, but still I observe the kernel
getting ever bigger, even in the most basic configurations. I don't
think there is a valid technical reason for bloat, but rather it is an
issue that doesn't affect too many people.

In any case I would really like to see the possibility of leaving
pieces out for these tiny systems, but it would be a balancing act.
On the one hand, we want the stable/powerful/wonderful Linux stack in
our tiny systems. On the other hand, if we rip everything out to make
it fit, then it is no longer the same thing. So I think Dave is right
in rejecting anything that compromises the _quality_ of the stack.

Off on a tangent:

Regarding the multiplicity of RTOSs out there, all I can say is, they
all suck, especially the ones you pay money for. It would be great to
have a small Linux like OS for micro controllers and tiny micro
processors. I have looked and looked for an open source, posix like
alternative, but all I found was Nuttx, ActionOS, and RTEMS. I looked
closely at the first two, and putting aside technical issues, neither
seems to have any steam in terms of active development. RTEMS says it
has a BSD stack, and it seems to have a respectable development
effort, but I did not look too closely at it.

There is a huge area out there (think of all the Cortex M3) that needs
a real networking stack, but I don't see much hope. Minimizing Linux
is a big PITA (tons of work), and building a suitably small OS from
scratch is hopeless. As was said, it is easier just to buy a bigger
memory. The people who can't or won't (who are also building the IoT)
will just throw in some lwIP or uIP. You can imagine how secure these
systems will be.

Thanks,
Richard

Andi Kleen

2014-05-06 19:50:49 UTC

Post by Richard Cochran
So I think Dave is right
in rejecting anything that compromises the _quality_ of the stack.

I don't think anything I removed compromised quality (modulo bugs)
It's still a more-features-than-your-typical-BSD TCP/IP stack

-Andi

Richard Cochran

2014-05-06 20:07:38 UTC

Post by Andi Kleen

Post by Richard Cochran
So I think Dave is right
in rejecting anything that compromises the _quality_ of the stack.

I don't think anything I removed compromised quality (modulo bugs)
It's still a more-features-than-your-typical-BSD TCP/IP stack

But Dave seems to think so.

My (obvious?) point is, if you make the stack different, and not just
smaller by omitting optional features, then that defeats the point of
wanting the Linux stack in the first place.

Thanks,
Richard

David Miller

2014-05-06 20:46:17 UTC

From: Andi Kleen <***@linux.intel.com>
Date: Tue, 6 May 2014 12:50:49 -0700

Post by Andi Kleen
It's still a more-features-than-your-typical-BSD TCP/IP stack

Said the guy posting patches to remove TCP metrics.

Richard Weinberger

2014-05-06 07:25:19 UTC

There has been a lot of interest recently to run Linux on very small systems,
like Quark systems. These may have only 2-4MB memory. They are also limited
by flash space.
One problem on these small systems is the size of the network stack.
Currently enabling IPv4 costs about 400k in text, which is prohibitive on
a 2MB system, and very expensive with 4MB.
There were proposals to instead use LWIP in user space. LWIP with
its socket interface comes in at a bit over 100k overhead per application.
I maintain that the Linux network stack is actually not that bloated,
it just has a lot of features :-) The goal of this project was to
subset it in a sensible way so that the native kernel stack becomes
competitive with LWIP.
It turns out that the standard stack has a couple of features that
are not really needed on client systems. Luckily it is also
relatively well modularized, so it becomes possible to stub
out these features at the edge.
With removing these features we still have a powerful TCP/IP stack,
but one that fits better into small systems.
It would have been prohibitive to ifdef every optional feature.
This patchkit relies heavily on LTO to effectively remove unused
code. This allows disabling features only at the module boundaries,
and rely on the compiler to drop unreferenced code and data.
A few features have also been reimplemented in a simpler way.
And I shrank a number of data structures based on CONFIG_BASE_SMALL.
With these changes I can get a fully featured network stack down
to about 170k with LTO. Without LTO there are also benefits,
but somewhat less.
- Full featured like today.
- Client only subset, but still works with standard distribution userland.
Remove some obscure features like fastopen, make all tables smaller,
packet socket mmap code, use a simpler routing table, remove
high speed networking features like RPS, XPS, GRO offload.
Disable SNMP, TCP metrics
- Minimal subset for deeply embedded systems that can use special userland.
Remove rtnetlink (ioctl only), remove ethtool, raw sockets.
Right now I'm using my own Kconfigs for every removed feature. I realize
this somewhat increases the compile test matrix. It would be possible
to hide some of the options and select them using higher level
configurations like the ones listed above. I haven't done this
in this version.
At this point I'm mainly interested in review and comments.
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat
Main tree
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc net/debloat-3.14
3.14 based tree.
Thanks to Tom Zanussi for contributions and testing.

What kind of userspace do you use on such a small system?
It looks like you run kernels without procfs and netlink, so not even
ps would work. :)

--
Thanks,
//richard

David Laight

2014-05-06 09:18:25 UTC

From: Andi Kleen

There has been a lot of interest recently to run Linux on very small systems,
like Quark systems. These may have only 2-4MB memory. They are also limited
by flash space.

I'm intrigued about the 2-4MB memory.
That is more than would typically be available on-chip in a DSP or FPGA.
It sounds like an expensive SRAM chip.
OTOH a single SDRAM gives 16MB and DDR a lot more - and are a lot cheaper
and lower power.
Most modern silicon can easily have SDRAM/DDR interfaces.

You may want some size reduction to run in 16MB, but it is not as problematic
as running in 2MB.

With that little memory I wouldn't want to run anything that relied on
dynamic memory allocation (after startup) - except for fixed size data
buffers.

David

122 Replies
8 Views
Permalink to this page
Disable enhanced parsing

Thread Navigation

Andi Kleen 2014-05-05 22:25:52 UTC

Yuchung Cheng 2014-05-05 23:18:05 UTC

Andi Kleen 2014-05-05 23:28:21 UTC

Andi Kleen 2014-05-05 22:25:56 UTC

David Miller 2014-05-06 03:04:27 UTC

Andi Kleen 2014-05-05 22:26:05 UTC

David Miller 2014-05-06 03:06:44 UTC

Andi Kleen 2014-05-05 22:25:58 UTC

David Miller 2014-05-06 03:22:31 UTC

David Miller 2014-05-06 03:11:40 UTC

Andi Kleen 2014-05-06 03:14:12 UTC

Andi Kleen 2014-05-05 22:26:12 UTC

Nicolas Palix 2014-05-06 09:27:27 UTC

David Miller 2014-05-06 15:05:04 UTC

Andi Kleen 2014-05-05 22:26:03 UTC

David Miller 2014-05-06 03:05:58 UTC

Andi Kleen 2014-05-05 22:25:50 UTC

David Miller 2014-05-06 03:03:47 UTC

Andi Kleen 2014-05-05 22:25:53 UTC

David Miller 2014-05-06 03:10:49 UTC

Andi Kleen 2014-05-05 22:26:09 UTC

Andi Kleen 2014-05-05 22:25:54 UTC

David Miller 2014-05-06 03:12:06 UTC

Andi Kleen 2014-05-05 22:26:08 UTC

Andi Kleen 2014-05-06 03:16:21 UTC

David Miller 2014-05-06 03:14:33 UTC

Bjørn Mork 2014-05-06 08:32:03 UTC

Andi Kleen 2014-05-05 22:25:59 UTC

David Miller 2014-05-06 03:10:15 UTC

Andi Kleen 2014-05-05 22:25:55 UTC

David Miller 2014-05-06 03:11:21 UTC

Eric Dumazet 2014-05-06 14:26:42 UTC

Eric Dumazet 2014-05-06 19:25:54 UTC

Andi Kleen 2014-05-06 18:23:44 UTC

Andi Kleen 2014-05-05 22:26:00 UTC

Andi Kleen 2014-05-05 22:26:01 UTC

David Miller 2014-05-06 03:08:30 UTC

Andi Kleen 2014-05-06 03:11:27 UTC

Andi Kleen 2014-05-05 22:26:04 UTC

Andi Kleen 2014-05-05 22:26:07 UTC

David Miller 2014-05-06 03:12:51 UTC

Andi Kleen 2014-05-05 22:26:13 UTC

Andi Kleen 2014-05-05 22:25:57 UTC

Tom Zanussi 2014-05-06 04:39:08 UTC

David Miller 2014-05-06 03:12:29 UTC

j***@joshtriplett.org 2014-05-06 17:30:15 UTC

Eric Dumazet 2014-05-06 17:03:24 UTC

j***@joshtriplett.org 2014-05-06 15:57:03 UTC

Eric Dumazet 2014-05-06 16:39:19 UTC

j***@joshtriplett.org 2014-05-06 16:45:46 UTC

David Miller 2014-05-06 17:17:52 UTC

Tom Herbert 2014-05-06 18:58:38 UTC

j***@joshtriplett.org 2014-05-06 19:37:50 UTC

Andi Kleen 2014-05-06 19:57:04 UTC

Andi Kleen 2014-05-06 18:32:16 UTC

Eric Dumazet 2014-05-06 20:17:58 UTC

j***@joshtriplett.org 2014-05-06 20:27:19 UTC

Andi Kleen 2014-05-06 20:37:43 UTC

David Miller 2014-05-06 20:48:29 UTC

David Miller 2014-05-06 03:23:27 UTC

Andi Kleen 2014-05-06 03:21:14 UTC

Andi Kleen 2014-05-05 22:26:10 UTC

David Miller 2014-05-06 03:09:25 UTC

Andi Kleen 2014-05-05 22:26:11 UTC

David Miller 2014-05-06 03:21:05 UTC

David Miller 2014-05-06 03:02:59 UTC

Andi Kleen 2014-05-06 03:08:33 UTC

Andi Kleen 2014-05-05 22:26:02 UTC

David Miller 2014-05-06 03:01:18 UTC

Andi Kleen 2014-05-06 03:03:28 UTC

Andi Kleen 2014-05-05 22:25:51 UTC

Andi Kleen 2014-05-05 22:26:06 UTC

David Miller 2014-05-06 17:14:42 UTC

Rick Jones 2014-05-06 17:12:11 UTC

j***@joshtriplett.org 2014-05-06 18:09:19 UTC

Alexei Starovoitov 2014-05-06 15:20:50 UTC

Tom Zanussi 2014-05-06 15:34:50 UTC

Alexei Starovoitov 2014-05-06 17:20:26 UTC

Andi Kleen 2014-05-06 20:00:47 UTC

Tom Zanussi 2014-05-06 13:34:06 UTC

j***@joshtriplett.org 2014-05-06 17:21:06 UTC

David Miller 2014-05-06 17:25:01 UTC

Andi Kleen 2014-05-06 20:06:44 UTC

David Miller 2014-05-06 20:47:42 UTC

j***@joshtriplett.org 2014-05-06 20:11:45 UTC

Andi Kleen 2014-05-06 20:44:17 UTC

David Miller 2014-05-06 15:59:41 UTC

David Miller 2014-05-06 17:16:43 UTC

Andi Kleen 2014-05-06 18:48:40 UTC

j***@joshtriplett.org 2014-05-06 17:55:47 UTC

Richard Cochran 2014-05-06 19:19:11 UTC

Cong Wang 2014-05-06 18:33:11 UTC

David Miller 2014-05-06 20:44:10 UTC

j***@joshtriplett.org 2014-05-06 16:41:08 UTC

Richard Cochran 2014-05-06 19:14:47 UTC

Andi Kleen 2014-05-06 19:50:49 UTC

Richard Cochran 2014-05-06 20:07:38 UTC

David Miller 2014-05-06 20:46:17 UTC

Richard Weinberger 2014-05-06 07:25:19 UTC

David Laight 2014-05-06 09:18:25 UTC

about - legalese

Loading...