diff -uNr linux-2.6.25.6/include/linux/sysctl.h linux-2.6.25.6-tcpcp/include/linux/sysctl.h --- linux-2.6.25.6/include/linux/sysctl.h 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/include/linux/sysctl.h 2008-06-11 12:25:39.000000000 -0400 @@ -435,6 +435,7 @@ NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_MAX_SSTHRESH=124, NET_TCP_FRTO_RESPONSE=125, + NET_TCP_CP_PRIV=126, }; enum { diff -uNr linux-2.6.25.6/include/linux/tcpcp.h linux-2.6.25.6-tcpcp/include/linux/tcpcp.h --- linux-2.6.25.6/include/linux/tcpcp.h 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.25.6-tcpcp/include/linux/tcpcp.h 2008-06-11 14:26:05.000000000 -0400 @@ -0,0 +1,227 @@ +/* + * linux/tcpcp.h - TCP connection passing, data structures and prototypes + * + * Written 2002 by Werner Almesberger + */ + +#ifndef _LINUX_TCPCP_H +#define _LINUX_TCPCP_H + +#ifdef __KERNEL__ +//#include +//#include +#endif /* __KERNEL__ */ + + +/* + * The socket option numbers should be in linux/tcp.h, but tcp.h can't be + * included by user space, so in order to avoid duplication, they go here. + */ + +#define TCP_MAXICISIZE 20 /* Max. size of Internal Conn. Info */ +#define TCP_ICI 21 /* Retrieve/set Internal Conn. Info */ +#define TCP_CP_FN 22 /* Perform special tcpcp operation */ + + +/* TCP_CP_FN sub-function codes */ +#define TCPCP_ACTIVATE 0 /* Activate dormant connection */ + +/* + * For simplicity, ICIs (Internal Connection Information) use a fixed-size + * struct, which is followed by the variable-size send and receive buffers. + * + * All ICI elements are padded to a multiple of four bytes. Numbers are always + * in network byte order. + * + * When extending the ICI, fields that can safely be ignored should be added in + * new IEs, before the buffer list, and only the minor version number needs to + * be incremented. If the content of existing IEs changes, or if new + * information can't be simply ignored, the major version number must be + * incremented. 
+ * + * Should ICI use TLVs (Type-Length-Value), like netlink does ? Well, maybe. + * TLVs are more flexible, but they have also a bit more implementation + * overhead, and you can't just "print" them from a debugger. So for now, + * plain structs are better for development. + */ + +/* + * ICIEs represent a more or less arbitrary division of ICI data. The idea + * behind dividing this into separate elements is to allow for future + * replacements of relatively small blocks, in case kernel data structures + * change. + */ + +/* ICI element: ICI version and flags */ + +struct tcpcp_icie_version { + uint8_t major; /* incompatible structure revision */ + /* 0: current version */ + uint8_t minor; /* compatible structure extension */ + /* 0: current version */ + uint8_t ip_version; /* IP version */ + /* 4: IPv4 */ + uint8_t __pad1; + uint16_t flags; /* see TCPCP_ICIF_*, below */ + uint16_t __pad2; + uint16_t ici_hdr_size; /* sizeof(struct tcpcp_ici) */ + uint16_t buf_hdr_size; /* sizeof(struct tcpcp_icie_buf) */ +}; + +enum { + TCPCP_ICIF_USEPERF = 1, /* use perf. 
data (tcpcp_set_cong) */ +}; + + +/* ICI element: globally unique TCP connection ID */ + +struct tcpcp_icie_id4 { + uint32_t ip_src; /* source IP address */ + uint32_t ip_dst; /* destination IP address */ +}; + +struct tcpcp_icie_id { + union { + struct tcpcp_icie_id4 v4; /* IPv4 */ + } ip; + uint16_t tcp_sport; /* TCP source port */ + uint16_t tcp_dport; /* TCP destination port */ +}; + + +/* ICI element: fixed general data */ + +struct tcpcp_icie_fixgen { + uint8_t tcp_flags; /* TCP flags; from linux/tcp.h */ + /* 1: TCPI_OPT_TIMESTAMPS */ + /* 2: TCPI_OPT_SACK */ + /* 4: TCPI_OPT_WSCALE */ + /* 8: TCPI_OPT_ECN */ + uint8_t snd_wscale; /* send window scale (0 if unused) */ + uint8_t rcv_wscale; /* receive window scale (0 if unused) */ + uint8_t __pad; + uint16_t snd_mss; /* MSS sent */ + uint16_t rcv_mss; /* MSS received */ +}; + + +/* ICI element: variable general data */ + +struct tcpcp_icie_vargen { + uint8_t state; /* connection state; from linux/tcp.h */ + /* 1: TCP_ESTABLISHED */ + /* 2: TCP_SYN_SENT */ + /* 3: TCP_SYN_RECV */ + /* 4: TCP_FIN_WAIT1 */ + /* 5: TCP_FIN_WAIT2 */ + /* 6: TCP_TIME_WAIT */ + /* 7: TCP_CLOSE */ + /* 8: TCP_CLOSE_WAIT */ + /* 9: TCP_LAST_ACK */ + /* 10: TCP_LISTEN */ + /* 11: TCP_CLOSING */ + /* Note: TCP_ICI may not ever use some of these + values. 
*/ + uint8_t __pad1; + uint8_t __pad2; + uint8_t __pad3; + uint32_t snd_nxt; /* sequence number of next new byte to send */ + uint32_t rcv_nxt; /* sequence number of next new byte expected to + receive */ + uint32_t snd_wnd; /* window received from peer */ + uint32_t rcv_wnd; /* window advertized to peer */ + uint32_t ts_recent; /* cached timestamp from peer (0 if none) */ + uint32_t ts_gen; /* current locally generated timestamp */ + /* (0 if not using timestamps) */ +}; + + +/* ICI element: congestion avoidance data */ + +struct tcpcp_icie_cong { +}; + + +/* ICI element: connection statistics */ + +struct tcpcp_icie_stat { + /* [0-3]: retransmits + / * [4-7]: probes sent + / * [8-11]: backoff */ +}; + + +/* ICI element: send or receive buffer */ + +struct tcpcp_icie_buf { + /*** These fields must be first and in this order ! ******************/ + uint8_t type; /* buffer type (TCPCP_ICIE_BUF_*, see below) */ + uint8_t __pad; /**/ + uint16_t length; /* segment data length */ + /*********************************************************************/ + uint32_t seq; /* sequence number of first byte */ + uint8_t data[0]; /* data, padded to multiple of 4 bytes */ +}; + +enum { + TPCPC_ICIE_BUF_SND = 1, /* send buffer (only TCP segment, no IP) */ + TPCPC_ICIE_BUF_OOO = 2, /* out of order buffer (only TCP segment) */ +}; + + +/* Internal Connection Information (ICI) */ + +struct tcpcp_ici { + uint32_t ici_length; /* total length of ICI */ + struct tcpcp_icie_version v; /* ICI version and flags */ + struct tcpcp_icie_id id; /* globally unique TCP connection ID */ + struct tcpcp_icie_fixgen fixgen; /* fixed general data */ + struct tcpcp_icie_vargen vargen; /* variable general data */ + struct tcpcp_icie_cong cong; /* congestion avoidance data */ + struct tcpcp_icie_stat stat; /* connection statistics */ + /* ----- ADD NEW IEs HERE ----- */ + struct tcpcp_icie_buf buf[0]; +}; + +/* + * Buffers are in sequence, first all send, then all out-of-order buffers. 
+ * Buffers must not overlap, and may not contain any extraneous data (e.g. + * ack'ed bytes, or such). snd_nxt does not have to be at a buffer boundary. + */ + + +#ifdef __KERNEL__ + +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) + +#include + +extern int sysctl_tcpcp_privileged; + +extern int (*tcpcp_maxicisize_hook)(struct sock *sk,int *size); +extern int (*tcpcp_getici_hook)(struct sock *sk,struct tcpcp_ici *user_ici, + int *user_size); +extern int (*tcpcp_setici_hook)(struct sock *sk, + const struct tcpcp_ici *user_ici,int size); +extern int (*tcpcp_fn_hook)(struct sock *sk,int fn_code); + +void tcpcp_lock_hooks(void); +void tcpcp_unlock_hooks(void); + +int tcpcp_maxicisize(struct sock *sk,int *size); +int tcpcp_getici(struct sock *sk,struct tcpcp_ici *user_ici,int *user_size); +int tcpcp_setici(struct sock *sk,const struct tcpcp_ici *user_ici,int size); +int tcpcp_fn(struct sock *sk,int fn_code); + +#else /* defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) */ + +#define tcpcp_maxicisize(sk,val) (-ENOPROTOOPT) +#define tcpcp_getici(sk,val,size) (-ENOPROTOOPT) +#define tcpcp_setici(sk,val,size) (-ENOPROTOOPT) +#define tcpcp_fn(sk,fn_code) (-ENOPROTOOPT) + +#endif /* !defined(CONFIG_TCPCP) && !defined(CONFIG_TCPCP_MODULE) */ + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_TCPCP_H */ diff -uNr linux-2.6.25.6/include/linux/tcp.h linux-2.6.25.6-tcpcp/include/linux/tcp.h --- linux-2.6.25.6/include/linux/tcp.h 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/include/linux/tcp.h 2008-06-11 13:16:21.000000000 -0400 @@ -405,6 +405,10 @@ /* TCP MD5 Signagure Option information */ struct tcp_md5sig_info *md5sig_info; #endif + +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) + u32 ts_offset; /* offset from tcp_time_stamp */ +#endif }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -423,6 +427,9 @@ u16 tw_md5_keylen; u8 tw_md5_key[TCP_MD5SIG_MAXKEYLEN]; #endif +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) + 
u32 ts_offset; +#endif }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) diff -uNr linux-2.6.25.6/include/net/sock.h linux-2.6.25.6-tcpcp/include/net/sock.h --- linux-2.6.25.6/include/net/sock.h 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/include/net/sock.h 2008-06-11 17:00:27.000000000 -0400 @@ -443,6 +443,12 @@ return sk->sk_ack_backlog > sk->sk_max_ack_backlog; } +static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) +{ + sk->sk_wmem_queued += skb->truesize; + sk->sk_forward_alloc -= skb->truesize; +} + /* * Compute minimal free write space needed to queue new packets. */ diff -uNr linux-2.6.25.6/include/net/tcp.h linux-2.6.25.6-tcpcp/include/net/tcp.h --- linux-2.6.25.6/include/net/tcp.h 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/include/net/tcp.h 2008-06-11 14:29:28.000000000 -0400 @@ -1381,4 +1381,14 @@ extern void tcp_v4_init(struct net_proto_family *ops); extern void tcp_init(void); +#define tw_time_stamp(tw) tp_time_stamp(tw) + +//extern struct tcp_func ipv4_specific; + +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) +#define tp_time_stamp(tp) (tcp_time_stamp+(tp)->ts_offset) +#else +#define tp_time_stamp(tp) tcp_time_stamp +#endif + #endif /* _TCP_H */ diff -uNr linux-2.6.25.6/net/ipv4/Kconfig linux-2.6.25.6-tcpcp/net/ipv4/Kconfig --- linux-2.6.25.6/net/ipv4/Kconfig 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/net/ipv4/Kconfig 2008-06-11 12:41:59.000000000 -0400 @@ -291,6 +291,14 @@ and you should also say Y to "Kernel/User network link driver", below. If unsure, say N. +config TCPCP + tristate "IP: TCP connection mobility (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + Support for retrieving internal information of TCP connections, and + recreating connections from this information. Highly experimental, + so if in doubt, say N. 
+ config SYN_COOKIES bool "IP: TCP syncookie support (disabled per default)" ---help--- diff -uNr linux-2.6.25.6/net/ipv4/Makefile linux-2.6.25.6-tcpcp/net/ipv4/Makefile --- linux-2.6.25.6/net/ipv4/Makefile 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/net/ipv4/Makefile 2008-06-11 12:44:33.000000000 -0400 @@ -11,7 +11,7 @@ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o \ - inet_fragment.o + inet_fragment.o tcpcp_hooks.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o @@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +obj-$(CONFIG_TCPCP) += tcpcp.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o diff -uNr linux-2.6.25.6/net/ipv4/sysctl_net_ipv4.c linux-2.6.25.6-tcpcp/net/ipv4/sysctl_net_ipv4.c --- linux-2.6.25.6/net/ipv4/sysctl_net_ipv4.c 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/net/ipv4/sysctl_net_ipv4.c 2008-06-11 12:48:32.000000000 -0400 @@ -31,6 +31,8 @@ extern seqlock_t sysctl_port_range_lock; extern int sysctl_local_port_range[2]; +extern int sysctl_tcpcp_privileged; + /* Update system visible IP port range */ static void set_local_port_range(int range[2]) { @@ -641,6 +643,16 @@ .mode = 0644, .proc_handler = &proc_dointvec, }, +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) + { + .ctl_name = NET_TCP_CP_PRIV, + .procname = "tcpcp_privileged", + .data = &sysctl_tcpcp_privileged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", diff -uNr linux-2.6.25.6/net/ipv4/tcp.c linux-2.6.25.6-tcpcp/net/ipv4/tcp.c --- linux-2.6.25.6/net/ipv4/tcp.c 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/net/ipv4/tcp.c 2008-06-11 14:25:36.000000000 -0400 @@ -263,6 
+263,7 @@ #include #include #include +#include #include #include @@ -2152,6 +2153,17 @@ break; #endif +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) + case TCP_ICI: + err = tcpcp_setici(sk,(const struct tcpcp_ici *) optval, + optlen); + break; + + case TCP_CP_FN: + err = tcpcp_fn(sk,val); + break; +#endif + default: err = -ENOPROTOOPT; break; @@ -2319,7 +2331,19 @@ case TCP_QUICKACK: val = !icsk->icsk_ack.pingpong; break; +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) + case TCP_MAXICISIZE: { + int error; + + error = tcpcp_maxicisize(sk,&val); + if (error) return error; + break; + } + case TCP_ICI: + return tcpcp_getici(sk,(struct tcpcp_ici *) optval,optlen); + +#endif case TCP_CONGESTION: if (get_user(len, optlen)) return -EFAULT; diff -uNr linux-2.6.25.6/net/ipv4/tcpcp.c linux-2.6.25.6-tcpcp/net/ipv4/tcpcp.c --- linux-2.6.25.6/net/ipv4/tcpcp.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.25.6-tcpcp/net/ipv4/tcpcp.c 2008-06-11 17:20:44.000000000 -0400 @@ -0,0 +1,1112 @@ +/* + * tcpcp.c - TCP connection passing + * + * Written 2002-2005 by Werner Almesberger + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#if 1 +#define DPRINTK(...) printk(KERN_DEBUG __VA_ARGS__) +#else +#define DPRINTK(...) 
+#endif + +#define TCPCP_CURRENT_ICI_MAJOR 0 +#define TCPCP_CURRENT_ICI_MINOR 0 + +#define TCP_TIME_RETRANS 1 + +static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb) +{ + skb_truesize_check(skb); + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + sk->sk_wmem_queued -= skb->truesize; + sk->sk_forward_alloc += skb->truesize; + kfree_skb(skb); +} + +/* ----- Get maximum ICI size ---------------------------------------------- */ + + +static int __tcpcp_maxicisize(struct sock *sk,int *size) +{ + DPRINTK("tcpcp_maxicisize(sk %p,size %p)\n",sk,size); + if (sk->sk_socket->state != SS_CONNECTED) + return -EBADFD; + lock_sock(sk); + /* + * Allocations against sk->sk_rcvbuf and sk->sk_sndbuf are made for + * skb->sk_truesize, so they include skb overhead, which is + * significantly bigger than per-buffer overhead in the ICI, so the ICI + * size estimate errs on the safe side. + */ + *size = sizeof(struct tcpcp_ici)+sk->sk_rcvbuf+sk->sk_sndbuf; + release_sock(sk); + DPRINTK(" size = %d\n",*size); + return 0; +} + + +/* ----- Get Internal Connection Information (ICI) ------------------------- */ + + +/* + * No flags ? Indeed, we don't copy TCP flags. + * + * Since we only dump connections in state ESTABLISHED, we don't have to worry + * about segments with SYN, or outbound FIN. Also, any inbound segments with + * FIN have not been acknowledged yet (or the connection would be in + * CLOSE_WAIT), so we can just ignore the FIN bit, and wait for the + * retransmission. + * + * The PSH flag is of little practical relevance, and easily regenerated in the + * outbound direction, so we just ignore it. RST is similar to FIN - if + * something was wrong enough to cause RST, it will continue to be wrong when + * the retransmission hits. We don't queue outbound RSTs. + * + * The information of inbound ACKs is already represented in the connection + * state, so the ACK data in inbound segments is redundant. 
Outbound ACKs will
+ * just be re-generated, with up to date information.
+ *
+ * This leaves URG. For now, we just completely ignore that stuff. (@@@)
+ */
+
+static int put_buffer(uint8_t type,struct tcpcp_icie_buf **user_buf,
+    const void *end,struct sk_buff *skb)
+{
+	struct tcpcp_icie_buf buf,*next;
+	int error,length;
+
+	DPRINTK("put_buffer(type %d,user_buf %p,end %p,skb %p\n",
+	    type,*user_buf,end,skb);
+
+	/*
+	 * Emit skb as one buffer IE at *user_buf and step *user_buf past the
+	 * padded data; -EMSGSIZE if it would cross "end". The cloning and
+	 * linearizing is terribly inefficient, but the data is copied a few
+	 * times anyway, and normally these skbs never go on the network.
+	 */
+	error = skb_linearize(skb);
+	if (error) return error;
+
+	buf.type = type;
+	length = TCP_SKB_CB(skb)->end_seq-TCP_SKB_CB(skb)->seq;
+	buf.length = htons(length);
+	buf.seq = htonl(TCP_SKB_CB(skb)->seq);
+	DPRINTK(" skb->len %u, length %d, seq %lu\n",skb->len,length,
+	    (unsigned long) ntohl(buf.seq));
+	next = (struct tcpcp_icie_buf *) ((*user_buf)->data+((length+3) & ~3));
+	if ((const void *) next > end)		/* overruns user buffer */
+		return -EMSGSIZE;
+	if (copy_to_user(*user_buf,&buf,sizeof(buf)) ||
+	    copy_to_user((*user_buf)->data,skb_tail_pointer(skb)-length,length))
+		return -EFAULT;
+	*user_buf = next;
+	return 0;
+}
+
+
+/*
+ * In order to avoid keeping the socket locked "forever", and accumulating a
+ * hefty backlog, we unlock the socket during copies to user space. Changes to
+ * socket variables during these copies don't matter, because we either don't
+ * guarantee atomicity anyway (TCP_ICI), or we have silenced the socket, so no
+ * responses will go to the peer, and the only thing that could change (e.g. if
+ * the user attempts concurrent writes) is buffer content, in which case we
+ * report an error.
+ *
+ * (Probably false) assumption: if anything happens to the send/retransmit
+ * buffer, we see this in a change of the list head/tail.
+ *
+ * If there's no useful way for detecting inconsistencies, maybe we just have
+ * to drop TCP_ICI without shutdown. It's not all that useful anyway ...
+ */ + + +static int copy_queue(struct sock *sk,uint8_t type,struct tcpcp_icie_buf **buf, + const void *end,struct sk_buff_head *queue) +{ + const struct sk_buff *old_head,*old_tail; + struct sk_buff *skb,*clone; + int error; + + DPRINTK("copy_queue(sk %p,type %d,buf %p->%p,end %p,queue %p\n",sk, + type,buf,*buf,end,queue); + old_head = skb_peek(queue); + old_tail = skb_peek_tail(queue); + skb_queue_walk(queue,skb) { + clone = skb_clone(skb,GFP_KERNEL); + if (!clone) + return -ENOMEM; + release_sock(sk); + error = put_buffer(type,buf,end,clone); + lock_sock(sk); + kfree_skb(clone); + if (error) return error; + if (old_head != skb_peek(queue) || + old_tail != skb_peek_tail(queue)) + return -EBUSY; + } + return 0; +} + + +static int do_getici(struct sock *sk,struct tcpcp_ici *ici, + struct tcpcp_ici *user_ici,int *size) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct tcpcp_icie_buf *buf; + void *end; + int error; + + DPRINTK("do_getici(sk %p,ici %p,user_ici %p,size %p = %d)\n",sk,ici, + user_ici,size,*size); + DPRINTK(" sk->sk_send_head %p (seq %lu)\n",sk->sk_send_head, + sk->sk_send_head ? 
+ (unsigned long) TCP_SKB_CB(sk->sk_send_head)->seq : 0); + DPRINTK(" tp->snd_una = %lu, tp->snd_sml = %lu\n", + (unsigned long) tp->snd_una,(unsigned long) tp->snd_sml); + DPRINTK(" tp->pushed_seq = %lu, skb->copied_seq = %lu\n", + (unsigned long) tp->pushed_seq,(unsigned long) tp->copied_seq); + DPRINTK(" tp->snd_wnd = %lu, tp->rcv_wnd = %lu\n", + (unsigned long) tp->snd_wnd,(unsigned long) tp->rcv_wnd); + DPRINTK(" tp->packets_out %lu, sk->sk_wmem_alloc %lu\n", + (unsigned long) tp->packets_out, + (unsigned long) atomic_read(&sk->sk_wmem_alloc)); + DPRINTK(" sk->sk_wmem_alloc %lu, sk->sk_wmem_queued %lu, " + "sk->sk_sndbuf %lu\n", + (unsigned long) atomic_read(&sk->sk_wmem_alloc), + (unsigned long) sk->sk_wmem_queued,(unsigned long) sk->sk_sndbuf); + + memset(ici,0,sizeof(*ici)); + /* make sure padding bytes don't leak data */ + + /* version */ + ici->v.major = TCPCP_CURRENT_ICI_MAJOR; + ici->v.minor = TCPCP_CURRENT_ICI_MINOR; + ici->v.ip_version = 4; + ici->v.flags = 0; + ici->v.ici_hdr_size = htons(sizeof(struct tcpcp_ici)); + ici->v.buf_hdr_size = htons(sizeof(struct tcpcp_icie_buf)); + + /* globally unique TCPv4 connection ID */ + ici->id.ip.v4.ip_src = inet->saddr; + ici->id.ip.v4.ip_dst = inet->daddr; + ici->id.tcp_sport = inet->sport; + ici->id.tcp_dport = inet->dport; + + /* fixed general data */ + ici->fixgen.tcp_flags = 0; + if (tp->rx_opt.tstamp_ok) + ici->fixgen.tcp_flags |= TCPI_OPT_TIMESTAMPS; + if (tp->rx_opt.sack_ok) + ici->fixgen.tcp_flags |= TCPI_OPT_SACK; + if (tp->rx_opt.wscale_ok) { + ici->fixgen.tcp_flags |= TCPI_OPT_WSCALE; + ici->fixgen.snd_wscale = tp->rx_opt.snd_wscale; + ici->fixgen.rcv_wscale = tp->rx_opt.rcv_wscale; + } + else { + ici->fixgen.snd_wscale = 0; + ici->fixgen.rcv_wscale = 0; + } + if (tp->ecn_flags & TCP_ECN_OK) + ici->fixgen.tcp_flags |= TCPI_OPT_ECN; + ici->fixgen.snd_mss = htons(tp->advmss); + ici->fixgen.rcv_mss = htons(tp->rx_opt.mss_clamp); + + /* variable general data */ + ici->vargen.state = sk->sk_state; + 
ici->vargen.snd_nxt = htonl(tp->snd_nxt);
+	ici->vargen.rcv_nxt = htonl(tp->rcv_nxt);
+	ici->vargen.snd_wnd = htonl(tp->snd_wnd);
+	ici->vargen.rcv_wnd = htonl(tcp_receive_window(tp));
+	ici->vargen.ts_gen = htonl(tp_time_stamp(tp));
+	ici->vargen.ts_recent =
+	    htonl(tp->rx_opt.saw_tstamp ? tp->rx_opt.ts_recent : 0);
+
+	/* copy buffers */
+	buf = (struct tcpcp_icie_buf *) &user_ici->buf;
+	end = (char *) user_ici+*size;
+	error = copy_queue(sk,TPCPC_ICIE_BUF_SND,&buf,end,&sk->sk_write_queue);
+	if (error)
+		return error;
+	error = copy_queue(sk,TPCPC_ICIE_BUF_OOO,&buf,end,
+	    &tp->out_of_order_queue);
+	if (error)
+		return error;
+
+	*size = (char *) buf-(char *) user_ici;
+	ici->ici_length = htonl(*size);
+	DPRINTK(" size = %d\n",*size);
+	return 0;
+}
+
+/* Get the ICI; *user_size carries buffer room in, and bytes used out. */
+static int do_tcpcp_getici(struct sock *sk,struct tcpcp_ici *user_ici,
+    int *user_size)
+{
+	struct tcpcp_ici *ici;
+	int size,error;
+
+	DPRINTK("do_tcpcp_getici(sk %p,user_ici %p,user_size %p)\n",sk,
+	    user_ici,user_size);
+
+	if (get_user(size,user_size))
+		return -EFAULT;
+	if (size < (int) sizeof(struct tcpcp_ici))
+		return -EMSGSIZE;	/* (int): size < 0 must fail too */
+
+	ici = kmalloc(sizeof(struct tcpcp_ici),GFP_KERNEL);
+	if (!ici)
+		return -ENOMEM;
+
+	lock_sock(sk);
+	error = do_getici(sk,ici,user_ici,&size);
+	release_sock(sk);
+
+	if (!error && put_user(size,user_size))
+		error = -EFAULT;
+	if (!error && copy_to_user(user_ici,ici,sizeof(*ici)))
+		error = -EFAULT;
+
+	kfree(ici);
+	return error;
+}
+
+
+/* ----- Get Internal Connection Information (ICI), plus shutdown ---------- */
+
+
+/*
+ * FFS: don't filter, but reconstruct any changes possibly caused by new
+ * packets on the fly. shouldn't be too hard, because in most cases, we
+ * put them into the backlog anyway. @@@
+ *
+ * Advantages:
+ * - no dependency on CONFIG_FILTER
+ * - no conflict with obscure uses of CONFIG_FILTER
+ *
+ * What can happen ?
+ * - we get moved from ESTABLISHED to CLOSE_WAIT + * - segments move from ooo to receive buffer + * - overlapping segments get recombined (aiee!) + * - new segments (ACK, window probe) get added to send buffer (are they + * queued ?) + * - user may add/remove data, but that's a violation of the stability + * requirement, and we just quit in this case + */ + +static struct sk_filter drop_all = { + .refcnt = ATOMIC_INIT(1), /* never kfree this ... */ + .len = 0, /* return 0 */ +}; + + +static struct tcp_func tcpcp_af_specific; + + +static int tcpcp_discard_xmit(struct sk_buff *skb,int ipfragok) +{ + DPRINTK("tcpcp_discard_xmit(skb %p)\n",skb); + kfree_skb(skb); + return 0; +} + + +/* + * NOTE: backlog is lost. There's probably not much there anyway ... + * (At least not in a way that would be particularly useful for us - i.e. + * segments accumulated while we're holding the socket lock would be pretty + * hard to use.) + * + * NOTE: also the error queue is lost. The data in there isn't guaranteed + * anyway, and actual uses of this with TCP seem to be rare. + */ + +static int __tcpcp_getici(struct sock *sk,struct tcpcp_ici *user_ici, + int *user_size) +{ + struct tcp_sock *tp = tcp_sk(sk); + int error; + + DPRINTK("tcpcp_getici(sk %p,user_ici %p,user_size %p)\n",sk,user_ici, + user_size); + + /* + * @@@ Allow TCP states SYN_RCVD, SYN_SENT, ESTABLISHED, and + * CLOSE_WAIT. May also allow accept-but-no-SYNACK state, FFS. + */ + if (sk->sk_socket->state != SS_CONNECTED) + return -EBADFD; + + error = sock_error(sk); + if (error) + return error; + lock_sock(sk); + + /* + * Set up dummy socket filter that just drops all inbound packets. + */ + if (sk->sk_filter) sk_filter_release(sk->sk_filter); + sk->sk_filter = &drop_all; + sk_filter_charge(sk,sk->sk_filter); + +#ifdef CONFIG_TCP_MD5SIG + /* + * Prevent socket from sending by inserting a fake set of AF-specific + * functions. Very ugly, of course. 
+ */ + tp->af_specific = &tcpcp_af_specific; +#endif + /* + * Since our socket is now deaf and mute, there's no point in lingering + */ + sock_set_flag(sk,SOCK_LINGER); + sk->sk_lingertime = 0; + + release_sock(sk); + + error = do_tcpcp_getici(sk,user_ici,user_size); + if (error) + return error; + + /* + * Something may have set sk->sk_err while we've been copying data. If + * so, return it now. + */ + return sock_error(sk); +} + + +/* ----- Set Internal Connection Information (ICI) ------------------------- */ + + +/* + * Instead of populating every little field "manually", we choose the more + * drastic but ultimately probably more reliable approach of going through as + * much of the normal connection setup process as possible. This bears the + * following risks: + * + * - fields may be overlooked, and initialized to invalid values + * - there may be side-effects (e.g. spurious packet emission) + * - cleanup after tcp_v4_connect is kind of messy + * + * However, it has the following benefits: + * + * - fields aren't likely to be left uninitialized + * - most sanity checks (consistency, compatibility, security, etc.) are + * performed, including ones that may be added in the future + * - tcpcp has a lot fewer explicit dependencies on the rest of the TCP code + * - tcpcp code is simpler + * + * We could possibly trade the messy cleanup for bad karma by also faking the + * SYN+ACK. + */ + + +/* + * tcpcp_compatibility checks that a connection with the characteristics + * described in the ICI is compatible with the functionality available on the + * local system. If not, the operation fails. (E.g. if SACK is disabled here, + * we cannot negotiate out of using it.) 
+ */ + +static int tcpcp_compatibility(struct sock *sk,const struct tcpcp_ici *ici) +{ + uint16_t flags = ntohs(ici->v.flags); + + DPRINTK("tcpcp_compatibility(sk %p,ici %p,flags %u)\n",sk,ici, + (unsigned) flags); + + if ((flags & TCPCP_ICIF_USEPERF) && !capable(CAP_NET_RAW)) + return -EPERM; + + if ((ici->fixgen.tcp_flags & TCPI_OPT_TIMESTAMPS) && + !sysctl_tcp_timestamps) + return -ENOPROTOOPT; + if ((ici->fixgen.tcp_flags & TCPI_OPT_SACK) && !sysctl_tcp_sack) + return -ENOPROTOOPT; + if ((ici->fixgen.tcp_flags & TCPI_OPT_WSCALE) && + !sysctl_tcp_window_scaling) + return -ENOPROTOOPT; + if ((ici->fixgen.tcp_flags & TCPI_OPT_ECN) && !sysctl_tcp_ecn) + return -ENOPROTOOPT; + +// MSS compatibility ? +// max. window >= advertized window ? (taking into account pending data) + return 0; +} + + +/* + * tcpcp_bind just binds the socket, plain and simple. Binding does not leave + * any nasty surprises like running timers, so we don't need to worry about + * locking at this point. (inet_bind does its own locking.) + */ + +static int tcpcp_bind(struct sock *sk,const struct tcpcp_ici *ici) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in addr; + + DPRINTK("tcpcp_bind(sk %p,ici %p)\n",sk,ici); + /* + * In some cases, we may not want to use the same address/port + * combination as in the original connection, e.g. if the connection + * switching is done by a NAT. + * + * Users can manipulate the source and destination address/port pairs + * directly in the ICI. However, this would not allow them to let the + * kernel look up the local port, without also reserving it. In order + * to support that, setici simply skips the bind step if the socket is + * already bound. 
+ */ + if (inet->rcv_saddr) + return 0; + + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = ici->id.ip.v4.ip_src; + addr.sin_port = ici->id.tcp_sport; + return inet_bind(sk->sk_socket,(struct sockaddr *) &addr,sizeof(addr)); +} + + +static int tcpcp_send_buf(struct sock *sk,void *data,uint32_t seq,int length) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int error = 0; + + DPRINTK("tcpcp_send_buf(sk %p,data %p,seq %lu,length %d)\n",sk,data, + (unsigned long) seq,length); + + /* + * @@@ check sequence numbers + */ + + skb = alloc_skb(length+MAX_TCP_HEADER,GFP_KERNEL); + if (!skb) + return -ENOMEM; + skb_reserve(skb,MAX_TCP_HEADER); + skb->csum = csum_and_copy_from_user(data,skb_put(skb,length),length,0, + &error); + if (error) { + kfree_skb(skb); + return error; + } + + /* + * Do what tcp.c:skb_entail does ... + */ + TCP_SKB_CB(skb)->seq = seq; + TCP_SKB_CB(skb)->end_seq = seq+length; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; /* @@@ sure ? */ + TCP_SKB_CB(skb)->sacked = 0; + sk_charge_skb(sk,skb); + __skb_queue_tail(&sk->sk_write_queue,skb); + + /* + * @@@ Can send_head point to buffers with partially new and partially + * old data ? If not, and somebody feeds us with such buffers (If they + * come from a Linux kernel, send_head rules should be compatible. That + * is, unless someone changed the rules.), we need to split the skb. + */ + + if (between(tp->snd_nxt,seq,seq+length-1)) + sk->sk_send_head = skb; + if (!sk->sk_send_head) + tp->packets_out += tcp_skb_pcount(skb); + return 0; +} + + +static int tcpcp_ooo_buf(struct sock *sk,void *data,uint32_t seq,int length) +{ + DPRINTK("tcpcp_ooo_buf(sk %p,data %p,seq %lu,length %d)\n",sk,data, + (unsigned long) seq,length); + + /* + * Lazy bastard trick #666: TCP doesn't actually require us to preserve + * OOO data, so we don't. 
Ha, that was easy :-) + */ + return 0; + + +#if 0 + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* + * Make sure we don't get excessively unreasonable data ... + */ + if (atomic_read(&sk->sk_rmem_alloc)+length < sk->sk_rcvbuf) + return -ENOBUFS; + skb = alloc_skb(length,GFP_KERNEL); + if (!skb) + return -ENOMEM; +/* + * when populating buffers, remember skb->cb (see tcp_ipv4.c:tcp_v4_rcv) ! ??? + * .. and tcp.c:skb_entail + */ + skb->dev = + skb->nh.iph = + skb->th.tcp = + + if (copy_from_user(skb->h.th,data,length)) { + kfree_skb(skb); + return -EFAULT; + } + /* + * Packets in the out-of-order queue are already checksummed, so we + * probably don't need to re-checksum everything. (FFS @@@) + */ + tcp_data_queue(sk,skb); + if (skb_peek(&tp->out_of_order_queue) != skb) { + printk(KERN_DEBUG "tcpcp_ooo_buf: skb didn't show up in " + "queue\n"); + return -ENOBUFS; + } + + return 0; +#endif +} + + +static int tcpcp_buffers(struct sock *sk,const struct tcpcp_ici *ici, + const struct tcpcp_ici *user_ici,int size) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcpcp_icie_buf *buf = user_ici->buf; + void *end = (char *) user_ici+size; + int first_send = 1; + + DPRINTK("tcpcp_buffers(sk %p,ici %p,user_ici %p,size %d)\n",sk,ici, + user_ici,size); +/* + * set up sequence numbers so that tcp_data_queue works @@@ + */ + tp->snd_nxt = tp->snd_una = tp->write_seq = ntohl(ici->vargen.snd_nxt); +/* + * BEWARE!! 
tcp_output.c:tcp_connect_init resets lots of variables, including
+ * err, snd_una, snd_sml, rcv_nxt, rcv_wup, and copied_seq
+ */
+	DPRINTK(" tp->rcv_nxt %lu\n",(unsigned long) tp->rcv_nxt);
+
+	while (buf != end) {
+		struct tcpcp_icie_buf buf_dsc;
+		int length,error;
+		uint32_t seq;
+
+		DPRINTK(" (buf %p, end %p)\n",buf,end);
+		if ((void *) buf->data > end ||
+		    copy_from_user(&buf_dsc,buf,sizeof(*buf)))
+			return -EFAULT;	/* header outside ICI, or bad ptr */
+		length = ntohs(buf_dsc.length);
+		if ((void *) (buf->data+((length+3) & ~3)) > end)
+			return -EFAULT;	/* data would leave the ICI */
+		seq = ntohl(buf_dsc.seq);
+		DPRINTK(" (type %d,length %d,seq %lu)\n",buf_dsc.type,length,
+		    (unsigned long) seq);
+		switch (buf_dsc.type) {
+			case TPCPC_ICIE_BUF_SND:
+				if (first_send) {
+					tp->snd_una = seq;
+					first_send = 0;
+				}
+				else {
+					if (sk->sk_send_head) {
+						if (seq != tp->write_seq)
+							return -EINVAL;
+					}
+					else {
+						if (before(seq,tp->write_seq))
+							return -EINVAL;
+					}
+				}
+				tp->write_seq = seq+length;
+				error = tcpcp_send_buf(sk,buf->data,seq,
+				    length);
+				break;
+			case TPCPC_ICIE_BUF_OOO:
+				error = tcpcp_ooo_buf(sk,buf->data,seq,length);
+				break;
+			default:
+				return -EINVAL;
+		}
+		if (error) return error;
+		buf = (struct tcpcp_icie_buf *) (buf->data+((length+3) & ~3));
+	}
+	if (!between(tp->snd_nxt,tp->snd_una,tp->write_seq))
+		return -EINVAL;
+	return 0;
+}
+
+
+/*
+ * tcpcp_connect walks TCP through to the SYN_SENT state. Since we don't want
+ * TCP to really emit a SYN packet, we "mute" the socket during all this. Note
+ * that the socket is locked, and will remain so, until after we've forced the
+ * socket into full compliance with the ICI.
+ */
+
+/*
+ * control enabling of SACK, wscale, etc.
@@@ + */ + +static int tcpcp_connect(struct sock *sk,const struct tcpcp_ici *ici) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sockaddr_in addr; + struct sk_buff *skb; + uint32_t old_snd_una = tp->snd_una; + uint8_t flags; + int error; + + DPRINTK("tcpcp_connect(sk %p,ici %p)\n",sk,ici); + /* + * "Mute" socket + */ +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &tcpcp_af_specific; +#endif + tp->rx_opt.mss_clamp = ntohs(ici->fixgen.rcv_mss); + tp->advmss = ntohs(ici->fixgen.snd_mss); + +// set up mss, sequence, timestamps?, etc. + + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = ici->id.ip.v4.ip_dst; + addr.sin_port = ici->id.tcp_dport; + error = tcp_v4_connect(sk,(struct sockaddr *) &addr,sizeof(addr)); + if (error) + return error; + + /* + * Get rid of the SYN segment enqueued by tcp_v4_connect + */ + skb = __skb_dequeue_tail(&sk->sk_write_queue); + if (!skb) { + printk(KERN_ERR "tcpcp_connect: tcp_v4_connect did not enqueue " + "anything\n"); + return -EINVAL; + } + flags = TCP_SKB_CB(skb)->flags; + if (!(flags & TCPCB_FLAG_SYN) || (flags & TCPCB_FLAG_ACK)) { + printk(KERN_ERR "tcpcp_connect: tcp_v4_connect did not enqueue " + "SYN segment (0x%02x)\n",flags); + error = -EINVAL; + } + if (sk->sk_send_head == skb) + sk->sk_send_head = NULL; + tp->packets_out -= tcp_skb_pcount(skb); + + /* + * Undo sequence number change for (discarded) SYN bit + */ + tp->write_seq--; + tp->snd_nxt--; + tp->pushed_seq--; + + /* + * And, of course, tcp_connect also changes snd_una ... + */ + tp->snd_una = old_snd_una; + + sk_stream_free_skb(sk,skb); + tcp_clear_xmit_timers(sk); + return error; +} + + +/* + * tcpcp_fixup adjusts all variables such that the connection looks like the + * one described in the ICI. At the end of tcpcp_fixup, the connection is in + * a valid state and fully operational. 
+ */ + +static int tcpcp_fixup(struct sock *sk,const struct tcpcp_ici *ici) +{ + struct tcp_sock *tp = tcp_sk(sk); + + DPRINTK("tcpcp_fixup(sk %p,ici %p)\n",sk,ici); + DPRINTK(" (1) tp->mss_clamp %lu, tp->mss_cache %lu\n", + (unsigned long) tp->rx_opt.mss_clamp, + (unsigned long) tp->mss_cache); + tp->rcv_nxt = tp->copied_seq = tp->rcv_wup = tp->snd_wl1 = + ntohl(ici->vargen.rcv_nxt); + tp->snd_wnd = tp->max_window = ntohl(ici->vargen.snd_wnd); + tp->rcv_wnd = ntohl(ici->vargen.rcv_wnd); + DPRINTK(" (1) sk->sk_rcvbuf %lu, sk->sk_sndbuf %lu\n", + (unsigned long) sk->sk_rcvbuf,(unsigned long) sk->sk_sndbuf); + + /* @@@ should detect these a little earlier ... */ + if (tp->rcv_wnd > tcp_space(sk)) { + int i; + + if (tp->rcv_wnd > tcp_win_from_space(sysctl_tcp_rmem[2])) + return -ENOBUFS; + /* + * tcp_win_from_space^-1 + * This is easier than + * sk->sk_rcvbuf = tp->rcv_wnd << -sysctl_tcp_adv_win_scale, + * and + * sk->sk_rcvbuf = tp->rcv_wnd/(1-2^-sysctl_tcp_adv_win_scale) + */ + for (i = 30; i >= 0; i--) { + if (tp->rcv_wnd >= + tcp_win_from_space((sk->sk_rcvbuf- + atomic_read(&sk->sk_rmem_alloc)) | (1 << i))) + sk->sk_rcvbuf |= 1 << i; + } + if (tp->rcv_wnd > tcp_space(sk)) + sk->sk_rcvbuf++; + } + if (sk->sk_sndbuf < sk->sk_wmem_queued) { + if (sk->sk_wmem_queued > sysctl_tcp_wmem[2]) + return -ENOBUFS; + sk->sk_sndbuf = sk->sk_wmem_queued; + /* + * @@@ better ?: export tcp_input.c:tcp_fixup_sndbuf and use + * that + */ + } + DPRINTK(" (2) sk->sk_rcvbuf %lu, sk->sk_sndbuf %lu\n", + (unsigned long) sk->sk_rcvbuf,(unsigned long) sk->sk_sndbuf); + + tp->rx_opt.tstamp_ok = !!(ici->fixgen.tcp_flags & TCPI_OPT_TIMESTAMPS); + tp->rx_opt.sack_ok = !!(ici->fixgen.tcp_flags & TCPI_OPT_SACK); + tp->rx_opt.wscale_ok = !!(ici->fixgen.tcp_flags & TCPI_OPT_WSCALE); + tp->ecn_flags = ici->fixgen.tcp_flags & TCPI_OPT_ECN ? 
TCP_ECN_OK : 0; + tp->tcp_header_len = sizeof(struct tcphdr); + if (tp->rx_opt.tstamp_ok) + tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; + tp->rx_opt.snd_wscale = ici->fixgen.snd_wscale; + tp->rx_opt.rcv_wscale = ici->fixgen.rcv_wscale; + + if (tp->rx_opt.tstamp_ok && ici->vargen.ts_recent) { + /* + * like tcp_input.c:tcp_store_ts_recent + */ + tp->rx_opt.ts_recent = ntohl(ici->vargen.ts_recent); + tp->rx_opt.ts_recent_stamp = xtime.tv_sec; + } + tp->ts_offset = ntohl(ici->vargen.ts_gen)-tcp_time_stamp; + + tp->rx_opt.mss_clamp = ntohs(ici->fixgen.rcv_mss); + tcp_sync_mss(sk,dst_mtu(__sk_dst_get(sk))); + DPRINTK(" (2) tp->mss_clamp %lu, tp->mss_cache %lu\n", + (unsigned long) tp->rx_opt.mss_clamp, + (unsigned long) tp->mss_cache); + + sk->sk_socket->state = SS_CONNECTED; + tcp_set_state(sk,TCP_ESTABLISHED); + + /* + * Ready to go ! + */ +#ifdef CONFIG_TCP_MD5SIG + tp->af_specific = &ipv4_specific; +#endif + return 0; +} + + +static int do_setici(struct sock *sk,const struct tcpcp_ici *ici, + const struct tcpcp_ici *user_ici,int size) +{ + int error; + + DPRINTK("do_setici(sk %p,ici %p,user_ici %p,size %d)\n", + sk,ici,user_ici,size); + if (ici->v.major != TCPCP_CURRENT_ICI_MAJOR) + return -EINVAL; + if (size < sizeof(*ici) || size != ntohl(ici->ici_length)) + return -EINVAL; + error = tcpcp_compatibility(sk,ici); + if (error) + return error; + error = tcpcp_bind(sk,ici); + if (error) + return error; + error = tcpcp_buffers(sk,ici,user_ici,size); + if (error) + return error; + lock_sock(sk); /* weird races may happen */ + error = tcpcp_connect(sk,ici); + if (!error) + error = tcpcp_fixup(sk,ici); + + /* + * In case of an error, don't make the poor socket linger ... 
+ */ + if (error) { + sock_set_flag(sk,SOCK_LINGER); + sk->sk_lingertime = 0; + } + release_sock(sk); + return error; +} + + +static int __tcpcp_setici(struct sock *sk,const struct tcpcp_ici *user_ici, + int size) +{ + struct tcpcp_ici *ici; + int error = 0; + + DPRINTK("tcpcp_setici(sk %p,user_ici %p,size %d)\n",sk,user_ici,size); + if (sysctl_tcpcp_privileged && !capable(CAP_NET_RAW)) + return -EPERM; + if (sk->sk_family != PF_INET) + return -EPFNOSUPPORT; + if (sk->sk_type != SOCK_STREAM) + return -ESOCKTNOSUPPORT; + if (sk->sk_protocol != IPPROTO_TCP) + return -EPROTONOSUPPORT; + /* @@@ disconnect/kill socket if not unconnected ? */ + if (sk->sk_socket->state != SS_UNCONNECTED) + return -EBADFD; + +#ifndef CODE_IS_MATURE + /* + * Fine, we don't care or even know about compatibility for now. + * So, anything that doesn't fit _exactly_, just gets thrown away. + * Later on, we should leave this decision to do_setici. + */ + if (size < sizeof(struct tcpcp_ici)) + return -EINVAL; +#endif + if (size < sizeof(struct tcpcp_icie_version)) + return -EINVAL; + /* wrong - need to consider fixed-size prefix @@@ */ + /* use &version[1] */ + /* + * Various functions called by do_setici will attempt to lock the + * socket, so we must undo tcp_setsockopt's locking here. + * + * We need to take into account that users may attempt to change the + * socket state underneath us, so at least we shouldn't oops in such + * cases. (FFS) @@@ + */ + release_sock(sk); + ici = kmalloc(sizeof(struct tcpcp_ici),GFP_KERNEL); + if (!ici) { + error = -ENOMEM; + goto out; + } + if (copy_from_user(ici,user_ici, + size > sizeof(struct tcpcp_ici) ? sizeof(struct tcpcp_ici) : size)) + error = -EFAULT; + else error = do_setici(sk,ici,user_ici,size); + +out: + lock_sock(sk); + kfree(ici); + return error; +} + + +/* ----- Activate dormant connection (kick it) ----------------------------- */ + + +/* + * Check that the connection isn't doing anything. 
In particular, we want to + * keep people from successfully kicking "normal" connections. + * + * The indicators we use are timers and whether we've received an ACK since + * resurrecting the connection. The latter is needed, because timers are also + * stopped if all outbound data has been ACKed, and we don't have any pending + * delayed ACKs. + * + * @@@ The tp->rcv_tstamp test fails every once in a while, and can probably + * be exploited for emitting a burst of restart segments. Need to also check + * a reliable indicator that data hs been sent. + */ + +static int check_tcp_idle(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); +#if 0 + DPRINTK("check_tcp_idle(sk %p,tp %p)\n",sk,tp); + DPRINTK(" tp->pending %d, tp->ack.pending %d\n",tp->pending, + tp->ack.pending); + DPRINTK(" tp->ack.lrcvtime %lu, tp->rcv_tstamp %lu\n", + (unsigned long) tp->ack.lrcvtime,(unsigned long) tp->rcv_tstamp); + if (tp->pending || tp->ack.pending) + return -EALREADY; + if (tp->rcv_tstamp) + return -EALREADY; +#endif + return 0; +} + + +/* + * tcpcp_restart tries to get data to flow. The socket is unmuted, but still + * locked, so tcpcp_start can perform all operations ordinary TCP uses. + */ + +static void tcpcp_restart(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + DPRINTK("tcpcp_restart(sk %p)\n",sk); + DPRINTK(" tp->rcv_nxt %lu, tp->rcv_wup %lu\n", + (unsigned long) tp->rcv_nxt, (unsigned long) tp->rcv_wup); + + /* + * If there is no data to send, all we can do is emit an ACK for the + * last segment received from the peer. The usefulness of this is + * probably marginal, since a single ACK won't be enough to cause Fast + * Retransmit, and there shouldn't be any unACKed in-order data at this + * moment. The only case where this ACK is useful is if we have shrunk + * our window to 0. + */ + skb = skb_peek(&sk->sk_write_queue); + if (!skb) { + DPRINTK(" Empty write queue -> ACK\n"); + /* + * Actually, I lied. 
For now, we're just silent :-) @@@ + */ + /* SEEMS TO WORK */ + return; + } + + /* + * If we have data to retransmit, we will just do so now. This may + * cause unnecessary retransmission. FFS @@@. + */ + if (sk->sk_send_head != skb) { + DPRINTK(" Got something to retransmit\n"); + DPRINTK(" skb %p, seq %lu...%lu, tp->snd_una = %lu\n",skb, + (unsigned long) TCP_SKB_CB(skb)->seq, + (unsigned long) TCP_SKB_CB(skb)->end_seq, + (unsigned long) tp->snd_una); + DPRINTK(" sk->sk_wmem_alloc %lu, sk->sk_wmem_queued %lu, " + "sk->sk_sndbuf %lu\n", + (unsigned long) atomic_read(&sk->sk_wmem_alloc), + (unsigned long) sk->sk_wmem_queued, + (unsigned long) sk->sk_sndbuf); + DPRINTK(" tp->snd_wnd %lu\n",(unsigned long) tp->snd_wnd); + DPRINTK(" tp->packets_out %lu\n", + (unsigned long) tp->packets_out); + tcp_retransmit_skb(sk,skb_peek(&sk->sk_write_queue)); + /* SEEMS TO WORK */ + /* ANOMALY - skbs get either split into smaller segments, or + merged to very large sizes, indicating that something's + wrong with our (nearly nonexistent) MSS calculation */ + return; + } + + /* + * Okay, so we have data to send, and nothing waiting for + * retransmission. If the receives can still take more data + * (window > 0), we send some. + */ + if (tp->snd_wnd) { + DPRINTK(" Can send more data (window %lu)\n", + (unsigned long) tp->snd_wnd); + /* is this really correct ??? looks too easy :-) @@@ */ + tcp_push_one(sk,tcp_current_mss(sk,1)); + /* NOT TESTED (need to simulate flow where we're stopped by + cwnd ...) */ + return; + } + + /* + * If all else fails, we can still send a window probe ... 
+ */ + DPRINTK(" Sending window probe\n"); + tcp_send_probe0(sk); + /* SEEMS TO WORK */ +} + + +static int __tcpcp_fn(struct sock *sk,int fn_code) +{ + int error; + + DPRINTK("tcpcp_fn(sk %p,fn_code %d)\n",sk,fn_code); + + if (sk->sk_socket->state != SS_CONNECTED) + return -EBADFD; + if (sk->sk_family != PF_INET) + return -EPFNOSUPPORT; + if (sk->sk_type != SOCK_STREAM) + return -ESOCKTNOSUPPORT; + if (sk->sk_protocol != IPPROTO_TCP) + return -EPROTONOSUPPORT; + + switch (fn_code) { + case TCPCP_ACTIVATE: + error = check_tcp_idle(sk); + if (!error) + tcpcp_restart(sk); + break; + default: + error = -ENOPROTOOPT; + } + return error; +} + + +/* ----- Initialization and (module) exit ---------------------------------- */ + + +static int __init tcpcp_init(void) +{ +#ifdef CONFIG_TCP_MD5SIG + tcpcp_af_specific = ipv4_specific; + tcpcp_af_specific.queue_xmit = tcpcp_discard_xmit; +#endif + tcpcp_lock_hooks(); + tcpcp_maxicisize_hook = __tcpcp_maxicisize; + tcpcp_getici_hook = __tcpcp_getici; + tcpcp_setici_hook = __tcpcp_setici; + tcpcp_fn_hook = __tcpcp_fn; + tcpcp_unlock_hooks(); + + printk(KERN_INFO "tcpcp: ready for mischief (ICI format version " + "%u.%u)\n",TCPCP_CURRENT_ICI_MAJOR,TCPCP_CURRENT_ICI_MINOR); + return 0; +} + + +static void __exit tcpcp_exit(void) +{ + tcpcp_lock_hooks(); + tcpcp_maxicisize_hook = NULL; + tcpcp_getici_hook = NULL; + tcpcp_setici_hook = NULL; + tcpcp_fn_hook = NULL; + tcpcp_unlock_hooks(); +} + + +module_init(tcpcp_init); +module_exit(tcpcp_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Werner Almesberger "); +MODULE_DESCRIPTION("TCP Connection Passing"); diff -uNr linux-2.6.25.6/net/ipv4/tcpcp_hooks.c linux-2.6.25.6-tcpcp/net/ipv4/tcpcp_hooks.c --- linux-2.6.25.6/net/ipv4/tcpcp_hooks.c 1969-12-31 19:00:00.000000000 -0500 +++ linux-2.6.25.6-tcpcp/net/ipv4/tcpcp_hooks.c 2008-06-11 14:30:56.000000000 -0400 @@ -0,0 +1,129 @@ +/* + * tcpcp_hooks.c - Hooks when using TCP connection passing as a module + * + * Written 2002,2004 by Werner 
Almesberger + */ + + +//#include + +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) + +#include +#include +#include +#include + + +int sysctl_tcpcp_privileged = 1; /* setting TCP_ICI required CAP_NET_RAW */ + +EXPORT_SYMBOL(sysctl_tcpcp_privileged); + + +/* ----- Hooks ------------------------------------------------------------- */ + + +int (*tcpcp_maxicisize_hook)(struct sock *sk,int *size) = NULL; +int (*tcpcp_getici_hook)(struct sock *sk,struct tcpcp_ici *user_ici, + int *user_size) = NULL; +int (*tcpcp_setici_hook)(struct sock *sk,const struct tcpcp_ici *user_ici, + int size) = NULL; +int (*tcpcp_fn_hook)(struct sock *sk,int fn_code) = NULL; + +EXPORT_SYMBOL(tcpcp_maxicisize_hook); +EXPORT_SYMBOL(tcpcp_getici_hook); +EXPORT_SYMBOL(tcpcp_setici_hook); +EXPORT_SYMBOL(tcpcp_fn_hook); + + +static DECLARE_RWSEM(tcpcp_sem); + + +void tcpcp_lock_hooks(void) +{ + down_write(&tcpcp_sem); +} + +EXPORT_SYMBOL(tcpcp_lock_hooks); + + +void tcpcp_unlock_hooks(void) +{ + up_write(&tcpcp_sem); +} + +EXPORT_SYMBOL(tcpcp_unlock_hooks); + + +/* ----- Interface to ipv4/tcp.c ------------------------------------------- */ + + +/* + * @@@ Known bug: we race with module unloading vs. request_module. E.g. if + * tcpcp is loaded, the hook tests may pass, so we don't call request_module. + * Then, an unload may happen, before we down tcpcp_sem. The correct solution + * is to down tcpcp_sem before testing the hook, and to set some flag that + * tells tcpcp.c:tcpcp_init that the hooks are already locked. (Actually, make + * this a semaphore we down_trylock, so that concurrent manual loading doesn't + * break things.) + * + * Since this race is pretty obscure, we keep this as a good exercise for + * future regression testing with umlsim. 
+ */ + +int tcpcp_maxicisize(struct sock *sk,int *size) +{ + int error = -ENOSYS; + + if (!tcpcp_maxicisize_hook) + request_module("tcpcp"); + down_read(&tcpcp_sem); + if (tcpcp_maxicisize_hook) + error = tcpcp_maxicisize_hook(sk,size); + up_read(&tcpcp_sem); + return error; +} + + +int tcpcp_getici(struct sock *sk,struct tcpcp_ici *user_ici,int *user_size) +{ + int error = -ENOSYS; + + if (!tcpcp_getici_hook) + request_module("tcpcp"); + down_read(&tcpcp_sem); + if (tcpcp_getici_hook) + error = tcpcp_getici_hook(sk,user_ici,user_size); + up_read(&tcpcp_sem); + return error; +} + + +int tcpcp_setici(struct sock *sk,const struct tcpcp_ici *user_ici,int size) +{ + int error = -ENOSYS; + + if (!tcpcp_setici_hook) + request_module("tcpcp"); + down_read(&tcpcp_sem); + if (tcpcp_setici_hook) + error = tcpcp_setici_hook(sk,user_ici,size); + up_read(&tcpcp_sem); + return error; +} + + +int tcpcp_fn(struct sock *sk,int fn_code) +{ + int error = -ENOSYS; + + if (!tcpcp_fn_hook) + request_module("tcpcp"); + down_read(&tcpcp_sem); + if (tcpcp_fn_hook) + error = tcpcp_fn_hook(sk,fn_code); + up_read(&tcpcp_sem); + return error; +} + +#endif /* defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) */ diff -uNr linux-2.6.25.6/net/ipv4/tcp_input.c linux-2.6.25.6-tcpcp/net/ipv4/tcp_input.c --- linux-2.6.25.6/net/ipv4/tcp_input.c 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/net/ipv4/tcp_input.c 2008-06-11 12:56:10.000000000 -0400 @@ -1990,7 +1990,7 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { - return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); + return (tp_time_stamp(tcp_sk(sk)) - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } static inline int tcp_head_timedout(struct sock *sk) @@ -2794,7 +2794,7 @@ struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; - u32 now = tcp_time_stamp; + u32 now = tp_time_stamp(tp); int fully_acked = 1; int flag = 0; u32 
pkts_acked = 0; @@ -4904,7 +4904,7 @@ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, - tcp_time_stamp)) { + tp_time_stamp(tp))) { NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; } @@ -5391,3 +5391,7 @@ EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); EXPORT_SYMBOL(tcp_initialize_rcv_mss); +EXPORT_SYMBOL(sysctl_tcp_timestamps); +EXPORT_SYMBOL(sysctl_tcp_window_scaling); +EXPORT_SYMBOL(sysctl_tcp_sack); +EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); diff -uNr linux-2.6.25.6/net/ipv4/tcp_ipv4.c linux-2.6.25.6-tcpcp/net/ipv4/tcp_ipv4.c --- linux-2.6.25.6/net/ipv4/tcp_ipv4.c 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/net/ipv4/tcp_ipv4.c 2008-06-11 14:28:41.000000000 -0400 @@ -610,7 +610,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, struct sk_buff *skb, u32 seq, u32 ack, - u32 win, u32 ts) + u32 win, u32 ts_out, u32 ts) { struct tcphdr *th = tcp_hdr(skb); struct { @@ -637,7 +637,7 @@ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - rep.opt[1] = htonl(tcp_time_stamp); + rep.opt[1] = htonl(ts_out); rep.opt[2] = htonl(ts); arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED; } @@ -705,8 +705,9 @@ tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, + tw_time_stamp(tcptw), tcptw->tw_ts_recent); - + inet_twsk_put(tw); } @@ -715,7 +716,7 @@ { tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, - req->ts_recent); + tcp_time_stamp, req->ts_recent); } /* diff -uNr linux-2.6.25.6/net/ipv4/tcp_minisocks.c linux-2.6.25.6-tcpcp/net/ipv4/tcp_minisocks.c --- linux-2.6.25.6/net/ipv4/tcp_minisocks.c 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/net/ipv4/tcp_minisocks.c 2008-06-11 14:30:19.000000000 -0400 @@ -327,6 +327,10 @@ } while (0); #endif +#if defined(CONFIG_TCPCP) || defined(CONFIG_TCPCP_MODULE) + tcptw->ts_offset = tp->ts_offset; +#endif + /* 
Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); diff -uNr linux-2.6.25.6/net/ipv4/tcp_output.c linux-2.6.25.6-tcpcp/net/ipv4/tcp_output.c --- linux-2.6.25.6/net/ipv4/tcp_output.c 2008-06-09 14:27:19.000000000 -0400 +++ linux-2.6.25.6-tcpcp/net/ipv4/tcp_output.c 2008-06-11 13:24:22.000000000 -0400 @@ -1424,7 +1424,7 @@ /* We're ready to send. If this fails, the probe will * be resegmented into mss-sized pieces by tcp_write_xmit(). */ - TCP_SKB_CB(nskb)->when = tcp_time_stamp; + TCP_SKB_CB(nskb)->when = tp_time_stamp(tp); if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. */ @@ -1504,7 +1504,7 @@ unlikely(tso_fragment(sk, skb, limit, mss_now))) break; - TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(skb)->when = tp_time_stamp(tcp_sk(sk)); if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) break; @@ -1568,7 +1568,7 @@ return; /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(skb)->when = tp_time_stamp(tcp_sk(sk)); if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { tcp_event_new_data_sent(sk, skb); @@ -1905,7 +1905,7 @@ /* Make a copy, if the first transmission SKB clone we made * is still in somebody's hands, else make a clone. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(skb)->when = tp_time_stamp(tp); err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); @@ -2126,7 +2126,7 @@ tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK | TCPCB_FLAG_RST); /* Send it off. 
*/ - TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(skb)->when = tp_time_stamp(tcp_sk(sk)); if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); } @@ -2162,7 +2162,7 @@ TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; TCP_ECN_send_synack(tcp_sk(sk), skb); } - TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(skb)->when = tp_time_stamp(tcp_sk(sk)); return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } @@ -2237,7 +2237,7 @@ /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); - TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(skb)->when = tp_time_stamp(tp); tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale, TCP_SKB_CB(skb)->when, @@ -2345,7 +2345,7 @@ TCP_ECN_send_syn(sk, buff); /* Send it off. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; + TCP_SKB_CB(buff)->when = tp_time_stamp(tp); tp->retrans_stamp = TCP_SKB_CB(buff)->when; skb_header_release(buff); __tcp_add_write_queue_tail(sk, buff); @@ -2450,7 +2450,7 @@ tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->when = tcp_time_stamp; + TCP_SKB_CB(buff)->when = tp_time_stamp(tcp_sk(sk)); tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } @@ -2482,7 +2482,7 @@ * send it. 
*/ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); - TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(skb)->when = tp_time_stamp(tp); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } @@ -2517,7 +2517,7 @@ tcp_set_skb_tso_segs(sk, skb, mss); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; + TCP_SKB_CB(skb)->when = tp_time_stamp(tp); err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) tcp_event_new_data_sent(sk, skb); @@ -2576,3 +2576,8 @@ EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); EXPORT_SYMBOL(tcp_mtup_init); + +EXPORT_SYMBOL(tcp_push_one); +EXPORT_SYMBOL(tcp_current_mss); +EXPORT_SYMBOL(tcp_retransmit_skb); +EXPORT_SYMBOL(tcp_send_probe0);