root / lab4 / .minix-src / include / netinet / tcp_vtw.h @ 14
History | View | Annotate | Download (12.4 KB)
1 |
/* $NetBSD: tcp_vtw.h,v 1.6 2012/11/23 14:48:31 joerg Exp $ */
|
---|---|
2 |
/*
|
3 |
* Copyright (c) 2011 The NetBSD Foundation, Inc.
|
4 |
* All rights reserved.
|
5 |
*
|
6 |
* This code is derived from software contributed to The NetBSD Foundation
|
7 |
* by Coyote Point Systems, Inc.
|
8 |
*
|
9 |
* Redistribution and use in source and binary forms, with or without
|
10 |
* modification, are permitted provided that the following conditions
|
11 |
* are met:
|
12 |
* 1. Redistributions of source code must retain the above copyright
|
13 |
* notice, this list of conditions and the following disclaimer.
|
14 |
* 2. Redistributions in binary form must reproduce the above copyright
|
15 |
* notice, this list of conditions and the following disclaimer in the
|
16 |
* documentation and/or other materials provided with the distribution.
|
17 |
*
|
18 |
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
|
19 |
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
20 |
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
21 |
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
|
22 |
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
23 |
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
24 |
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
25 |
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
26 |
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
27 |
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
28 |
* POSSIBILITY OF SUCH DAMAGE.
|
29 |
*/
|
30 |
|
31 |
/*
|
32 |
* Vestigial time-wait.
|
33 |
*
|
34 |
* This implementation uses cache-efficient techniques, which will
|
35 |
* appear somewhat peculiar. The main philosophy is to optimise the
|
36 |
* amount of information available within a cache line. Cache miss is
|
37 |
* expensive. So we employ ad-hoc techniques to pull a series of
|
38 |
* linked-list follows into a cache line. One cache line, multiple
|
39 |
* linked-list equivalents.
|
40 |
*
|
41 |
* One such ad-hoc technique is fat pointers. Additional degrees of
|
42 |
* ad-hoqueness result from having to hand tune it for pointer size
|
43 |
* and for cache line size.
|
44 |
*
|
45 |
* The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
|
46 |
* data structures into one cache line. The additional 32 bits in the
|
47 |
* cache line are used for linking fat pointers, and for
|
48 |
* allocation/bookkeeping.
|
49 |
*
|
50 |
* The 15 32-bit tags encode the pointers to the linked list elements,
|
51 |
* and also encode the results of a search comparison.
|
52 |
*
|
53 |
* First, some more assumptions/restrictions.
|
54 |
*
|
55 |
* All the fat pointers are from a contiguous allocation arena. Thus,
|
56 |
* we can refer to them by offset from a base, not as full pointers.
|
57 |
*
|
58 |
* All the linked list data elements are also from a contiguous
|
59 |
* allocation arena, again so that we can refer to them as offset from
|
60 |
* a base.
|
61 |
*
|
62 |
* In order to add a data element to a fat pointer, a key value is
|
63 |
* computed, based on unique data within the data element. It is the
|
64 |
* linear searching of the linked lists of these elements based on
|
65 |
* these unique data that are being optimised here.
|
66 |
*
|
67 |
* Lets call the function that computes the key k(e), where e is the
|
68 |
* data element. In this example, k(e) returns 32-bits.
|
69 |
*
|
70 |
* Consider a set E (say of order 15) of data elements. Let K be
|
71 |
* the set of the k(e) for e in E.
|
72 |
*
|
73 |
* Let O be the set of the offsets from the base of the data elements in E.
|
74 |
*
|
75 |
* For each x in K, for each matching o in O, let t be x ^ o. These
|
76 |
* are the tags. (More or less).
|
77 |
*
|
78 |
* In order to search all the data elements in E, we compute the
|
79 |
* search key, and one at a time, XOR the key into the tags. If any
|
80 |
* result is a valid data element index, we have a possible match. If
|
81 |
* not, there is no match.
|
82 |
*
|
83 |
* The no-match cases mean we do not have to de-reference the pointer
|
84 |
* to the data element in question. We save cache miss penalty and
|
85 |
* cache load decreases. Only in the case of a valid looking data
|
86 |
* element index, do we have to look closer.
|
87 |
*
|
88 |
* Thus, in the absence of false positives, 15 data elements can be
|
89 |
* searched with one cache line fill, as opposed to 15 cache line
|
90 |
* fills for the usual implementation.
|
91 |
*
|
92 |
* The vestigial time waits (vtw_t), the data elements in the above, are
|
93 |
* searched by faddr, fport, laddr, lport. The key is a function of
|
94 |
* these values.
|
95 |
*
|
96 |
* We hash these keys into the traditional hash chains to reduce the
|
97 |
* search time, and use fat pointers to reduce the cache impacts of
|
98 |
* searching.
|
99 |
*
|
100 |
* The vtw_t are, per requirement, in a contiguous chunk. Allocation
|
101 |
* is done with a clock hand, and all vtw_t within one allocation
|
102 |
* domain have the same lifetime, so they will always be sorted by
|
103 |
* age.
|
104 |
*
|
105 |
* A vtw_t will be allocated, timestamped, and have a fixed future
|
106 |
* expiration. It will be added to a hash bucket implemented with fat
|
107 |
* pointers, which means that a cache line will be allocated in the
|
108 |
* hash bucket, placed at the head (more recent in time) and the vtw_t
|
109 |
* will be added to this. As more entries are added, the fat pointer
|
110 |
* cache line will fill, requiring additional cache lines for fat
|
111 |
* pointers to be allocated. These will be added at the head, and the
|
112 |
* aged entries will hang down, tapeworm like. As the vtw_t entries
|
113 |
* expire, the corresponding slot in the fat pointer will be
|
114 |
* reclaimed, and eventually the cache line will completely empty and
|
115 |
* be re-cycled, if not at the head of the chain.
|
116 |
*
|
117 |
* At times, a time-wait timer is restarted. This corresponds to
|
118 |
* deleting the current entry and re-adding it.
|
119 |
*
|
120 |
* Most of the time, they are just placed here to die.
|
121 |
*/
|
122 |
#ifndef _NETINET_TCP_VTW_H
|
123 |
#define _NETINET_TCP_VTW_H
|
124 |
|
125 |
#include <sys/types.h> |
126 |
#include <sys/socket.h> |
127 |
#include <sys/sysctl.h> |
128 |
#include <net/if.h> |
129 |
#include <net/route.h> |
130 |
#include <netinet/in.h> |
131 |
#include <netinet/in_systm.h> |
132 |
#include <netinet/ip.h> |
133 |
#include <netinet/in_pcb.h> |
134 |
#include <netinet/in_var.h> |
135 |
#include <netinet/ip_var.h> |
136 |
#include <netinet/in.h> |
137 |
#include <netinet/tcp.h> |
138 |
#include <netinet/tcp_timer.h> |
139 |
#include <netinet/tcp_var.h> |
140 |
#include <netinet6/in6.h> |
141 |
#include <netinet/ip6.h> |
142 |
#include <netinet6/ip6_var.h> |
143 |
#include <netinet6/in6_pcb.h> |
144 |
#include <netinet6/ip6_var.h> |
145 |
#include <netinet6/in6_var.h> |
146 |
#include <netinet/icmp6.h> |
147 |
#include <netinet6/nd6.h> |
148 |
|
149 |
#define VTW_NCLASS (1+3) /* # different classes */ |
150 |
|
151 |
/*
|
152 |
* fat pointers, MI.
|
153 |
*/
|
154 |
struct fatp_mi;
|
155 |
|
156 |
typedef uint32_t fatp_word_t;
|
157 |
|
158 |
typedef struct fatp_mi fatp_t; |
159 |
|
160 |
/* Supported cacheline sizes: 32 64 128 bytes. See fatp_key(),
|
161 |
* fatp_slot_from_key(), fatp_xtra[].
|
162 |
*/
|
163 |
#define FATP_NTAGS (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1) |
164 |
#define FATP_NXT_WIDTH (sizeof(fatp_word_t) * NBBY - FATP_NTAGS) |
165 |
|
166 |
#define FATP_MAX (1 << FATP_NXT_WIDTH) |
167 |
|
168 |
/* Worked example: ULP32 with 64-byte cacheline (32-bit x86):
|
169 |
* 15 tags per cacheline. At most 2^17 fat pointers per fatp_ctl_t.
|
170 |
* The comments on the fatp_mi members, below, correspond to the worked
|
171 |
* example.
|
172 |
*/
|
173 |
struct fatp_mi {
|
174 |
fatp_word_t inuse : FATP_NTAGS; /* (1+15)*4 == CL_SIZE */
|
175 |
fatp_word_t nxt : FATP_NXT_WIDTH;/* at most 2^17 fat pointers */
|
176 |
fatp_word_t tag[FATP_NTAGS]; /* 15 tags per CL */
|
177 |
}; |
178 |
|
179 |
static inline int |
180 |
fatp_ntags(void)
|
181 |
{ |
182 |
return FATP_NTAGS;
|
183 |
} |
184 |
|
185 |
static inline int |
186 |
fatp_full(fatp_t *fp) |
187 |
{ |
188 |
fatp_t full; |
189 |
|
190 |
full.inuse = (1U << FATP_NTAGS) - 1U; |
191 |
|
192 |
return (fp->inuse == full.inuse);
|
193 |
} |
194 |
|
195 |
struct vtw_common;
|
196 |
struct vtw_v4;
|
197 |
struct vtw_v6;
|
198 |
struct vtw_ctl;
|
199 |
|
200 |
/*!\brief common to all vtw
|
201 |
*/
|
202 |
typedef struct vtw_common { |
203 |
struct timeval expire; /* date of birth+msl */ |
204 |
uint32_t key; /* hash key: full hash */
|
205 |
uint32_t port_key; /* hash key: local port hash */
|
206 |
uint32_t rcv_nxt; |
207 |
uint32_t rcv_wnd; |
208 |
uint32_t snd_nxt; |
209 |
uint32_t snd_scale : 8; /* window scaling for send win */ |
210 |
uint32_t msl_class : 2; /* TCP MSL class {0,1,2,3} */ |
211 |
uint32_t reuse_port : 1;
|
212 |
uint32_t reuse_addr : 1;
|
213 |
uint32_t v6only : 1;
|
214 |
uint32_t hashed : 1; /* reachable via FATP */ |
215 |
uint32_t uid; |
216 |
} vtw_t; |
217 |
|
218 |
/*!\brief vestigial timewait for IPv4
|
219 |
*/
|
220 |
typedef struct vtw_v4 { |
221 |
vtw_t common; /* must be first */
|
222 |
uint16_t lport; |
223 |
uint16_t fport; |
224 |
uint32_t laddr; |
225 |
uint32_t faddr; |
226 |
} vtw_v4_t; |
227 |
|
228 |
/*!\brief vestigial timewait for IPv6
|
229 |
*/
|
230 |
typedef struct vtw_v6 { |
231 |
vtw_t common; /* must be first */
|
232 |
uint16_t lport; |
233 |
uint16_t fport; |
234 |
struct in6_addr laddr;
|
235 |
struct in6_addr faddr;
|
236 |
} vtw_v6_t; |
237 |
|
238 |
struct fatp_ctl;
|
239 |
typedef struct vtw_ctl vtw_ctl_t; |
240 |
typedef struct fatp_ctl fatp_ctl_t; |
241 |
|
242 |
/*
|
243 |
* The vestigial time waits are kept in a contiguous chunk.
|
244 |
* Allocation and free pointers run as clock hands thru this array.
|
245 |
*/
|
246 |
struct vtw_ctl {
|
247 |
fatp_ctl_t *fat; /* collection of fatp to use */
|
248 |
vtw_ctl_t *ctl; /* <! controller's controller */
|
249 |
union {
|
250 |
vtw_t *v; /* common */
|
251 |
struct vtw_v4 *v4; /* IPv4 resources */ |
252 |
struct vtw_v6 *v6; /* IPv6 resources */ |
253 |
} base, /* base of vtw_t array */
|
254 |
/**/ lim, /* extent of vtw_t array */ |
255 |
/**/ alloc, /* allocation pointer */ |
256 |
/**/ oldest; /* ^ to oldest */ |
257 |
uint32_t nfree; /* # free */
|
258 |
uint32_t nalloc; /* # allocated */
|
259 |
uint32_t idx_mask; /* mask capturing all index bits*/
|
260 |
uint32_t is_v4 : 1;
|
261 |
uint32_t is_v6 : 1;
|
262 |
uint32_t idx_bits: 6;
|
263 |
uint32_t clidx : 3; /* <! class index */ |
264 |
}; |
265 |
|
266 |
/*!\brief Collections of fat pointers.
|
267 |
*/
|
268 |
struct fatp_ctl {
|
269 |
vtw_ctl_t *vtw; /* associated VTWs */
|
270 |
fatp_t *base; /* base of fatp_t array */
|
271 |
fatp_t *lim; /* extent of fatp_t array */
|
272 |
fatp_t *free; /* free list */
|
273 |
uint32_t mask; /* hash mask */
|
274 |
uint32_t nfree; /* # free */
|
275 |
uint32_t nalloc; /* # allocated */
|
276 |
fatp_t **hash; /* hash anchors */
|
277 |
fatp_t **port; /* port hash anchors */
|
278 |
}; |
279 |
|
280 |
/*!\brief stats
|
281 |
*/
|
282 |
struct vtw_stats {
|
283 |
uint64_t ins; /* <! inserts */
|
284 |
uint64_t del; /* <! deleted */
|
285 |
uint64_t kill; /* <! assassination */
|
286 |
uint64_t look[2]; /* <! lookup: full hash, port hash */ |
287 |
uint64_t hit[2]; /* <! lookups that hit */ |
288 |
uint64_t miss[2]; /* <! lookups that miss */ |
289 |
uint64_t probe[2]; /* <! hits+miss */ |
290 |
uint64_t losing[2]; /* <! misses requiring dereference */ |
291 |
uint64_t max_chain[2]; /* <! max fatp chain traversed */ |
292 |
uint64_t max_probe[2]; /* <! max probes in any one chain */ |
293 |
uint64_t max_loss[2]; /* <! max losing probes in any one |
294 |
* chain
|
295 |
*/
|
296 |
}; |
297 |
|
298 |
typedef struct vtw_stats vtw_stats_t; |
299 |
|
300 |
/*!\brief follow fatp next 'pointer'
|
301 |
*/
|
302 |
static inline fatp_t * |
303 |
fatp_next(fatp_ctl_t *fat, fatp_t *fp) |
304 |
{ |
305 |
return fp->nxt ? fat->base + fp->nxt-1 : 0; |
306 |
} |
307 |
|
308 |
/*!\brief determine a collection-relative fat pointer index.
|
309 |
*/
|
310 |
static inline uint32_t |
311 |
fatp_index(fatp_ctl_t *fat, fatp_t *fp) |
312 |
{ |
313 |
return fp ? 1 + (fp - fat->base) : 0; |
314 |
} |
315 |
|
316 |
|
317 |
static inline uint32_t |
318 |
v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport) |
319 |
{ |
320 |
return (ntohl(faddr) + ntohs(fport)
|
321 |
+ ntohl(laddr) + ntohs(lport)); |
322 |
} |
323 |
|
324 |
static inline uint32_t |
325 |
v6_tag(const struct in6_addr *faddr, uint16_t fport, |
326 |
const struct in6_addr *laddr, uint16_t lport) |
327 |
{ |
328 |
#ifdef IN6_HASH
|
329 |
return IN6_HASH(faddr, fport, laddr, lport);
|
330 |
#else
|
331 |
return 0; |
332 |
#endif
|
333 |
} |
334 |
|
335 |
static inline uint32_t |
336 |
v4_port_tag(uint16_t lport) |
337 |
{ |
338 |
uint32_t tag = lport ^ (lport << 11);
|
339 |
|
340 |
tag ^= tag << 3;
|
341 |
tag += tag >> 5;
|
342 |
tag ^= tag << 4;
|
343 |
tag += tag >> 17;
|
344 |
tag ^= tag << 25;
|
345 |
tag += tag >> 6;
|
346 |
|
347 |
return tag;
|
348 |
} |
349 |
|
350 |
static inline uint32_t |
351 |
v6_port_tag(uint16_t lport) |
352 |
{ |
353 |
return v4_port_tag(lport);
|
354 |
} |
355 |
|
356 |
struct tcpcb;
|
357 |
struct tcphdr;
|
358 |
|
359 |
int vtw_add(int, struct tcpcb *); |
360 |
void vtw_del(vtw_ctl_t *, vtw_t *);
|
361 |
int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th, |
362 |
uint32_t faddr, uint16_t fport, |
363 |
uint32_t laddr, uint16_t lport); |
364 |
struct ip6_hdr;
|
365 |
struct in6_addr;
|
366 |
|
367 |
int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th, |
368 |
const struct in6_addr *faddr, uint16_t fport, |
369 |
const struct in6_addr *laddr, uint16_t lport); |
370 |
|
371 |
typedef struct vestigial_inpcb { |
372 |
union {
|
373 |
struct in_addr v4;
|
374 |
struct in6_addr v6;
|
375 |
} faddr, laddr; |
376 |
uint16_t fport, lport; |
377 |
uint32_t valid : 1;
|
378 |
uint32_t v4 : 1;
|
379 |
uint32_t reuse_addr : 1;
|
380 |
uint32_t reuse_port : 1;
|
381 |
uint32_t v6only : 1;
|
382 |
uint32_t more_tbd : 1;
|
383 |
uint32_t uid; |
384 |
uint32_t rcv_nxt; |
385 |
uint32_t rcv_wnd; |
386 |
uint32_t snd_nxt; |
387 |
struct vtw_common *vtw;
|
388 |
struct vtw_ctl *ctl;
|
389 |
} vestigial_inpcb_t; |
390 |
|
391 |
#ifdef _KERNEL
|
392 |
void vtw_restart(vestigial_inpcb_t*);
|
393 |
int vtw_earlyinit(void); |
394 |
int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO);
|
395 |
#endif /* _KERNEL */ |
396 |
|
397 |
#ifdef VTW_DEBUG
|
398 |
typedef struct sin_either { |
399 |
uint8_t sin_len; |
400 |
uint8_t sin_family; |
401 |
uint16_t sin_port; |
402 |
union {
|
403 |
struct in_addr v4;
|
404 |
struct in6_addr v6;
|
405 |
} sin_addr; |
406 |
} sin_either_t; |
407 |
|
408 |
int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int); |
409 |
|
410 |
typedef struct vtw_sysargs { |
411 |
uint32_t op; |
412 |
sin_either_t fa; |
413 |
sin_either_t la; |
414 |
} vtw_sysargs_t; |
415 |
|
416 |
#endif /* VTW_DEBUG */ |
417 |
|
418 |
#endif /* _NETINET_TCP_VTW_H */ |