root / lab4 / .minix-src / include / netinet / tcp_vtw.h @ 13
History | View | Annotate | Download (12.4 KB)
1 | 13 | up20180614 | /* $NetBSD: tcp_vtw.h,v 1.6 2012/11/23 14:48:31 joerg Exp $ */
|
---|---|---|---|
2 | /*
|
||
3 | * Copyright (c) 2011 The NetBSD Foundation, Inc.
|
||
4 | * All rights reserved.
|
||
5 | *
|
||
6 | * This code is derived from software contributed to The NetBSD Foundation
|
||
7 | * by Coyote Point Systems, Inc.
|
||
8 | *
|
||
9 | * Redistribution and use in source and binary forms, with or without
|
||
10 | * modification, are permitted provided that the following conditions
|
||
11 | * are met:
|
||
12 | * 1. Redistributions of source code must retain the above copyright
|
||
13 | * notice, this list of conditions and the following disclaimer.
|
||
14 | * 2. Redistributions in binary form must reproduce the above copyright
|
||
15 | * notice, this list of conditions and the following disclaimer in the
|
||
16 | * documentation and/or other materials provided with the distribution.
|
||
17 | *
|
||
18 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
|
||
19 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||
20 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||
21 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
|
||
22 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||
23 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||
24 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||
25 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||
26 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||
27 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||
28 | * POSSIBILITY OF SUCH DAMAGE.
|
||
29 | */
|
||
30 | |||
31 | /*
|
||
32 | * Vestigial time-wait.
|
||
33 | *
|
||
34 | * This implementation uses cache-efficient techniques, which will
|
||
35 | * appear somewhat peculiar. The main philosophy is to optimise the
|
||
36 | * amount of information available within a cache line. Cache miss is
|
||
37 | * expensive. So we employ ad-hoc techniques to pull a series of
|
||
38 | * linked-list follows into a cache line. One cache line, multiple
|
||
39 | * linked-list equivalents.
|
||
40 | *
|
||
41 | * One such ad-hoc technique is fat pointers. Additional degrees of
|
||
42 | * ad-hoqueness result from having to hand tune it for pointer size
|
||
43 | * and for cache line size.
|
||
44 | *
|
||
45 | * The 'fat pointer' approach aggregates, for x86_32, 15 linked-list
|
||
46 | * data structures into one cache line. The additional 32 bits in the
|
||
47 | * cache line are used for linking fat pointers, and for
|
||
48 | * allocation/bookkeeping.
|
||
49 | *
|
||
50 | * The 15 32-bit tags encode the pointers to the linked list elements,
|
||
51 | * and also encode the results of a search comparison.
|
||
52 | *
|
||
53 | * First, some more assumptions/restrictions.
|
||
54 | *
|
||
55 | * All the fat pointers are from a contiguous allocation arena. Thus,
|
||
56 | * we can refer to them by offset from a base, not as full pointers.
|
||
57 | *
|
||
58 | * All the linked list data elements are also from a contiguous
|
||
59 | * allocation arena, again so that we can refer to them as offset from
|
||
60 | * a base.
|
||
61 | *
|
||
62 | * In order to add a data element to a fat pointer, a key value is
|
||
63 | * computed, based on unique data within the data element. It is the
|
||
64 | * linear searching of the linked lists of these elements based on
|
||
65 | * these unique data that are being optimised here.
|
||
66 | *
|
||
67 | * Lets call the function that computes the key k(e), where e is the
|
||
68 | * data element. In this example, k(e) returns 32-bits.
|
||
69 | *
|
||
70 | * Consider a set E (say of order 15) of data elements. Let K be
|
||
71 | * the set of the k(e) for e in E.
|
||
72 | *
|
||
73 | * Let O be the set of the offsets from the base of the data elements in E.
|
||
74 | *
|
||
75 | * For each x in K, for each matching o in O, let t be x ^ o. These
|
||
76 | * are the tags. (More or less).
|
||
77 | *
|
||
78 | * In order to search all the data elements in E, we compute the
|
||
79 | * search key, and one at a time, XOR the key into the tags. If any
|
||
80 | * result is a valid data element index, we have a possible match. If
|
||
81 | * not, there is no match.
|
||
82 | *
|
||
83 | * The no-match cases mean we do not have to de-reference the pointer
|
||
84 | * to the data element in question. We save cache miss penalty and
|
||
85 | * cache load decreases. Only in the case of a valid looking data
|
||
86 | * element index, do we have to look closer.
|
||
87 | *
|
||
88 | * Thus, in the absence of false positives, 15 data elements can be
|
||
89 | * searched with one cache line fill, as opposed to 15 cache line
|
||
90 | * fills for the usual implementation.
|
||
91 | *
|
||
92 | * The vestigial time waits (vtw_t), the data elements in the above, are
|
||
93 | * searched by faddr, fport, laddr, lport. The key is a function of
|
||
94 | * these values.
|
||
95 | *
|
||
96 | * We hash these keys into the traditional hash chains to reduce the
|
||
97 | * search time, and use fat pointers to reduce the cache impacts of
|
||
98 | * searching.
|
||
99 | *
|
||
100 | * The vtw_t are, per requirement, in a contiguous chunk. Allocation
|
||
101 | * is done with a clock hand, and all vtw_t within one allocation
|
||
102 | * domain have the same lifetime, so they will always be sorted by
|
||
103 | * age.
|
||
104 | *
|
||
105 | * A vtw_t will be allocated, timestamped, and have a fixed future
|
||
106 | * expiration. It will be added to a hash bucket implemented with fat
|
||
107 | * pointers, which means that a cache line will be allocated in the
|
||
108 | * hash bucket, placed at the head (more recent in time) and the vtw_t
|
||
109 | * will be added to this. As more entries are added, the fat pointer
|
||
110 | * cache line will fill, requiring additional cache lines for fat
|
||
111 | * pointers to be allocated. These will be added at the head, and the
|
||
112 | * aged entries will hang down, tapeworm like. As the vtw_t entries
|
||
113 | * expire, the corresponding slot in the fat pointer will be
|
||
114 | * reclaimed, and eventually the cache line will completely empty and
|
||
115 | * be re-cycled, if not at the head of the chain.
|
||
116 | *
|
||
117 | * At times, a time-wait timer is restarted. This corresponds to
|
||
118 | * deleting the current entry and re-adding it.
|
||
119 | *
|
||
120 | * Most of the time, they are just placed here to die.
|
||
121 | */
|
||
122 | #ifndef _NETINET_TCP_VTW_H
|
||
123 | #define _NETINET_TCP_VTW_H
|
||
124 | |||
125 | #include <sys/types.h> |
||
126 | #include <sys/socket.h> |
||
127 | #include <sys/sysctl.h> |
||
128 | #include <net/if.h> |
||
129 | #include <net/route.h> |
||
130 | #include <netinet/in.h> |
||
131 | #include <netinet/in_systm.h> |
||
132 | #include <netinet/ip.h> |
||
133 | #include <netinet/in_pcb.h> |
||
134 | #include <netinet/in_var.h> |
||
135 | #include <netinet/ip_var.h> |
||
136 | #include <netinet/in.h> |
||
137 | #include <netinet/tcp.h> |
||
138 | #include <netinet/tcp_timer.h> |
||
139 | #include <netinet/tcp_var.h> |
||
140 | #include <netinet6/in6.h> |
||
141 | #include <netinet/ip6.h> |
||
142 | #include <netinet6/ip6_var.h> |
||
143 | #include <netinet6/in6_pcb.h> |
||
144 | #include <netinet6/ip6_var.h> |
||
145 | #include <netinet6/in6_var.h> |
||
146 | #include <netinet/icmp6.h> |
||
147 | #include <netinet6/nd6.h> |
||
148 | |||
149 | #define VTW_NCLASS (1+3) /* # different classes */ |
||
150 | |||
151 | /*
|
||
152 | * fat pointers, MI.
|
||
153 | */
|
||
154 | struct fatp_mi;
|
||
155 | |||
156 | typedef uint32_t fatp_word_t;
|
||
157 | |||
158 | typedef struct fatp_mi fatp_t; |
||
159 | |||
160 | /* Supported cacheline sizes: 32 64 128 bytes. See fatp_key(),
|
||
161 | * fatp_slot_from_key(), fatp_xtra[].
|
||
162 | */
|
||
163 | #define FATP_NTAGS (CACHE_LINE_SIZE / sizeof(fatp_word_t) - 1) |
||
164 | #define FATP_NXT_WIDTH (sizeof(fatp_word_t) * NBBY - FATP_NTAGS) |
||
165 | |||
166 | #define FATP_MAX (1 << FATP_NXT_WIDTH) |
||
167 | |||
168 | /* Worked example: ULP32 with 64-byte cacheline (32-bit x86):
|
||
169 | * 15 tags per cacheline. At most 2^17 fat pointers per fatp_ctl_t.
|
||
170 | * The comments on the fatp_mi members, below, correspond to the worked
|
||
171 | * example.
|
||
172 | */
|
||
173 | struct fatp_mi {
|
||
174 | fatp_word_t inuse : FATP_NTAGS; /* (1+15)*4 == CL_SIZE */
|
||
175 | fatp_word_t nxt : FATP_NXT_WIDTH;/* at most 2^17 fat pointers */
|
||
176 | fatp_word_t tag[FATP_NTAGS]; /* 15 tags per CL */
|
||
177 | }; |
||
178 | |||
179 | static inline int |
||
180 | fatp_ntags(void)
|
||
181 | { |
||
182 | return FATP_NTAGS;
|
||
183 | } |
||
184 | |||
185 | static inline int |
||
186 | fatp_full(fatp_t *fp) |
||
187 | { |
||
188 | fatp_t full; |
||
189 | |||
190 | full.inuse = (1U << FATP_NTAGS) - 1U; |
||
191 | |||
192 | return (fp->inuse == full.inuse);
|
||
193 | } |
||
194 | |||
195 | struct vtw_common;
|
||
196 | struct vtw_v4;
|
||
197 | struct vtw_v6;
|
||
198 | struct vtw_ctl;
|
||
199 | |||
200 | /*!\brief common to all vtw
|
||
201 | */
|
||
202 | typedef struct vtw_common { |
||
203 | struct timeval expire; /* date of birth+msl */ |
||
204 | uint32_t key; /* hash key: full hash */
|
||
205 | uint32_t port_key; /* hash key: local port hash */
|
||
206 | uint32_t rcv_nxt; |
||
207 | uint32_t rcv_wnd; |
||
208 | uint32_t snd_nxt; |
||
209 | uint32_t snd_scale : 8; /* window scaling for send win */ |
||
210 | uint32_t msl_class : 2; /* TCP MSL class {0,1,2,3} */ |
||
211 | uint32_t reuse_port : 1;
|
||
212 | uint32_t reuse_addr : 1;
|
||
213 | uint32_t v6only : 1;
|
||
214 | uint32_t hashed : 1; /* reachable via FATP */ |
||
215 | uint32_t uid; |
||
216 | } vtw_t; |
||
217 | |||
218 | /*!\brief vestigial timewait for IPv4
|
||
219 | */
|
||
220 | typedef struct vtw_v4 { |
||
221 | vtw_t common; /* must be first */
|
||
222 | uint16_t lport; |
||
223 | uint16_t fport; |
||
224 | uint32_t laddr; |
||
225 | uint32_t faddr; |
||
226 | } vtw_v4_t; |
||
227 | |||
228 | /*!\brief vestigial timewait for IPv6
|
||
229 | */
|
||
230 | typedef struct vtw_v6 { |
||
231 | vtw_t common; /* must be first */
|
||
232 | uint16_t lport; |
||
233 | uint16_t fport; |
||
234 | struct in6_addr laddr;
|
||
235 | struct in6_addr faddr;
|
||
236 | } vtw_v6_t; |
||
237 | |||
238 | struct fatp_ctl;
|
||
239 | typedef struct vtw_ctl vtw_ctl_t; |
||
240 | typedef struct fatp_ctl fatp_ctl_t; |
||
241 | |||
242 | /*
|
||
243 | * The vestigial time waits are kept in a contiguous chunk.
|
||
244 | * Allocation and free pointers run as clock hands thru this array.
|
||
245 | */
|
||
246 | struct vtw_ctl {
|
||
247 | fatp_ctl_t *fat; /* collection of fatp to use */
|
||
248 | vtw_ctl_t *ctl; /* <! controller's controller */
|
||
249 | union {
|
||
250 | vtw_t *v; /* common */
|
||
251 | struct vtw_v4 *v4; /* IPv4 resources */ |
||
252 | struct vtw_v6 *v6; /* IPv6 resources */ |
||
253 | } base, /* base of vtw_t array */
|
||
254 | /**/ lim, /* extent of vtw_t array */ |
||
255 | /**/ alloc, /* allocation pointer */ |
||
256 | /**/ oldest; /* ^ to oldest */ |
||
257 | uint32_t nfree; /* # free */
|
||
258 | uint32_t nalloc; /* # allocated */
|
||
259 | uint32_t idx_mask; /* mask capturing all index bits*/
|
||
260 | uint32_t is_v4 : 1;
|
||
261 | uint32_t is_v6 : 1;
|
||
262 | uint32_t idx_bits: 6;
|
||
263 | uint32_t clidx : 3; /* <! class index */ |
||
264 | }; |
||
265 | |||
266 | /*!\brief Collections of fat pointers.
|
||
267 | */
|
||
268 | struct fatp_ctl {
|
||
269 | vtw_ctl_t *vtw; /* associated VTWs */
|
||
270 | fatp_t *base; /* base of fatp_t array */
|
||
271 | fatp_t *lim; /* extent of fatp_t array */
|
||
272 | fatp_t *free; /* free list */
|
||
273 | uint32_t mask; /* hash mask */
|
||
274 | uint32_t nfree; /* # free */
|
||
275 | uint32_t nalloc; /* # allocated */
|
||
276 | fatp_t **hash; /* hash anchors */
|
||
277 | fatp_t **port; /* port hash anchors */
|
||
278 | }; |
||
279 | |||
280 | /*!\brief stats
|
||
281 | */
|
||
282 | struct vtw_stats {
|
||
283 | uint64_t ins; /* <! inserts */
|
||
284 | uint64_t del; /* <! deleted */
|
||
285 | uint64_t kill; /* <! assassination */
|
||
286 | uint64_t look[2]; /* <! lookup: full hash, port hash */ |
||
287 | uint64_t hit[2]; /* <! lookups that hit */ |
||
288 | uint64_t miss[2]; /* <! lookups that miss */ |
||
289 | uint64_t probe[2]; /* <! hits+miss */ |
||
290 | uint64_t losing[2]; /* <! misses requiring dereference */ |
||
291 | uint64_t max_chain[2]; /* <! max fatp chain traversed */ |
||
292 | uint64_t max_probe[2]; /* <! max probes in any one chain */ |
||
293 | uint64_t max_loss[2]; /* <! max losing probes in any one |
||
294 | * chain
|
||
295 | */
|
||
296 | }; |
||
297 | |||
298 | typedef struct vtw_stats vtw_stats_t; |
||
299 | |||
300 | /*!\brief follow fatp next 'pointer'
|
||
301 | */
|
||
302 | static inline fatp_t * |
||
303 | fatp_next(fatp_ctl_t *fat, fatp_t *fp) |
||
304 | { |
||
305 | return fp->nxt ? fat->base + fp->nxt-1 : 0; |
||
306 | } |
||
307 | |||
308 | /*!\brief determine a collection-relative fat pointer index.
|
||
309 | */
|
||
310 | static inline uint32_t |
||
311 | fatp_index(fatp_ctl_t *fat, fatp_t *fp) |
||
312 | { |
||
313 | return fp ? 1 + (fp - fat->base) : 0; |
||
314 | } |
||
315 | |||
316 | |||
317 | static inline uint32_t |
||
318 | v4_tag(uint32_t faddr, uint32_t fport, uint32_t laddr, uint32_t lport) |
||
319 | { |
||
320 | return (ntohl(faddr) + ntohs(fport)
|
||
321 | + ntohl(laddr) + ntohs(lport)); |
||
322 | } |
||
323 | |||
324 | static inline uint32_t |
||
325 | v6_tag(const struct in6_addr *faddr, uint16_t fport, |
||
326 | const struct in6_addr *laddr, uint16_t lport) |
||
327 | { |
||
328 | #ifdef IN6_HASH
|
||
329 | return IN6_HASH(faddr, fport, laddr, lport);
|
||
330 | #else
|
||
331 | return 0; |
||
332 | #endif
|
||
333 | } |
||
334 | |||
335 | static inline uint32_t |
||
336 | v4_port_tag(uint16_t lport) |
||
337 | { |
||
338 | uint32_t tag = lport ^ (lport << 11);
|
||
339 | |||
340 | tag ^= tag << 3;
|
||
341 | tag += tag >> 5;
|
||
342 | tag ^= tag << 4;
|
||
343 | tag += tag >> 17;
|
||
344 | tag ^= tag << 25;
|
||
345 | tag += tag >> 6;
|
||
346 | |||
347 | return tag;
|
||
348 | } |
||
349 | |||
350 | static inline uint32_t |
||
351 | v6_port_tag(uint16_t lport) |
||
352 | { |
||
353 | return v4_port_tag(lport);
|
||
354 | } |
||
355 | |||
356 | struct tcpcb;
|
||
357 | struct tcphdr;
|
||
358 | |||
359 | int vtw_add(int, struct tcpcb *); |
||
360 | void vtw_del(vtw_ctl_t *, vtw_t *);
|
||
361 | int vtw_lookup_v4(const struct ip *ip, const struct tcphdr *th, |
||
362 | uint32_t faddr, uint16_t fport, |
||
363 | uint32_t laddr, uint16_t lport); |
||
364 | struct ip6_hdr;
|
||
365 | struct in6_addr;
|
||
366 | |||
367 | int vtw_lookup_v6(const struct ip6_hdr *ip, const struct tcphdr *th, |
||
368 | const struct in6_addr *faddr, uint16_t fport, |
||
369 | const struct in6_addr *laddr, uint16_t lport); |
||
370 | |||
371 | typedef struct vestigial_inpcb { |
||
372 | union {
|
||
373 | struct in_addr v4;
|
||
374 | struct in6_addr v6;
|
||
375 | } faddr, laddr; |
||
376 | uint16_t fport, lport; |
||
377 | uint32_t valid : 1;
|
||
378 | uint32_t v4 : 1;
|
||
379 | uint32_t reuse_addr : 1;
|
||
380 | uint32_t reuse_port : 1;
|
||
381 | uint32_t v6only : 1;
|
||
382 | uint32_t more_tbd : 1;
|
||
383 | uint32_t uid; |
||
384 | uint32_t rcv_nxt; |
||
385 | uint32_t rcv_wnd; |
||
386 | uint32_t snd_nxt; |
||
387 | struct vtw_common *vtw;
|
||
388 | struct vtw_ctl *ctl;
|
||
389 | } vestigial_inpcb_t; |
||
390 | |||
391 | #ifdef _KERNEL
|
||
392 | void vtw_restart(vestigial_inpcb_t*);
|
||
393 | int vtw_earlyinit(void); |
||
394 | int sysctl_tcp_vtw_enable(SYSCTLFN_PROTO);
|
||
395 | #endif /* _KERNEL */ |
||
396 | |||
397 | #ifdef VTW_DEBUG
|
||
398 | typedef struct sin_either { |
||
399 | uint8_t sin_len; |
||
400 | uint8_t sin_family; |
||
401 | uint16_t sin_port; |
||
402 | union {
|
||
403 | struct in_addr v4;
|
||
404 | struct in6_addr v6;
|
||
405 | } sin_addr; |
||
406 | } sin_either_t; |
||
407 | |||
408 | int vtw_debug_add(int af, sin_either_t *, sin_either_t *, int, int); |
||
409 | |||
410 | typedef struct vtw_sysargs { |
||
411 | uint32_t op; |
||
412 | sin_either_t fa; |
||
413 | sin_either_t la; |
||
414 | } vtw_sysargs_t; |
||
415 | |||
416 | #endif /* VTW_DEBUG */ |
||
417 | |||
418 | #endif /* _NETINET_TCP_VTW_H */ |