From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.4 commit in: /
Date: Fri, 08 Feb 2019 15:21:21
Message-Id: 1549639249.32d8aab8c0070a58fbb2a4f1d9cda28915ec17c2.mpagano@gentoo
1 commit: 32d8aab8c0070a58fbb2a4f1d9cda28915ec17c2
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Fri Feb 8 15:20:49 2019 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Fri Feb 8 15:20:49 2019 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=32d8aab8
7
8 proj/linux-patches: Linux patch 4.4.174
9
10 Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>
11
12 0000_README | 4 +
13 1173_linux-4.4.174.patch | 3075 ++++++++++++++++++++++++++++++++++++++++++++++
14 2 files changed, 3079 insertions(+)
15
16 diff --git a/0000_README b/0000_README
17 index b00cafe..e836b73 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -735,6 +735,10 @@ Patch: 1172_linux-4.4.173.patch
21 From: http://www.kernel.org
22 Desc: Linux 4.4.173
23
24 +Patch: 1173_linux-4.4.174.patch
25 +From: http://www.kernel.org
26 +Desc: Linux 4.4.174
27 +
28 Patch: 1500_XATTR_USER_PREFIX.patch
29 From: https://bugs.gentoo.org/show_bug.cgi?id=470644
30 Desc: Support for namespace user.pax.* on tmpfs.
31
32 diff --git a/1173_linux-4.4.174.patch b/1173_linux-4.4.174.patch
33 new file mode 100644
34 index 0000000..3060cab
35 --- /dev/null
36 +++ b/1173_linux-4.4.174.patch
37 @@ -0,0 +1,3075 @@
38 +diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
39 +index 2ea4c45cf1c8..7c229f59016f 100644
40 +--- a/Documentation/networking/ip-sysctl.txt
41 ++++ b/Documentation/networking/ip-sysctl.txt
42 +@@ -112,14 +112,11 @@ min_adv_mss - INTEGER
43 +
44 + IP Fragmentation:
45 +
46 +-ipfrag_high_thresh - INTEGER
47 +- Maximum memory used to reassemble IP fragments. When
48 +- ipfrag_high_thresh bytes of memory is allocated for this purpose,
49 +- the fragment handler will toss packets until ipfrag_low_thresh
50 +- is reached. This also serves as a maximum limit to namespaces
51 +- different from the initial one.
52 +-
53 +-ipfrag_low_thresh - INTEGER
54 ++ipfrag_high_thresh - LONG INTEGER
55 ++ Maximum memory used to reassemble IP fragments.
56 ++
57 ++ipfrag_low_thresh - LONG INTEGER
58 ++ (Obsolete since linux-4.17)
59 + Maximum memory used to reassemble IP fragments before the kernel
60 + begins to remove incomplete fragment queues to free up resources.
61 + The kernel still accepts new fragments for defragmentation.
62 +diff --git a/Makefile b/Makefile
63 +index db7665e32da8..1fa281069379 100644
64 +--- a/Makefile
65 ++++ b/Makefile
66 +@@ -1,6 +1,6 @@
67 + VERSION = 4
68 + PATCHLEVEL = 4
69 +-SUBLEVEL = 173
70 ++SUBLEVEL = 174
71 + EXTRAVERSION =
72 + NAME = Blurry Fish Butt
73 +
74 +diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
75 +index e50b31d18462..e97cdfd6cba9 100644
76 +--- a/include/linux/rhashtable.h
77 ++++ b/include/linux/rhashtable.h
78 +@@ -133,23 +133,23 @@ struct rhashtable_params {
79 + /**
80 + * struct rhashtable - Hash table handle
81 + * @tbl: Bucket table
82 +- * @nelems: Number of elements in table
83 + * @key_len: Key length for hashfn
84 + * @elasticity: Maximum chain length before rehash
85 + * @p: Configuration parameters
86 + * @run_work: Deferred worker to expand/shrink asynchronously
87 + * @mutex: Mutex to protect current/future table swapping
88 + * @lock: Spin lock to protect walker list
89 ++ * @nelems: Number of elements in table
90 + */
91 + struct rhashtable {
92 + struct bucket_table __rcu *tbl;
93 +- atomic_t nelems;
94 + unsigned int key_len;
95 + unsigned int elasticity;
96 + struct rhashtable_params p;
97 + struct work_struct run_work;
98 + struct mutex mutex;
99 + spinlock_t lock;
100 ++ atomic_t nelems;
101 + };
102 +
103 + /**
104 +@@ -343,7 +343,8 @@ int rhashtable_init(struct rhashtable *ht,
105 + struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
106 + const void *key,
107 + struct rhash_head *obj,
108 +- struct bucket_table *old_tbl);
109 ++ struct bucket_table *old_tbl,
110 ++ void **data);
111 + int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl);
112 +
113 + int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter);
114 +@@ -514,18 +515,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg,
115 + return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len);
116 + }
117 +
118 +-/**
119 +- * rhashtable_lookup_fast - search hash table, inlined version
120 +- * @ht: hash table
121 +- * @key: the pointer to the key
122 +- * @params: hash table parameters
123 +- *
124 +- * Computes the hash value for the key and traverses the bucket chain looking
125 +- * for a entry with an identical key. The first matching entry is returned.
126 +- *
127 +- * Returns the first entry on which the compare function returned true.
128 +- */
129 +-static inline void *rhashtable_lookup_fast(
130 ++/* Internal function, do not use. */
131 ++static inline struct rhash_head *__rhashtable_lookup(
132 + struct rhashtable *ht, const void *key,
133 + const struct rhashtable_params params)
134 + {
135 +@@ -537,8 +528,6 @@ static inline void *rhashtable_lookup_fast(
136 + struct rhash_head *he;
137 + unsigned int hash;
138 +
139 +- rcu_read_lock();
140 +-
141 + tbl = rht_dereference_rcu(ht->tbl, ht);
142 + restart:
143 + hash = rht_key_hashfn(ht, tbl, key, params);
144 +@@ -547,8 +536,7 @@ restart:
145 + params.obj_cmpfn(&arg, rht_obj(ht, he)) :
146 + rhashtable_compare(&arg, rht_obj(ht, he)))
147 + continue;
148 +- rcu_read_unlock();
149 +- return rht_obj(ht, he);
150 ++ return he;
151 + }
152 +
153 + /* Ensure we see any new tables. */
154 +@@ -557,13 +545,64 @@ restart:
155 + tbl = rht_dereference_rcu(tbl->future_tbl, ht);
156 + if (unlikely(tbl))
157 + goto restart;
158 +- rcu_read_unlock();
159 +
160 + return NULL;
161 + }
162 +
163 +-/* Internal function, please use rhashtable_insert_fast() instead */
164 +-static inline int __rhashtable_insert_fast(
165 ++/**
166 ++ * rhashtable_lookup - search hash table
167 ++ * @ht: hash table
168 ++ * @key: the pointer to the key
169 ++ * @params: hash table parameters
170 ++ *
171 ++ * Computes the hash value for the key and traverses the bucket chain looking
172 ++ * for a entry with an identical key. The first matching entry is returned.
173 ++ *
174 ++ * This must only be called under the RCU read lock.
175 ++ *
176 ++ * Returns the first entry on which the compare function returned true.
177 ++ */
178 ++static inline void *rhashtable_lookup(
179 ++ struct rhashtable *ht, const void *key,
180 ++ const struct rhashtable_params params)
181 ++{
182 ++ struct rhash_head *he = __rhashtable_lookup(ht, key, params);
183 ++
184 ++ return he ? rht_obj(ht, he) : NULL;
185 ++}
186 ++
187 ++/**
188 ++ * rhashtable_lookup_fast - search hash table, without RCU read lock
189 ++ * @ht: hash table
190 ++ * @key: the pointer to the key
191 ++ * @params: hash table parameters
192 ++ *
193 ++ * Computes the hash value for the key and traverses the bucket chain looking
194 ++ * for a entry with an identical key. The first matching entry is returned.
195 ++ *
196 ++ * Only use this function when you have other mechanisms guaranteeing
197 ++ * that the object won't go away after the RCU read lock is released.
198 ++ *
199 ++ * Returns the first entry on which the compare function returned true.
200 ++ */
201 ++static inline void *rhashtable_lookup_fast(
202 ++ struct rhashtable *ht, const void *key,
203 ++ const struct rhashtable_params params)
204 ++{
205 ++ void *obj;
206 ++
207 ++ rcu_read_lock();
208 ++ obj = rhashtable_lookup(ht, key, params);
209 ++ rcu_read_unlock();
210 ++
211 ++ return obj;
212 ++}
213 ++
214 ++/* Internal function, please use rhashtable_insert_fast() instead. This
215 ++ * function returns the existing element already in hashes in there is a clash,
216 ++ * otherwise it returns an error via ERR_PTR().
217 ++ */
218 ++static inline void *__rhashtable_insert_fast(
219 + struct rhashtable *ht, const void *key, struct rhash_head *obj,
220 + const struct rhashtable_params params)
221 + {
222 +@@ -576,6 +615,7 @@ static inline int __rhashtable_insert_fast(
223 + spinlock_t *lock;
224 + unsigned int elasticity;
225 + unsigned int hash;
226 ++ void *data = NULL;
227 + int err;
228 +
229 + restart:
230 +@@ -600,11 +640,14 @@ restart:
231 +
232 + new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
233 + if (unlikely(new_tbl)) {
234 +- tbl = rhashtable_insert_slow(ht, key, obj, new_tbl);
235 ++ tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data);
236 + if (!IS_ERR_OR_NULL(tbl))
237 + goto slow_path;
238 +
239 + err = PTR_ERR(tbl);
240 ++ if (err == -EEXIST)
241 ++ err = 0;
242 ++
243 + goto out;
244 + }
245 +
246 +@@ -618,25 +661,25 @@ slow_path:
247 + err = rhashtable_insert_rehash(ht, tbl);
248 + rcu_read_unlock();
249 + if (err)
250 +- return err;
251 ++ return ERR_PTR(err);
252 +
253 + goto restart;
254 + }
255 +
256 +- err = -EEXIST;
257 ++ err = 0;
258 + elasticity = ht->elasticity;
259 + rht_for_each(head, tbl, hash) {
260 + if (key &&
261 + unlikely(!(params.obj_cmpfn ?
262 + params.obj_cmpfn(&arg, rht_obj(ht, head)) :
263 +- rhashtable_compare(&arg, rht_obj(ht, head)))))
264 ++ rhashtable_compare(&arg, rht_obj(ht, head))))) {
265 ++ data = rht_obj(ht, head);
266 + goto out;
267 ++ }
268 + if (!--elasticity)
269 + goto slow_path;
270 + }
271 +
272 +- err = 0;
273 +-
274 + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
275 +
276 + RCU_INIT_POINTER(obj->next, head);
277 +@@ -651,7 +694,7 @@ out:
278 + spin_unlock_bh(lock);
279 + rcu_read_unlock();
280 +
281 +- return err;
282 ++ return err ? ERR_PTR(err) : data;
283 + }
284 +
285 + /**
286 +@@ -674,7 +717,13 @@ static inline int rhashtable_insert_fast(
287 + struct rhashtable *ht, struct rhash_head *obj,
288 + const struct rhashtable_params params)
289 + {
290 +- return __rhashtable_insert_fast(ht, NULL, obj, params);
291 ++ void *ret;
292 ++
293 ++ ret = __rhashtable_insert_fast(ht, NULL, obj, params);
294 ++ if (IS_ERR(ret))
295 ++ return PTR_ERR(ret);
296 ++
297 ++ return ret == NULL ? 0 : -EEXIST;
298 + }
299 +
300 + /**
301 +@@ -703,11 +752,15 @@ static inline int rhashtable_lookup_insert_fast(
302 + const struct rhashtable_params params)
303 + {
304 + const char *key = rht_obj(ht, obj);
305 ++ void *ret;
306 +
307 + BUG_ON(ht->p.obj_hashfn);
308 +
309 +- return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj,
310 +- params);
311 ++ ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params);
312 ++ if (IS_ERR(ret))
313 ++ return PTR_ERR(ret);
314 ++
315 ++ return ret == NULL ? 0 : -EEXIST;
316 + }
317 +
318 + /**
319 +@@ -735,6 +788,32 @@ static inline int rhashtable_lookup_insert_fast(
320 + static inline int rhashtable_lookup_insert_key(
321 + struct rhashtable *ht, const void *key, struct rhash_head *obj,
322 + const struct rhashtable_params params)
323 ++{
324 ++ void *ret;
325 ++
326 ++ BUG_ON(!ht->p.obj_hashfn || !key);
327 ++
328 ++ ret = __rhashtable_insert_fast(ht, key, obj, params);
329 ++ if (IS_ERR(ret))
330 ++ return PTR_ERR(ret);
331 ++
332 ++ return ret == NULL ? 0 : -EEXIST;
333 ++}
334 ++
335 ++/**
336 ++ * rhashtable_lookup_get_insert_key - lookup and insert object into hash table
337 ++ * @ht: hash table
338 ++ * @obj: pointer to hash head inside object
339 ++ * @params: hash table parameters
340 ++ * @data: pointer to element data already in hashes
341 ++ *
342 ++ * Just like rhashtable_lookup_insert_key(), but this function returns the
343 ++ * object if it exists, NULL if it does not and the insertion was successful,
344 ++ * and an ERR_PTR otherwise.
345 ++ */
346 ++static inline void *rhashtable_lookup_get_insert_key(
347 ++ struct rhashtable *ht, const void *key, struct rhash_head *obj,
348 ++ const struct rhashtable_params params)
349 + {
350 + BUG_ON(!ht->p.obj_hashfn || !key);
351 +
352 +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
353 +index 6d39d81d3c38..502787c29ce9 100644
354 +--- a/include/linux/skbuff.h
355 ++++ b/include/linux/skbuff.h
356 +@@ -556,9 +556,14 @@ struct sk_buff {
357 + struct skb_mstamp skb_mstamp;
358 + };
359 + };
360 +- struct rb_node rbnode; /* used in netem & tcp stack */
361 ++ struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
362 + };
363 +- struct sock *sk;
364 ++
365 ++ union {
366 ++ struct sock *sk;
367 ++ int ip_defrag_offset;
368 ++ };
369 ++
370 + struct net_device *dev;
371 +
372 + /*
373 +@@ -2273,7 +2278,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
374 + kfree_skb(skb);
375 + }
376 +
377 +-void skb_rbtree_purge(struct rb_root *root);
378 ++unsigned int skb_rbtree_purge(struct rb_root *root);
379 +
380 + void *netdev_alloc_frag(unsigned int fragsz);
381 +
382 +@@ -2791,6 +2796,7 @@ static inline unsigned char *skb_push_rcsum(struct sk_buff *skb,
383 + return skb->data;
384 + }
385 +
386 ++int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
387 + /**
388 + * pskb_trim_rcsum - trim received skb and update checksum
389 + * @skb: buffer to trim
390 +@@ -2805,9 +2811,7 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
391 + {
392 + if (likely(len >= skb->len))
393 + return 0;
394 +- if (skb->ip_summed == CHECKSUM_COMPLETE)
395 +- skb->ip_summed = CHECKSUM_NONE;
396 +- return __pskb_trim(skb, len);
397 ++ return pskb_trim_rcsum_slow(skb, len);
398 + }
399 +
400 + #define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
401 +diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
402 +index c26a6e4dc306..6260ec146142 100644
403 +--- a/include/net/inet_frag.h
404 ++++ b/include/net/inet_frag.h
405 +@@ -1,13 +1,19 @@
406 + #ifndef __NET_FRAG_H__
407 + #define __NET_FRAG_H__
408 +
409 ++#include <linux/rhashtable.h>
410 ++
411 + struct netns_frags {
412 +- /* Keep atomic mem on separate cachelines in structs that include it */
413 +- atomic_t mem ____cacheline_aligned_in_smp;
414 + /* sysctls */
415 ++ long high_thresh;
416 ++ long low_thresh;
417 + int timeout;
418 +- int high_thresh;
419 +- int low_thresh;
420 ++ struct inet_frags *f;
421 ++
422 ++ struct rhashtable rhashtable ____cacheline_aligned_in_smp;
423 ++
424 ++ /* Keep atomic mem on separate cachelines in structs that include it */
425 ++ atomic_long_t mem ____cacheline_aligned_in_smp;
426 + };
427 +
428 + /**
429 +@@ -23,74 +29,68 @@ enum {
430 + INET_FRAG_COMPLETE = BIT(2),
431 + };
432 +
433 ++struct frag_v4_compare_key {
434 ++ __be32 saddr;
435 ++ __be32 daddr;
436 ++ u32 user;
437 ++ u32 vif;
438 ++ __be16 id;
439 ++ u16 protocol;
440 ++};
441 ++
442 ++struct frag_v6_compare_key {
443 ++ struct in6_addr saddr;
444 ++ struct in6_addr daddr;
445 ++ u32 user;
446 ++ __be32 id;
447 ++ u32 iif;
448 ++};
449 ++
450 + /**
451 + * struct inet_frag_queue - fragment queue
452 + *
453 +- * @lock: spinlock protecting the queue
454 ++ * @node: rhash node
455 ++ * @key: keys identifying this frag.
456 + * @timer: queue expiration timer
457 +- * @list: hash bucket list
458 ++ * @lock: spinlock protecting this frag
459 + * @refcnt: reference count of the queue
460 + * @fragments: received fragments head
461 ++ * @rb_fragments: received fragments rb-tree root
462 + * @fragments_tail: received fragments tail
463 ++ * @last_run_head: the head of the last "run". see ip_fragment.c
464 + * @stamp: timestamp of the last received fragment
465 + * @len: total length of the original datagram
466 + * @meat: length of received fragments so far
467 + * @flags: fragment queue flags
468 + * @max_size: maximum received fragment size
469 + * @net: namespace that this frag belongs to
470 +- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
471 ++ * @rcu: rcu head for freeing deferall
472 + */
473 + struct inet_frag_queue {
474 +- spinlock_t lock;
475 ++ struct rhash_head node;
476 ++ union {
477 ++ struct frag_v4_compare_key v4;
478 ++ struct frag_v6_compare_key v6;
479 ++ } key;
480 + struct timer_list timer;
481 +- struct hlist_node list;
482 ++ spinlock_t lock;
483 + atomic_t refcnt;
484 +- struct sk_buff *fragments;
485 ++ struct sk_buff *fragments; /* Used in IPv6. */
486 ++ struct rb_root rb_fragments; /* Used in IPv4. */
487 + struct sk_buff *fragments_tail;
488 ++ struct sk_buff *last_run_head;
489 + ktime_t stamp;
490 + int len;
491 + int meat;
492 + __u8 flags;
493 + u16 max_size;
494 +- struct netns_frags *net;
495 +- struct hlist_node list_evictor;
496 +-};
497 +-
498 +-#define INETFRAGS_HASHSZ 1024
499 +-
500 +-/* averaged:
501 +- * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
502 +- * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
503 +- * struct frag_queue))
504 +- */
505 +-#define INETFRAGS_MAXDEPTH 128
506 +-
507 +-struct inet_frag_bucket {
508 +- struct hlist_head chain;
509 +- spinlock_t chain_lock;
510 ++ struct netns_frags *net;
511 ++ struct rcu_head rcu;
512 + };
513 +
514 + struct inet_frags {
515 +- struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
516 +-
517 +- struct work_struct frags_work;
518 +- unsigned int next_bucket;
519 +- unsigned long last_rebuild_jiffies;
520 +- bool rebuild;
521 +-
522 +- /* The first call to hashfn is responsible to initialize
523 +- * rnd. This is best done with net_get_random_once.
524 +- *
525 +- * rnd_seqlock is used to let hash insertion detect
526 +- * when it needs to re-lookup the hash chain to use.
527 +- */
528 +- u32 rnd;
529 +- seqlock_t rnd_seqlock;
530 + int qsize;
531 +
532 +- unsigned int (*hashfn)(const struct inet_frag_queue *);
533 +- bool (*match)(const struct inet_frag_queue *q,
534 +- const void *arg);
535 + void (*constructor)(struct inet_frag_queue *q,
536 + const void *arg);
537 + void (*destructor)(struct inet_frag_queue *);
538 +@@ -98,56 +98,47 @@ struct inet_frags {
539 + void (*frag_expire)(unsigned long data);
540 + struct kmem_cache *frags_cachep;
541 + const char *frags_cache_name;
542 ++ struct rhashtable_params rhash_params;
543 + };
544 +
545 + int inet_frags_init(struct inet_frags *);
546 + void inet_frags_fini(struct inet_frags *);
547 +
548 +-static inline void inet_frags_init_net(struct netns_frags *nf)
549 ++static inline int inet_frags_init_net(struct netns_frags *nf)
550 + {
551 +- atomic_set(&nf->mem, 0);
552 ++ atomic_long_set(&nf->mem, 0);
553 ++ return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
554 + }
555 +-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
556 ++void inet_frags_exit_net(struct netns_frags *nf);
557 +
558 +-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
559 +-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
560 +-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
561 +- struct inet_frags *f, void *key, unsigned int hash);
562 ++void inet_frag_kill(struct inet_frag_queue *q);
563 ++void inet_frag_destroy(struct inet_frag_queue *q);
564 ++struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
565 +
566 +-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
567 +- const char *prefix);
568 ++/* Free all skbs in the queue; return the sum of their truesizes. */
569 ++unsigned int inet_frag_rbtree_purge(struct rb_root *root);
570 +
571 +-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
572 ++static inline void inet_frag_put(struct inet_frag_queue *q)
573 + {
574 + if (atomic_dec_and_test(&q->refcnt))
575 +- inet_frag_destroy(q, f);
576 +-}
577 +-
578 +-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
579 +-{
580 +- return !hlist_unhashed(&q->list_evictor);
581 ++ inet_frag_destroy(q);
582 + }
583 +
584 + /* Memory Tracking Functions. */
585 +
586 +-static inline int frag_mem_limit(struct netns_frags *nf)
587 +-{
588 +- return atomic_read(&nf->mem);
589 +-}
590 +-
591 +-static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
592 ++static inline long frag_mem_limit(const struct netns_frags *nf)
593 + {
594 +- atomic_sub(i, &nf->mem);
595 ++ return atomic_long_read(&nf->mem);
596 + }
597 +
598 +-static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
599 ++static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
600 + {
601 +- atomic_add(i, &nf->mem);
602 ++ atomic_long_sub(val, &nf->mem);
603 + }
604 +
605 +-static inline int sum_frag_mem_limit(struct netns_frags *nf)
606 ++static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
607 + {
608 +- return atomic_read(&nf->mem);
609 ++ atomic_long_add(val, &nf->mem);
610 + }
611 +
612 + /* RFC 3168 support :
613 +diff --git a/include/net/ip.h b/include/net/ip.h
614 +index 0530bcdbc212..7b968927477d 100644
615 +--- a/include/net/ip.h
616 ++++ b/include/net/ip.h
617 +@@ -524,7 +524,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s
618 + return skb;
619 + }
620 + #endif
621 +-int ip_frag_mem(struct net *net);
622 +
623 + /*
624 + * Functions provided by ip_forward.c
625 +diff --git a/include/net/ipv6.h b/include/net/ipv6.h
626 +index 0e01d570fa22..c07cf9596b6f 100644
627 +--- a/include/net/ipv6.h
628 ++++ b/include/net/ipv6.h
629 +@@ -320,13 +320,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
630 + idev->cnf.accept_ra;
631 + }
632 +
633 +-#if IS_ENABLED(CONFIG_IPV6)
634 +-static inline int ip6_frag_mem(struct net *net)
635 +-{
636 +- return sum_frag_mem_limit(&net->ipv6.frags);
637 +-}
638 +-#endif
639 +-
640 + #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */
641 + #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */
642 + #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */
643 +@@ -505,17 +498,8 @@ enum ip6_defrag_users {
644 + __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
645 + };
646 +
647 +-struct ip6_create_arg {
648 +- __be32 id;
649 +- u32 user;
650 +- const struct in6_addr *src;
651 +- const struct in6_addr *dst;
652 +- int iif;
653 +- u8 ecn;
654 +-};
655 +-
656 + void ip6_frag_init(struct inet_frag_queue *q, const void *a);
657 +-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
658 ++extern const struct rhashtable_params ip6_rhash_params;
659 +
660 + /*
661 + * Equivalent of ipv4 struct ip
662 +@@ -523,19 +507,13 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
663 + struct frag_queue {
664 + struct inet_frag_queue q;
665 +
666 +- __be32 id; /* fragment id */
667 +- u32 user;
668 +- struct in6_addr saddr;
669 +- struct in6_addr daddr;
670 +-
671 + int iif;
672 + unsigned int csum;
673 + __u16 nhoffset;
674 + u8 ecn;
675 + };
676 +
677 +-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
678 +- struct inet_frags *frags);
679 ++void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
680 +
681 + static inline bool ipv6_addr_any(const struct in6_addr *a)
682 + {
683 +diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
684 +index 25a9ad8bcef1..9de808ebce05 100644
685 +--- a/include/uapi/linux/snmp.h
686 ++++ b/include/uapi/linux/snmp.h
687 +@@ -55,6 +55,7 @@ enum
688 + IPSTATS_MIB_ECT1PKTS, /* InECT1Pkts */
689 + IPSTATS_MIB_ECT0PKTS, /* InECT0Pkts */
690 + IPSTATS_MIB_CEPKTS, /* InCEPkts */
691 ++ IPSTATS_MIB_REASM_OVERLAPS, /* ReasmOverlaps */
692 + __IPSTATS_MIB_MAX
693 + };
694 +
695 +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
696 +index 8a62cbfe1f2f..4e886ccd40db 100644
697 +--- a/kernel/rcu/tree.c
698 ++++ b/kernel/rcu/tree.c
699 +@@ -3817,7 +3817,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
700 + continue;
701 + rdp = per_cpu_ptr(rsp->rda, cpu);
702 + pr_cont(" %d-%c%c%c", cpu,
703 +- "O."[cpu_online(cpu)],
704 ++ "O."[!!cpu_online(cpu)],
705 + "o."[!!(rdp->grpmask & rnp->expmaskinit)],
706 + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
707 + }
708 +diff --git a/lib/rhashtable.c b/lib/rhashtable.c
709 +index 37ea94b636a3..7bb8649429bf 100644
710 +--- a/lib/rhashtable.c
711 ++++ b/lib/rhashtable.c
712 +@@ -250,8 +250,10 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
713 + if (!new_tbl)
714 + return 0;
715 +
716 +- for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
717 ++ for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
718 + rhashtable_rehash_chain(ht, old_hash);
719 ++ cond_resched();
720 ++ }
721 +
722 + /* Publish the new table pointer. */
723 + rcu_assign_pointer(ht->tbl, new_tbl);
724 +@@ -441,7 +443,8 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
725 + struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
726 + const void *key,
727 + struct rhash_head *obj,
728 +- struct bucket_table *tbl)
729 ++ struct bucket_table *tbl,
730 ++ void **data)
731 + {
732 + struct rhash_head *head;
733 + unsigned int hash;
734 +@@ -452,8 +455,11 @@ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
735 + spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
736 +
737 + err = -EEXIST;
738 +- if (key && rhashtable_lookup_fast(ht, key, ht->p))
739 +- goto exit;
740 ++ if (key) {
741 ++ *data = rhashtable_lookup_fast(ht, key, ht->p);
742 ++ if (*data)
743 ++ goto exit;
744 ++ }
745 +
746 + err = -E2BIG;
747 + if (unlikely(rht_grow_above_max(ht, tbl)))
748 +@@ -838,6 +844,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
749 + for (i = 0; i < tbl->size; i++) {
750 + struct rhash_head *pos, *next;
751 +
752 ++ cond_resched();
753 + for (pos = rht_dereference(tbl->buckets[i], ht),
754 + next = !rht_is_a_nulls(pos) ?
755 + rht_dereference(pos->next, ht) : NULL;
756 +diff --git a/net/core/skbuff.c b/net/core/skbuff.c
757 +index 8a57bbaf7452..fea7c24e99d0 100644
758 +--- a/net/core/skbuff.c
759 ++++ b/net/core/skbuff.c
760 +@@ -1502,6 +1502,21 @@ done:
761 + }
762 + EXPORT_SYMBOL(___pskb_trim);
763 +
764 ++/* Note : use pskb_trim_rcsum() instead of calling this directly
765 ++ */
766 ++int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
767 ++{
768 ++ if (skb->ip_summed == CHECKSUM_COMPLETE) {
769 ++ int delta = skb->len - len;
770 ++
771 ++ skb->csum = csum_block_sub(skb->csum,
772 ++ skb_checksum(skb, len, delta, 0),
773 ++ len);
774 ++ }
775 ++ return __pskb_trim(skb, len);
776 ++}
777 ++EXPORT_SYMBOL(pskb_trim_rcsum_slow);
778 ++
779 + /**
780 + * __pskb_pull_tail - advance tail of skb header
781 + * @skb: buffer to reallocate
782 +@@ -2380,23 +2395,27 @@ EXPORT_SYMBOL(skb_queue_purge);
783 + /**
784 + * skb_rbtree_purge - empty a skb rbtree
785 + * @root: root of the rbtree to empty
786 ++ * Return value: the sum of truesizes of all purged skbs.
787 + *
788 + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
789 + * the list and one reference dropped. This function does not take
790 + * any lock. Synchronization should be handled by the caller (e.g., TCP
791 + * out-of-order queue is protected by the socket lock).
792 + */
793 +-void skb_rbtree_purge(struct rb_root *root)
794 ++unsigned int skb_rbtree_purge(struct rb_root *root)
795 + {
796 + struct rb_node *p = rb_first(root);
797 ++ unsigned int sum = 0;
798 +
799 + while (p) {
800 + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
801 +
802 + p = rb_next(p);
803 + rb_erase(&skb->rbnode, root);
804 ++ sum += skb->truesize;
805 + kfree_skb(skb);
806 + }
807 ++ return sum;
808 + }
809 +
810 + /**
811 +diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
812 +index b4e17a7c0df0..fdbebe51446f 100644
813 +--- a/net/ieee802154/6lowpan/6lowpan_i.h
814 ++++ b/net/ieee802154/6lowpan/6lowpan_i.h
815 +@@ -16,37 +16,19 @@ typedef unsigned __bitwise__ lowpan_rx_result;
816 + #define LOWPAN_DISPATCH_FRAG1 0xc0
817 + #define LOWPAN_DISPATCH_FRAGN 0xe0
818 +
819 +-struct lowpan_create_arg {
820 ++struct frag_lowpan_compare_key {
821 + u16 tag;
822 + u16 d_size;
823 +- const struct ieee802154_addr *src;
824 +- const struct ieee802154_addr *dst;
825 ++ struct ieee802154_addr src;
826 ++ struct ieee802154_addr dst;
827 + };
828 +
829 +-/* Equivalent of ipv4 struct ip
830 ++/* Equivalent of ipv4 struct ipq
831 + */
832 + struct lowpan_frag_queue {
833 + struct inet_frag_queue q;
834 +-
835 +- u16 tag;
836 +- u16 d_size;
837 +- struct ieee802154_addr saddr;
838 +- struct ieee802154_addr daddr;
839 + };
840 +
841 +-static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
842 +-{
843 +- switch (a->mode) {
844 +- case IEEE802154_ADDR_LONG:
845 +- return (((__force u64)a->extended_addr) >> 32) ^
846 +- (((__force u64)a->extended_addr) & 0xffffffff);
847 +- case IEEE802154_ADDR_SHORT:
848 +- return (__force u32)(a->short_addr);
849 +- default:
850 +- return 0;
851 +- }
852 +-}
853 +-
854 + /* private device info */
855 + struct lowpan_dev_info {
856 + struct net_device *wdev; /* wpan device ptr */
857 +diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
858 +index 12e8cf4bda9f..6183730d38db 100644
859 +--- a/net/ieee802154/6lowpan/reassembly.c
860 ++++ b/net/ieee802154/6lowpan/reassembly.c
861 +@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
862 + static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
863 + struct sk_buff *prev, struct net_device *ldev);
864 +
865 +-static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
866 +- const struct ieee802154_addr *saddr,
867 +- const struct ieee802154_addr *daddr)
868 +-{
869 +- net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
870 +- return jhash_3words(ieee802154_addr_hash(saddr),
871 +- ieee802154_addr_hash(daddr),
872 +- (__force u32)(tag + (d_size << 16)),
873 +- lowpan_frags.rnd);
874 +-}
875 +-
876 +-static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
877 +-{
878 +- const struct lowpan_frag_queue *fq;
879 +-
880 +- fq = container_of(q, struct lowpan_frag_queue, q);
881 +- return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
882 +-}
883 +-
884 +-static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
885 +-{
886 +- const struct lowpan_frag_queue *fq;
887 +- const struct lowpan_create_arg *arg = a;
888 +-
889 +- fq = container_of(q, struct lowpan_frag_queue, q);
890 +- return fq->tag == arg->tag && fq->d_size == arg->d_size &&
891 +- ieee802154_addr_equal(&fq->saddr, arg->src) &&
892 +- ieee802154_addr_equal(&fq->daddr, arg->dst);
893 +-}
894 +-
895 + static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
896 + {
897 +- const struct lowpan_create_arg *arg = a;
898 ++ const struct frag_lowpan_compare_key *key = a;
899 + struct lowpan_frag_queue *fq;
900 +
901 + fq = container_of(q, struct lowpan_frag_queue, q);
902 +
903 +- fq->tag = arg->tag;
904 +- fq->d_size = arg->d_size;
905 +- fq->saddr = *arg->src;
906 +- fq->daddr = *arg->dst;
907 ++ BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
908 ++ memcpy(&q->key, key, sizeof(*key));
909 + }
910 +
911 + static void lowpan_frag_expire(unsigned long data)
912 +@@ -93,10 +61,10 @@ static void lowpan_frag_expire(unsigned long data)
913 + if (fq->q.flags & INET_FRAG_COMPLETE)
914 + goto out;
915 +
916 +- inet_frag_kill(&fq->q, &lowpan_frags);
917 ++ inet_frag_kill(&fq->q);
918 + out:
919 + spin_unlock(&fq->q.lock);
920 +- inet_frag_put(&fq->q, &lowpan_frags);
921 ++ inet_frag_put(&fq->q);
922 + }
923 +
924 + static inline struct lowpan_frag_queue *
925 +@@ -104,25 +72,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
926 + const struct ieee802154_addr *src,
927 + const struct ieee802154_addr *dst)
928 + {
929 +- struct inet_frag_queue *q;
930 +- struct lowpan_create_arg arg;
931 +- unsigned int hash;
932 + struct netns_ieee802154_lowpan *ieee802154_lowpan =
933 + net_ieee802154_lowpan(net);
934 ++ struct frag_lowpan_compare_key key = {};
935 ++ struct inet_frag_queue *q;
936 +
937 +- arg.tag = cb->d_tag;
938 +- arg.d_size = cb->d_size;
939 +- arg.src = src;
940 +- arg.dst = dst;
941 +-
942 +- hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
943 ++ key.tag = cb->d_tag;
944 ++ key.d_size = cb->d_size;
945 ++ key.src = *src;
946 ++ key.dst = *dst;
947 +
948 +- q = inet_frag_find(&ieee802154_lowpan->frags,
949 +- &lowpan_frags, &arg, hash);
950 +- if (IS_ERR_OR_NULL(q)) {
951 +- inet_frag_maybe_warn_overflow(q, pr_fmt());
952 ++ q = inet_frag_find(&ieee802154_lowpan->frags, &key);
953 ++ if (!q)
954 + return NULL;
955 +- }
956 ++
957 + return container_of(q, struct lowpan_frag_queue, q);
958 + }
959 +
960 +@@ -229,7 +192,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
961 + struct sk_buff *fp, *head = fq->q.fragments;
962 + int sum_truesize;
963 +
964 +- inet_frag_kill(&fq->q, &lowpan_frags);
965 ++ inet_frag_kill(&fq->q);
966 +
967 + /* Make the one we just received the head. */
968 + if (prev) {
969 +@@ -408,7 +371,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
970 + struct lowpan_frag_queue *fq;
971 + struct net *net = dev_net(skb->dev);
972 + struct lowpan_802154_cb *cb = lowpan_802154_cb(skb);
973 +- struct ieee802154_hdr hdr;
974 ++ struct ieee802154_hdr hdr = {};
975 + int err;
976 +
977 + if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
978 +@@ -437,7 +400,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
979 + ret = lowpan_frag_queue(fq, skb, frag_type);
980 + spin_unlock(&fq->q.lock);
981 +
982 +- inet_frag_put(&fq->q, &lowpan_frags);
983 ++ inet_frag_put(&fq->q);
984 + return ret;
985 + }
986 +
987 +@@ -447,24 +410,22 @@ err:
988 + }
989 +
990 + #ifdef CONFIG_SYSCTL
991 +-static int zero;
992 +
993 + static struct ctl_table lowpan_frags_ns_ctl_table[] = {
994 + {
995 + .procname = "6lowpanfrag_high_thresh",
996 + .data = &init_net.ieee802154_lowpan.frags.high_thresh,
997 +- .maxlen = sizeof(int),
998 ++ .maxlen = sizeof(unsigned long),
999 + .mode = 0644,
1000 +- .proc_handler = proc_dointvec_minmax,
1001 ++ .proc_handler = proc_doulongvec_minmax,
1002 + .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
1003 + },
1004 + {
1005 + .procname = "6lowpanfrag_low_thresh",
1006 + .data = &init_net.ieee802154_lowpan.frags.low_thresh,
1007 +- .maxlen = sizeof(int),
1008 ++ .maxlen = sizeof(unsigned long),
1009 + .mode = 0644,
1010 +- .proc_handler = proc_dointvec_minmax,
1011 +- .extra1 = &zero,
1012 ++ .proc_handler = proc_doulongvec_minmax,
1013 + .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh
1014 + },
1015 + {
1016 +@@ -580,14 +541,20 @@ static int __net_init lowpan_frags_init_net(struct net *net)
1017 + {
1018 + struct netns_ieee802154_lowpan *ieee802154_lowpan =
1019 + net_ieee802154_lowpan(net);
1020 ++ int res;
1021 +
1022 + ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
1023 + ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
1024 + ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
1025 ++ ieee802154_lowpan->frags.f = &lowpan_frags;
1026 +
1027 +- inet_frags_init_net(&ieee802154_lowpan->frags);
1028 +-
1029 +- return lowpan_frags_ns_sysctl_register(net);
1030 ++ res = inet_frags_init_net(&ieee802154_lowpan->frags);
1031 ++ if (res < 0)
1032 ++ return res;
1033 ++ res = lowpan_frags_ns_sysctl_register(net);
1034 ++ if (res < 0)
1035 ++ inet_frags_exit_net(&ieee802154_lowpan->frags);
1036 ++ return res;
1037 + }
1038 +
1039 + static void __net_exit lowpan_frags_exit_net(struct net *net)
1040 +@@ -596,7 +563,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
1041 + net_ieee802154_lowpan(net);
1042 +
1043 + lowpan_frags_ns_sysctl_unregister(net);
1044 +- inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
1045 ++ inet_frags_exit_net(&ieee802154_lowpan->frags);
1046 + }
1047 +
1048 + static struct pernet_operations lowpan_frags_ops = {
1049 +@@ -604,33 +571,64 @@ static struct pernet_operations lowpan_frags_ops = {
1050 + .exit = lowpan_frags_exit_net,
1051 + };
1052 +
1053 +-int __init lowpan_net_frag_init(void)
1054 ++static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
1055 + {
1056 +- int ret;
1057 ++ return jhash2(data,
1058 ++ sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
1059 ++}
1060 +
1061 +- ret = lowpan_frags_sysctl_register();
1062 +- if (ret)
1063 +- return ret;
1064 ++static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
1065 ++{
1066 ++ const struct inet_frag_queue *fq = data;
1067 +
1068 +- ret = register_pernet_subsys(&lowpan_frags_ops);
1069 +- if (ret)
1070 +- goto err_pernet;
1071 ++ return jhash2((const u32 *)&fq->key,
1072 ++ sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
1073 ++}
1074 ++
1075 ++static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
1076 ++{
1077 ++ const struct frag_lowpan_compare_key *key = arg->key;
1078 ++ const struct inet_frag_queue *fq = ptr;
1079 ++
1080 ++ return !!memcmp(&fq->key, key, sizeof(*key));
1081 ++}
1082 ++
1083 ++static const struct rhashtable_params lowpan_rhash_params = {
1084 ++ .head_offset = offsetof(struct inet_frag_queue, node),
1085 ++ .hashfn = lowpan_key_hashfn,
1086 ++ .obj_hashfn = lowpan_obj_hashfn,
1087 ++ .obj_cmpfn = lowpan_obj_cmpfn,
1088 ++ .automatic_shrinking = true,
1089 ++};
1090 ++
1091 ++int __init lowpan_net_frag_init(void)
1092 ++{
1093 ++ int ret;
1094 +
1095 +- lowpan_frags.hashfn = lowpan_hashfn;
1096 + lowpan_frags.constructor = lowpan_frag_init;
1097 + lowpan_frags.destructor = NULL;
1098 + lowpan_frags.skb_free = NULL;
1099 + lowpan_frags.qsize = sizeof(struct frag_queue);
1100 +- lowpan_frags.match = lowpan_frag_match;
1101 + lowpan_frags.frag_expire = lowpan_frag_expire;
1102 + lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
1103 ++ lowpan_frags.rhash_params = lowpan_rhash_params;
1104 + ret = inet_frags_init(&lowpan_frags);
1105 + if (ret)
1106 +- goto err_pernet;
1107 ++ goto out;
1108 +
1109 ++ ret = lowpan_frags_sysctl_register();
1110 ++ if (ret)
1111 ++ goto err_sysctl;
1112 ++
1113 ++ ret = register_pernet_subsys(&lowpan_frags_ops);
1114 ++ if (ret)
1115 ++ goto err_pernet;
1116 ++out:
1117 + return ret;
1118 + err_pernet:
1119 + lowpan_frags_sysctl_unregister();
1120 ++err_sysctl:
1121 ++ inet_frags_fini(&lowpan_frags);
1122 + return ret;
1123 + }
1124 +
1125 +diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
1126 +index b2001b20e029..c03e5f5859e1 100644
1127 +--- a/net/ipv4/inet_fragment.c
1128 ++++ b/net/ipv4/inet_fragment.c
1129 +@@ -25,12 +25,6 @@
1130 + #include <net/inet_frag.h>
1131 + #include <net/inet_ecn.h>
1132 +
1133 +-#define INETFRAGS_EVICT_BUCKETS 128
1134 +-#define INETFRAGS_EVICT_MAX 512
1135 +-
1136 +-/* don't rebuild inetfrag table with new secret more often than this */
1137 +-#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
1138 +-
1139 + /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
1140 + * Value : 0xff if frame should be dropped.
1141 + * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
1142 +@@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
1143 + };
1144 + EXPORT_SYMBOL(ip_frag_ecn_table);
1145 +
1146 +-static unsigned int
1147 +-inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
1148 +-{
1149 +- return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
1150 +-}
1151 +-
1152 +-static bool inet_frag_may_rebuild(struct inet_frags *f)
1153 +-{
1154 +- return time_after(jiffies,
1155 +- f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
1156 +-}
1157 +-
1158 +-static void inet_frag_secret_rebuild(struct inet_frags *f)
1159 +-{
1160 +- int i;
1161 +-
1162 +- write_seqlock_bh(&f->rnd_seqlock);
1163 +-
1164 +- if (!inet_frag_may_rebuild(f))
1165 +- goto out;
1166 +-
1167 +- get_random_bytes(&f->rnd, sizeof(u32));
1168 +-
1169 +- for (i = 0; i < INETFRAGS_HASHSZ; i++) {
1170 +- struct inet_frag_bucket *hb;
1171 +- struct inet_frag_queue *q;
1172 +- struct hlist_node *n;
1173 +-
1174 +- hb = &f->hash[i];
1175 +- spin_lock(&hb->chain_lock);
1176 +-
1177 +- hlist_for_each_entry_safe(q, n, &hb->chain, list) {
1178 +- unsigned int hval = inet_frag_hashfn(f, q);
1179 +-
1180 +- if (hval != i) {
1181 +- struct inet_frag_bucket *hb_dest;
1182 +-
1183 +- hlist_del(&q->list);
1184 +-
1185 +- /* Relink to new hash chain. */
1186 +- hb_dest = &f->hash[hval];
1187 +-
1188 +- /* This is the only place where we take
1189 +- * another chain_lock while already holding
1190 +- * one. As this will not run concurrently,
1191 +- * we cannot deadlock on hb_dest lock below, if its
1192 +- * already locked it will be released soon since
1193 +- * other caller cannot be waiting for hb lock
1194 +- * that we've taken above.
1195 +- */
1196 +- spin_lock_nested(&hb_dest->chain_lock,
1197 +- SINGLE_DEPTH_NESTING);
1198 +- hlist_add_head(&q->list, &hb_dest->chain);
1199 +- spin_unlock(&hb_dest->chain_lock);
1200 +- }
1201 +- }
1202 +- spin_unlock(&hb->chain_lock);
1203 +- }
1204 +-
1205 +- f->rebuild = false;
1206 +- f->last_rebuild_jiffies = jiffies;
1207 +-out:
1208 +- write_sequnlock_bh(&f->rnd_seqlock);
1209 +-}
1210 +-
1211 +-static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
1212 +-{
1213 +- if (!hlist_unhashed(&q->list_evictor))
1214 +- return false;
1215 +-
1216 +- return q->net->low_thresh == 0 ||
1217 +- frag_mem_limit(q->net) >= q->net->low_thresh;
1218 +-}
1219 +-
1220 +-static unsigned int
1221 +-inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
1222 +-{
1223 +- struct inet_frag_queue *fq;
1224 +- struct hlist_node *n;
1225 +- unsigned int evicted = 0;
1226 +- HLIST_HEAD(expired);
1227 +-
1228 +- spin_lock(&hb->chain_lock);
1229 +-
1230 +- hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
1231 +- if (!inet_fragq_should_evict(fq))
1232 +- continue;
1233 +-
1234 +- if (!del_timer(&fq->timer))
1235 +- continue;
1236 +-
1237 +- hlist_add_head(&fq->list_evictor, &expired);
1238 +- ++evicted;
1239 +- }
1240 +-
1241 +- spin_unlock(&hb->chain_lock);
1242 +-
1243 +- hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
1244 +- f->frag_expire((unsigned long) fq);
1245 +-
1246 +- return evicted;
1247 +-}
1248 +-
1249 +-static void inet_frag_worker(struct work_struct *work)
1250 +-{
1251 +- unsigned int budget = INETFRAGS_EVICT_BUCKETS;
1252 +- unsigned int i, evicted = 0;
1253 +- struct inet_frags *f;
1254 +-
1255 +- f = container_of(work, struct inet_frags, frags_work);
1256 +-
1257 +- BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
1258 +-
1259 +- local_bh_disable();
1260 +-
1261 +- for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
1262 +- evicted += inet_evict_bucket(f, &f->hash[i]);
1263 +- i = (i + 1) & (INETFRAGS_HASHSZ - 1);
1264 +- if (evicted > INETFRAGS_EVICT_MAX)
1265 +- break;
1266 +- }
1267 +-
1268 +- f->next_bucket = i;
1269 +-
1270 +- local_bh_enable();
1271 +-
1272 +- if (f->rebuild && inet_frag_may_rebuild(f))
1273 +- inet_frag_secret_rebuild(f);
1274 +-}
1275 +-
1276 +-static void inet_frag_schedule_worker(struct inet_frags *f)
1277 +-{
1278 +- if (unlikely(!work_pending(&f->frags_work)))
1279 +- schedule_work(&f->frags_work);
1280 +-}
1281 +-
1282 + int inet_frags_init(struct inet_frags *f)
1283 + {
1284 +- int i;
1285 +-
1286 +- INIT_WORK(&f->frags_work, inet_frag_worker);
1287 +-
1288 +- for (i = 0; i < INETFRAGS_HASHSZ; i++) {
1289 +- struct inet_frag_bucket *hb = &f->hash[i];
1290 +-
1291 +- spin_lock_init(&hb->chain_lock);
1292 +- INIT_HLIST_HEAD(&hb->chain);
1293 +- }
1294 +-
1295 +- seqlock_init(&f->rnd_seqlock);
1296 +- f->last_rebuild_jiffies = 0;
1297 + f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
1298 + NULL);
1299 + if (!f->frags_cachep)
1300 +@@ -214,73 +59,53 @@ EXPORT_SYMBOL(inet_frags_init);
1301 +
1302 + void inet_frags_fini(struct inet_frags *f)
1303 + {
1304 +- cancel_work_sync(&f->frags_work);
1305 ++ /* We must wait that all inet_frag_destroy_rcu() have completed. */
1306 ++ rcu_barrier();
1307 ++
1308 + kmem_cache_destroy(f->frags_cachep);
1309 ++ f->frags_cachep = NULL;
1310 + }
1311 + EXPORT_SYMBOL(inet_frags_fini);
1312 +
1313 +-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
1314 ++static void inet_frags_free_cb(void *ptr, void *arg)
1315 + {
1316 +- unsigned int seq;
1317 +- int i;
1318 ++ struct inet_frag_queue *fq = ptr;
1319 +
1320 +- nf->low_thresh = 0;
1321 +-
1322 +-evict_again:
1323 +- local_bh_disable();
1324 +- seq = read_seqbegin(&f->rnd_seqlock);
1325 +-
1326 +- for (i = 0; i < INETFRAGS_HASHSZ ; i++)
1327 +- inet_evict_bucket(f, &f->hash[i]);
1328 +-
1329 +- local_bh_enable();
1330 +- cond_resched();
1331 +-
1332 +- if (read_seqretry(&f->rnd_seqlock, seq) ||
1333 +- sum_frag_mem_limit(nf))
1334 +- goto evict_again;
1335 +-}
1336 +-EXPORT_SYMBOL(inet_frags_exit_net);
1337 +-
1338 +-static struct inet_frag_bucket *
1339 +-get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
1340 +-__acquires(hb->chain_lock)
1341 +-{
1342 +- struct inet_frag_bucket *hb;
1343 +- unsigned int seq, hash;
1344 +-
1345 +- restart:
1346 +- seq = read_seqbegin(&f->rnd_seqlock);
1347 +-
1348 +- hash = inet_frag_hashfn(f, fq);
1349 +- hb = &f->hash[hash];
1350 ++ /* If we can not cancel the timer, it means this frag_queue
1351 ++ * is already disappearing, we have nothing to do.
1352 ++ * Otherwise, we own a refcount until the end of this function.
1353 ++ */
1354 ++ if (!del_timer(&fq->timer))
1355 ++ return;
1356 +
1357 +- spin_lock(&hb->chain_lock);
1358 +- if (read_seqretry(&f->rnd_seqlock, seq)) {
1359 +- spin_unlock(&hb->chain_lock);
1360 +- goto restart;
1361 ++ spin_lock_bh(&fq->lock);
1362 ++ if (!(fq->flags & INET_FRAG_COMPLETE)) {
1363 ++ fq->flags |= INET_FRAG_COMPLETE;
1364 ++ atomic_dec(&fq->refcnt);
1365 + }
1366 ++ spin_unlock_bh(&fq->lock);
1367 +
1368 +- return hb;
1369 ++ inet_frag_put(fq);
1370 + }
1371 +
1372 +-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
1373 ++void inet_frags_exit_net(struct netns_frags *nf)
1374 + {
1375 +- struct inet_frag_bucket *hb;
1376 ++ nf->high_thresh = 0; /* prevent creation of new frags */
1377 +
1378 +- hb = get_frag_bucket_locked(fq, f);
1379 +- hlist_del(&fq->list);
1380 +- fq->flags |= INET_FRAG_COMPLETE;
1381 +- spin_unlock(&hb->chain_lock);
1382 ++ rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
1383 + }
1384 ++EXPORT_SYMBOL(inet_frags_exit_net);
1385 +
1386 +-void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
1387 ++void inet_frag_kill(struct inet_frag_queue *fq)
1388 + {
1389 + if (del_timer(&fq->timer))
1390 + atomic_dec(&fq->refcnt);
1391 +
1392 + if (!(fq->flags & INET_FRAG_COMPLETE)) {
1393 +- fq_unlink(fq, f);
1394 ++ struct netns_frags *nf = fq->net;
1395 ++
1396 ++ fq->flags |= INET_FRAG_COMPLETE;
1397 ++ rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
1398 + atomic_dec(&fq->refcnt);
1399 + }
1400 + }
1401 +@@ -294,11 +119,23 @@ static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
1402 + kfree_skb(skb);
1403 + }
1404 +
1405 +-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
1406 ++static void inet_frag_destroy_rcu(struct rcu_head *head)
1407 ++{
1408 ++ struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
1409 ++ rcu);
1410 ++ struct inet_frags *f = q->net->f;
1411 ++
1412 ++ if (f->destructor)
1413 ++ f->destructor(q);
1414 ++ kmem_cache_free(f->frags_cachep, q);
1415 ++}
1416 ++
1417 ++void inet_frag_destroy(struct inet_frag_queue *q)
1418 + {
1419 + struct sk_buff *fp;
1420 + struct netns_frags *nf;
1421 + unsigned int sum, sum_truesize = 0;
1422 ++ struct inet_frags *f;
1423 +
1424 + WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
1425 + WARN_ON(del_timer(&q->timer) != 0);
1426 +@@ -306,64 +143,35 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
1427 + /* Release all fragment data. */
1428 + fp = q->fragments;
1429 + nf = q->net;
1430 +- while (fp) {
1431 +- struct sk_buff *xp = fp->next;
1432 +-
1433 +- sum_truesize += fp->truesize;
1434 +- frag_kfree_skb(nf, f, fp);
1435 +- fp = xp;
1436 ++ f = nf->f;
1437 ++ if (fp) {
1438 ++ do {
1439 ++ struct sk_buff *xp = fp->next;
1440 ++
1441 ++ sum_truesize += fp->truesize;
1442 ++ frag_kfree_skb(nf, f, fp);
1443 ++ fp = xp;
1444 ++ } while (fp);
1445 ++ } else {
1446 ++ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
1447 + }
1448 + sum = sum_truesize + f->qsize;
1449 +
1450 +- if (f->destructor)
1451 +- f->destructor(q);
1452 +- kmem_cache_free(f->frags_cachep, q);
1453 ++ call_rcu(&q->rcu, inet_frag_destroy_rcu);
1454 +
1455 + sub_frag_mem_limit(nf, sum);
1456 + }
1457 + EXPORT_SYMBOL(inet_frag_destroy);
1458 +
1459 +-static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
1460 +- struct inet_frag_queue *qp_in,
1461 +- struct inet_frags *f,
1462 +- void *arg)
1463 +-{
1464 +- struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
1465 +- struct inet_frag_queue *qp;
1466 +-
1467 +-#ifdef CONFIG_SMP
1468 +- /* With SMP race we have to recheck hash table, because
1469 +- * such entry could have been created on other cpu before
1470 +- * we acquired hash bucket lock.
1471 +- */
1472 +- hlist_for_each_entry(qp, &hb->chain, list) {
1473 +- if (qp->net == nf && f->match(qp, arg)) {
1474 +- atomic_inc(&qp->refcnt);
1475 +- spin_unlock(&hb->chain_lock);
1476 +- qp_in->flags |= INET_FRAG_COMPLETE;
1477 +- inet_frag_put(qp_in, f);
1478 +- return qp;
1479 +- }
1480 +- }
1481 +-#endif
1482 +- qp = qp_in;
1483 +- if (!mod_timer(&qp->timer, jiffies + nf->timeout))
1484 +- atomic_inc(&qp->refcnt);
1485 +-
1486 +- atomic_inc(&qp->refcnt);
1487 +- hlist_add_head(&qp->list, &hb->chain);
1488 +-
1489 +- spin_unlock(&hb->chain_lock);
1490 +-
1491 +- return qp;
1492 +-}
1493 +-
1494 + static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
1495 + struct inet_frags *f,
1496 + void *arg)
1497 + {
1498 + struct inet_frag_queue *q;
1499 +
1500 ++ if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
1501 ++ return NULL;
1502 ++
1503 + q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
1504 + if (!q)
1505 + return NULL;
1506 +@@ -374,75 +182,52 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
1507 +
1508 + setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
1509 + spin_lock_init(&q->lock);
1510 +- atomic_set(&q->refcnt, 1);
1511 ++ atomic_set(&q->refcnt, 3);
1512 +
1513 + return q;
1514 + }
1515 +
1516 + static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
1517 +- struct inet_frags *f,
1518 +- void *arg)
1519 ++ void *arg,
1520 ++ struct inet_frag_queue **prev)
1521 + {
1522 ++ struct inet_frags *f = nf->f;
1523 + struct inet_frag_queue *q;
1524 +
1525 + q = inet_frag_alloc(nf, f, arg);
1526 +- if (!q)
1527 +- return NULL;
1528 +-
1529 +- return inet_frag_intern(nf, q, f, arg);
1530 +-}
1531 +-
1532 +-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
1533 +- struct inet_frags *f, void *key,
1534 +- unsigned int hash)
1535 +-{
1536 +- struct inet_frag_bucket *hb;
1537 +- struct inet_frag_queue *q;
1538 +- int depth = 0;
1539 +-
1540 +- if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
1541 +- inet_frag_schedule_worker(f);
1542 ++ if (!q) {
1543 ++ *prev = ERR_PTR(-ENOMEM);
1544 + return NULL;
1545 + }
1546 +-
1547 +- if (frag_mem_limit(nf) > nf->low_thresh)
1548 +- inet_frag_schedule_worker(f);
1549 +-
1550 +- hash &= (INETFRAGS_HASHSZ - 1);
1551 +- hb = &f->hash[hash];
1552 +-
1553 +- spin_lock(&hb->chain_lock);
1554 +- hlist_for_each_entry(q, &hb->chain, list) {
1555 +- if (q->net == nf && f->match(q, key)) {
1556 +- atomic_inc(&q->refcnt);
1557 +- spin_unlock(&hb->chain_lock);
1558 +- return q;
1559 +- }
1560 +- depth++;
1561 +- }
1562 +- spin_unlock(&hb->chain_lock);
1563 +-
1564 +- if (depth <= INETFRAGS_MAXDEPTH)
1565 +- return inet_frag_create(nf, f, key);
1566 +-
1567 +- if (inet_frag_may_rebuild(f)) {
1568 +- if (!f->rebuild)
1569 +- f->rebuild = true;
1570 +- inet_frag_schedule_worker(f);
1571 ++ mod_timer(&q->timer, jiffies + nf->timeout);
1572 ++
1573 ++ *prev = rhashtable_lookup_get_insert_key(&nf->rhashtable, &q->key,
1574 ++ &q->node, f->rhash_params);
1575 ++ if (*prev) {
1576 ++ q->flags |= INET_FRAG_COMPLETE;
1577 ++ inet_frag_kill(q);
1578 ++ inet_frag_destroy(q);
1579 ++ return NULL;
1580 + }
1581 +-
1582 +- return ERR_PTR(-ENOBUFS);
1583 ++ return q;
1584 + }
1585 +-EXPORT_SYMBOL(inet_frag_find);
1586 ++EXPORT_SYMBOL(inet_frag_create);
1587 +
1588 +-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
1589 +- const char *prefix)
1590 ++/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
1591 ++struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
1592 + {
1593 +- static const char msg[] = "inet_frag_find: Fragment hash bucket"
1594 +- " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
1595 +- ". Dropping fragment.\n";
1596 ++ struct inet_frag_queue *fq = NULL, *prev;
1597 +
1598 +- if (PTR_ERR(q) == -ENOBUFS)
1599 +- net_dbg_ratelimited("%s%s", prefix, msg);
1600 ++ rcu_read_lock();
1601 ++ prev = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
1602 ++ if (!prev)
1603 ++ fq = inet_frag_create(nf, key, &prev);
1604 ++ if (prev && !IS_ERR(prev)) {
1605 ++ fq = prev;
1606 ++ if (!atomic_inc_not_zero(&fq->refcnt))
1607 ++ fq = NULL;
1608 ++ }
1609 ++ rcu_read_unlock();
1610 ++ return fq;
1611 + }
1612 +-EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
1613 ++EXPORT_SYMBOL(inet_frag_find);
1614 +diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
1615 +index 72915658a6b1..9b09a9b5a4fe 100644
1616 +--- a/net/ipv4/ip_fragment.c
1617 ++++ b/net/ipv4/ip_fragment.c
1618 +@@ -58,27 +58,64 @@
1619 + static int sysctl_ipfrag_max_dist __read_mostly = 64;
1620 + static const char ip_frag_cache_name[] = "ip4-frags";
1621 +
1622 +-struct ipfrag_skb_cb
1623 +-{
1624 ++/* Use skb->cb to track consecutive/adjacent fragments coming at
1625 ++ * the end of the queue. Nodes in the rb-tree queue will
1626 ++ * contain "runs" of one or more adjacent fragments.
1627 ++ *
1628 ++ * Invariants:
1629 ++ * - next_frag is NULL at the tail of a "run";
1630 ++ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
1631 ++ */
1632 ++struct ipfrag_skb_cb {
1633 + struct inet_skb_parm h;
1634 +- int offset;
1635 ++ struct sk_buff *next_frag;
1636 ++ int frag_run_len;
1637 + };
1638 +
1639 +-#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
1640 ++#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
1641 ++
1642 ++static void ip4_frag_init_run(struct sk_buff *skb)
1643 ++{
1644 ++ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
1645 ++
1646 ++ FRAG_CB(skb)->next_frag = NULL;
1647 ++ FRAG_CB(skb)->frag_run_len = skb->len;
1648 ++}
1649 ++
1650 ++/* Append skb to the last "run". */
1651 ++static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
1652 ++ struct sk_buff *skb)
1653 ++{
1654 ++ RB_CLEAR_NODE(&skb->rbnode);
1655 ++ FRAG_CB(skb)->next_frag = NULL;
1656 ++
1657 ++ FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
1658 ++ FRAG_CB(q->fragments_tail)->next_frag = skb;
1659 ++ q->fragments_tail = skb;
1660 ++}
1661 ++
1662 ++/* Create a new "run" with the skb. */
1663 ++static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
1664 ++{
1665 ++ if (q->last_run_head)
1666 ++ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
1667 ++ &q->last_run_head->rbnode.rb_right);
1668 ++ else
1669 ++ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
1670 ++ rb_insert_color(&skb->rbnode, &q->rb_fragments);
1671 ++
1672 ++ ip4_frag_init_run(skb);
1673 ++ q->fragments_tail = skb;
1674 ++ q->last_run_head = skb;
1675 ++}
1676 +
1677 + /* Describe an entry in the "incomplete datagrams" queue. */
1678 + struct ipq {
1679 + struct inet_frag_queue q;
1680 +
1681 +- u32 user;
1682 +- __be32 saddr;
1683 +- __be32 daddr;
1684 +- __be16 id;
1685 +- u8 protocol;
1686 + u8 ecn; /* RFC3168 support */
1687 + u16 max_df_size; /* largest frag with DF set seen */
1688 + int iif;
1689 +- int vif; /* L3 master device index */
1690 + unsigned int rid;
1691 + struct inet_peer *peer;
1692 + };
1693 +@@ -90,49 +127,9 @@ static u8 ip4_frag_ecn(u8 tos)
1694 +
1695 + static struct inet_frags ip4_frags;
1696 +
1697 +-int ip_frag_mem(struct net *net)
1698 +-{
1699 +- return sum_frag_mem_limit(&net->ipv4.frags);
1700 +-}
1701 +-
1702 +-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
1703 +- struct net_device *dev);
1704 +-
1705 +-struct ip4_create_arg {
1706 +- struct iphdr *iph;
1707 +- u32 user;
1708 +- int vif;
1709 +-};
1710 ++static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
1711 ++ struct sk_buff *prev_tail, struct net_device *dev);
1712 +
1713 +-static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
1714 +-{
1715 +- net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
1716 +- return jhash_3words((__force u32)id << 16 | prot,
1717 +- (__force u32)saddr, (__force u32)daddr,
1718 +- ip4_frags.rnd);
1719 +-}
1720 +-
1721 +-static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
1722 +-{
1723 +- const struct ipq *ipq;
1724 +-
1725 +- ipq = container_of(q, struct ipq, q);
1726 +- return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
1727 +-}
1728 +-
1729 +-static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
1730 +-{
1731 +- const struct ipq *qp;
1732 +- const struct ip4_create_arg *arg = a;
1733 +-
1734 +- qp = container_of(q, struct ipq, q);
1735 +- return qp->id == arg->iph->id &&
1736 +- qp->saddr == arg->iph->saddr &&
1737 +- qp->daddr == arg->iph->daddr &&
1738 +- qp->protocol == arg->iph->protocol &&
1739 +- qp->user == arg->user &&
1740 +- qp->vif == arg->vif;
1741 +-}
1742 +
1743 + static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
1744 + {
1745 +@@ -141,17 +138,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
1746 + frags);
1747 + struct net *net = container_of(ipv4, struct net, ipv4);
1748 +
1749 +- const struct ip4_create_arg *arg = a;
1750 ++ const struct frag_v4_compare_key *key = a;
1751 +
1752 +- qp->protocol = arg->iph->protocol;
1753 +- qp->id = arg->iph->id;
1754 +- qp->ecn = ip4_frag_ecn(arg->iph->tos);
1755 +- qp->saddr = arg->iph->saddr;
1756 +- qp->daddr = arg->iph->daddr;
1757 +- qp->vif = arg->vif;
1758 +- qp->user = arg->user;
1759 ++ q->key.v4 = *key;
1760 ++ qp->ecn = 0;
1761 + qp->peer = sysctl_ipfrag_max_dist ?
1762 +- inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
1763 ++ inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
1764 + NULL;
1765 + }
1766 +
1767 +@@ -169,7 +161,7 @@ static void ip4_frag_free(struct inet_frag_queue *q)
1768 +
1769 + static void ipq_put(struct ipq *ipq)
1770 + {
1771 +- inet_frag_put(&ipq->q, &ip4_frags);
1772 ++ inet_frag_put(&ipq->q);
1773 + }
1774 +
1775 + /* Kill ipq entry. It is not destroyed immediately,
1776 +@@ -177,7 +169,7 @@ static void ipq_put(struct ipq *ipq)
1777 + */
1778 + static void ipq_kill(struct ipq *ipq)
1779 + {
1780 +- inet_frag_kill(&ipq->q, &ip4_frags);
1781 ++ inet_frag_kill(&ipq->q);
1782 + }
1783 +
1784 + static bool frag_expire_skip_icmp(u32 user)
1785 +@@ -194,8 +186,11 @@ static bool frag_expire_skip_icmp(u32 user)
1786 + */
1787 + static void ip_expire(unsigned long arg)
1788 + {
1789 +- struct ipq *qp;
1790 ++ const struct iphdr *iph;
1791 ++ struct sk_buff *head = NULL;
1792 + struct net *net;
1793 ++ struct ipq *qp;
1794 ++ int err;
1795 +
1796 + qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
1797 + net = container_of(qp->q.net, struct net, ipv4.frags);
1798 +@@ -208,51 +203,65 @@ static void ip_expire(unsigned long arg)
1799 +
1800 + ipq_kill(qp);
1801 + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
1802 ++ IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
1803 +
1804 +- if (!inet_frag_evicting(&qp->q)) {
1805 +- struct sk_buff *clone, *head = qp->q.fragments;
1806 +- const struct iphdr *iph;
1807 +- int err;
1808 +-
1809 +- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
1810 ++ if (!(qp->q.flags & INET_FRAG_FIRST_IN))
1811 ++ goto out;
1812 +
1813 +- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
1814 ++ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
1815 ++ * pull the head out of the tree in order to be able to
1816 ++ * deal with head->dev.
1817 ++ */
1818 ++ if (qp->q.fragments) {
1819 ++ head = qp->q.fragments;
1820 ++ qp->q.fragments = head->next;
1821 ++ } else {
1822 ++ head = skb_rb_first(&qp->q.rb_fragments);
1823 ++ if (!head)
1824 + goto out;
1825 ++ if (FRAG_CB(head)->next_frag)
1826 ++ rb_replace_node(&head->rbnode,
1827 ++ &FRAG_CB(head)->next_frag->rbnode,
1828 ++ &qp->q.rb_fragments);
1829 ++ else
1830 ++ rb_erase(&head->rbnode, &qp->q.rb_fragments);
1831 ++ memset(&head->rbnode, 0, sizeof(head->rbnode));
1832 ++ barrier();
1833 ++ }
1834 ++ if (head == qp->q.fragments_tail)
1835 ++ qp->q.fragments_tail = NULL;
1836 +
1837 +- head->dev = dev_get_by_index_rcu(net, qp->iif);
1838 +- if (!head->dev)
1839 +- goto out;
1840 ++ sub_frag_mem_limit(qp->q.net, head->truesize);
1841 ++
1842 ++ head->dev = dev_get_by_index_rcu(net, qp->iif);
1843 ++ if (!head->dev)
1844 ++ goto out;
1845 +
1846 +
1847 +- /* skb has no dst, perform route lookup again */
1848 +- iph = ip_hdr(head);
1849 +- err = ip_route_input_noref(head, iph->daddr, iph->saddr,
1850 ++ /* skb has no dst, perform route lookup again */
1851 ++ iph = ip_hdr(head);
1852 ++ err = ip_route_input_noref(head, iph->daddr, iph->saddr,
1853 + iph->tos, head->dev);
1854 +- if (err)
1855 +- goto out;
1856 ++ if (err)
1857 ++ goto out;
1858 +
1859 +- /* Only an end host needs to send an ICMP
1860 +- * "Fragment Reassembly Timeout" message, per RFC792.
1861 +- */
1862 +- if (frag_expire_skip_icmp(qp->user) &&
1863 +- (skb_rtable(head)->rt_type != RTN_LOCAL))
1864 +- goto out;
1865 ++ /* Only an end host needs to send an ICMP
1866 ++ * "Fragment Reassembly Timeout" message, per RFC792.
1867 ++ */
1868 ++ if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
1869 ++ (skb_rtable(head)->rt_type != RTN_LOCAL))
1870 ++ goto out;
1871 +
1872 +- clone = skb_clone(head, GFP_ATOMIC);
1873 ++ spin_unlock(&qp->q.lock);
1874 ++ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
1875 ++ goto out_rcu_unlock;
1876 +
1877 +- /* Send an ICMP "Fragment Reassembly Timeout" message. */
1878 +- if (clone) {
1879 +- spin_unlock(&qp->q.lock);
1880 +- icmp_send(clone, ICMP_TIME_EXCEEDED,
1881 +- ICMP_EXC_FRAGTIME, 0);
1882 +- consume_skb(clone);
1883 +- goto out_rcu_unlock;
1884 +- }
1885 +- }
1886 + out:
1887 + spin_unlock(&qp->q.lock);
1888 + out_rcu_unlock:
1889 + rcu_read_unlock();
1890 ++ if (head)
1891 ++ kfree_skb(head);
1892 + ipq_put(qp);
1893 + }
1894 +
1895 +@@ -262,21 +271,20 @@ out_rcu_unlock:
1896 + static struct ipq *ip_find(struct net *net, struct iphdr *iph,
1897 + u32 user, int vif)
1898 + {
1899 ++ struct frag_v4_compare_key key = {
1900 ++ .saddr = iph->saddr,
1901 ++ .daddr = iph->daddr,
1902 ++ .user = user,
1903 ++ .vif = vif,
1904 ++ .id = iph->id,
1905 ++ .protocol = iph->protocol,
1906 ++ };
1907 + struct inet_frag_queue *q;
1908 +- struct ip4_create_arg arg;
1909 +- unsigned int hash;
1910 +-
1911 +- arg.iph = iph;
1912 +- arg.user = user;
1913 +- arg.vif = vif;
1914 +-
1915 +- hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
1916 +
1917 +- q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
1918 +- if (IS_ERR_OR_NULL(q)) {
1919 +- inet_frag_maybe_warn_overflow(q, pr_fmt());
1920 ++ q = inet_frag_find(&net->ipv4.frags, &key);
1921 ++ if (!q)
1922 + return NULL;
1923 +- }
1924 ++
1925 + return container_of(q, struct ipq, q);
1926 + }
1927 +
1928 +@@ -296,7 +304,7 @@ static int ip_frag_too_far(struct ipq *qp)
1929 + end = atomic_inc_return(&peer->rid);
1930 + qp->rid = end;
1931 +
1932 +- rc = qp->q.fragments && (end - start) > max;
1933 ++ rc = qp->q.fragments_tail && (end - start) > max;
1934 +
1935 + if (rc) {
1936 + struct net *net;
1937 +@@ -310,7 +318,6 @@ static int ip_frag_too_far(struct ipq *qp)
1938 +
1939 + static int ip_frag_reinit(struct ipq *qp)
1940 + {
1941 +- struct sk_buff *fp;
1942 + unsigned int sum_truesize = 0;
1943 +
1944 + if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
1945 +@@ -318,21 +325,16 @@ static int ip_frag_reinit(struct ipq *qp)
1946 + return -ETIMEDOUT;
1947 + }
1948 +
1949 +- fp = qp->q.fragments;
1950 +- do {
1951 +- struct sk_buff *xp = fp->next;
1952 +-
1953 +- sum_truesize += fp->truesize;
1954 +- kfree_skb(fp);
1955 +- fp = xp;
1956 +- } while (fp);
1957 ++ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
1958 + sub_frag_mem_limit(qp->q.net, sum_truesize);
1959 +
1960 + qp->q.flags = 0;
1961 + qp->q.len = 0;
1962 + qp->q.meat = 0;
1963 + qp->q.fragments = NULL;
1964 ++ qp->q.rb_fragments = RB_ROOT;
1965 + qp->q.fragments_tail = NULL;
1966 ++ qp->q.last_run_head = NULL;
1967 + qp->iif = 0;
1968 + qp->ecn = 0;
1969 +
1970 +@@ -342,11 +344,13 @@ static int ip_frag_reinit(struct ipq *qp)
1971 + /* Add new segment to existing queue. */
1972 + static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
1973 + {
1974 +- struct sk_buff *prev, *next;
1975 ++ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
1976 ++ struct rb_node **rbn, *parent;
1977 ++ struct sk_buff *skb1, *prev_tail;
1978 ++ int ihl, end, skb1_run_end;
1979 + struct net_device *dev;
1980 + unsigned int fragsize;
1981 + int flags, offset;
1982 +- int ihl, end;
1983 + int err = -ENOENT;
1984 + u8 ecn;
1985 +
1986 +@@ -405,94 +409,68 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
1987 + if (err)
1988 + goto err;
1989 +
1990 +- /* Find out which fragments are in front and at the back of us
1991 +- * in the chain of fragments so far. We must know where to put
1992 +- * this fragment, right?
1993 +- */
1994 +- prev = qp->q.fragments_tail;
1995 +- if (!prev || FRAG_CB(prev)->offset < offset) {
1996 +- next = NULL;
1997 +- goto found;
1998 +- }
1999 +- prev = NULL;
2000 +- for (next = qp->q.fragments; next != NULL; next = next->next) {
2001 +- if (FRAG_CB(next)->offset >= offset)
2002 +- break; /* bingo! */
2003 +- prev = next;
2004 +- }
2005 +-
2006 +-found:
2007 +- /* We found where to put this one. Check for overlap with
2008 +- * preceding fragment, and, if needed, align things so that
2009 +- * any overlaps are eliminated.
2010 ++ /* Note : skb->rbnode and skb->dev share the same location. */
2011 ++ dev = skb->dev;
2012 ++ /* Make sure the compiler won't do silly aliasing games */
2013 ++ barrier();
2014 ++
2015 ++ /* RFC5722, Section 4, amended by Errata ID : 3089
2016 ++ * When reassembling an IPv6 datagram, if
2017 ++ * one or more its constituent fragments is determined to be an
2018 ++ * overlapping fragment, the entire datagram (and any constituent
2019 ++ * fragments) MUST be silently discarded.
2020 ++ *
2021 ++ * We do the same here for IPv4 (and increment an snmp counter) but
2022 ++ * we do not want to drop the whole queue in response to a duplicate
2023 ++ * fragment.
2024 + */
2025 +- if (prev) {
2026 +- int i = (FRAG_CB(prev)->offset + prev->len) - offset;
2027 +-
2028 +- if (i > 0) {
2029 +- offset += i;
2030 +- err = -EINVAL;
2031 +- if (end <= offset)
2032 +- goto err;
2033 +- err = -ENOMEM;
2034 +- if (!pskb_pull(skb, i))
2035 +- goto err;
2036 +- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
2037 +- skb->ip_summed = CHECKSUM_NONE;
2038 +- }
2039 +- }
2040 +
2041 +- err = -ENOMEM;
2042 +-
2043 +- while (next && FRAG_CB(next)->offset < end) {
2044 +- int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
2045 +-
2046 +- if (i < next->len) {
2047 +- /* Eat head of the next overlapped fragment
2048 +- * and leave the loop. The next ones cannot overlap.
2049 +- */
2050 +- if (!pskb_pull(next, i))
2051 +- goto err;
2052 +- FRAG_CB(next)->offset += i;
2053 +- qp->q.meat -= i;
2054 +- if (next->ip_summed != CHECKSUM_UNNECESSARY)
2055 +- next->ip_summed = CHECKSUM_NONE;
2056 +- break;
2057 +- } else {
2058 +- struct sk_buff *free_it = next;
2059 +-
2060 +- /* Old fragment is completely overridden with
2061 +- * new one drop it.
2062 +- */
2063 +- next = next->next;
2064 +-
2065 +- if (prev)
2066 +- prev->next = next;
2067 ++ err = -EINVAL;
2068 ++ /* Find out where to put this fragment. */
2069 ++ prev_tail = qp->q.fragments_tail;
2070 ++ if (!prev_tail)
2071 ++ ip4_frag_create_run(&qp->q, skb); /* First fragment. */
2072 ++ else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
2073 ++ /* This is the common case: skb goes to the end. */
2074 ++ /* Detect and discard overlaps. */
2075 ++ if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
2076 ++ goto discard_qp;
2077 ++ if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
2078 ++ ip4_frag_append_to_last_run(&qp->q, skb);
2079 ++ else
2080 ++ ip4_frag_create_run(&qp->q, skb);
2081 ++ } else {
2082 ++ /* Binary search. Note that skb can become the first fragment,
2083 ++ * but not the last (covered above).
2084 ++ */
2085 ++ rbn = &qp->q.rb_fragments.rb_node;
2086 ++ do {
2087 ++ parent = *rbn;
2088 ++ skb1 = rb_to_skb(parent);
2089 ++ skb1_run_end = skb1->ip_defrag_offset +
2090 ++ FRAG_CB(skb1)->frag_run_len;
2091 ++ if (end <= skb1->ip_defrag_offset)
2092 ++ rbn = &parent->rb_left;
2093 ++ else if (offset >= skb1_run_end)
2094 ++ rbn = &parent->rb_right;
2095 ++ else if (offset >= skb1->ip_defrag_offset &&
2096 ++ end <= skb1_run_end)
2097 ++ goto err; /* No new data, potential duplicate */
2098 + else
2099 +- qp->q.fragments = next;
2100 +-
2101 +- qp->q.meat -= free_it->len;
2102 +- sub_frag_mem_limit(qp->q.net, free_it->truesize);
2103 +- kfree_skb(free_it);
2104 +- }
2105 ++ goto discard_qp; /* Found an overlap */
2106 ++ } while (*rbn);
2107 ++ /* Here we have parent properly set, and rbn pointing to
2108 ++ * one of its NULL left/right children. Insert skb.
2109 ++ */
2110 ++ ip4_frag_init_run(skb);
2111 ++ rb_link_node(&skb->rbnode, parent, rbn);
2112 ++ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
2113 + }
2114 +
2115 +- FRAG_CB(skb)->offset = offset;
2116 +-
2117 +- /* Insert this fragment in the chain of fragments. */
2118 +- skb->next = next;
2119 +- if (!next)
2120 +- qp->q.fragments_tail = skb;
2121 +- if (prev)
2122 +- prev->next = skb;
2123 +- else
2124 +- qp->q.fragments = skb;
2125 +-
2126 +- dev = skb->dev;
2127 +- if (dev) {
2128 ++ if (dev)
2129 + qp->iif = dev->ifindex;
2130 +- skb->dev = NULL;
2131 +- }
2132 ++ skb->ip_defrag_offset = offset;
2133 ++
2134 + qp->q.stamp = skb->tstamp;
2135 + qp->q.meat += skb->len;
2136 + qp->ecn |= ecn;
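
The rb-tree walk above reduces fragment placement to an interval comparison between the new fragment [offset, end) and the run [skb1->ip_defrag_offset, skb1_run_end): entirely before descends left, entirely after descends right, fully contained means no new data (only the skb is dropped), and any partial overlap discards the whole queue, per the RFC 5722 behaviour quoted in the comment. A standalone sketch of that classification, with invented names and example offsets chosen for a 1480-byte first run:

#include <stdio.h>

enum placement { GO_LEFT, GO_RIGHT, DUPLICATE, OVERLAP };

/* New fragment covers [offset, end); the run found in the tree covers
 * [run_start, run_end). Classify where the new fragment belongs.
 */
static enum placement classify(int offset, int end, int run_start, int run_end)
{
        if (end <= run_start)
                return GO_LEFT;         /* entirely before the run */
        if (offset >= run_end)
                return GO_RIGHT;        /* entirely after the run */
        if (offset >= run_start && end <= run_end)
                return DUPLICATE;       /* no new data, drop the skb only */
        return OVERLAP;                 /* partial overlap, drop the queue */
}

int main(void)
{
        /* Existing run covers bytes [0, 1480). */
        printf("%d\n", classify(1480, 2960, 0, 1480));  /* 1: GO_RIGHT */
        printf("%d\n", classify(0, 1480, 0, 1480));     /* 2: DUPLICATE */
        printf("%d\n", classify(1000, 2000, 0, 1480));  /* 3: OVERLAP */
        return 0;
}
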
2137 +@@ -514,7 +492,7 @@ found:
2138 + unsigned long orefdst = skb->_skb_refdst;
2139 +
2140 + skb->_skb_refdst = 0UL;
2141 +- err = ip_frag_reasm(qp, prev, dev);
2142 ++ err = ip_frag_reasm(qp, skb, prev_tail, dev);
2143 + skb->_skb_refdst = orefdst;
2144 + return err;
2145 + }
2146 +@@ -522,20 +500,23 @@ found:
2147 + skb_dst_drop(skb);
2148 + return -EINPROGRESS;
2149 +
2150 ++discard_qp:
2151 ++ inet_frag_kill(&qp->q);
2152 ++ IP_INC_STATS_BH(net, IPSTATS_MIB_REASM_OVERLAPS);
2153 + err:
2154 + kfree_skb(skb);
2155 + return err;
2156 + }
2157 +
2158 +-
2159 + /* Build a new IP datagram from all its fragments. */
2160 +-
2161 +-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
2162 +- struct net_device *dev)
2163 ++static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
2164 ++ struct sk_buff *prev_tail, struct net_device *dev)
2165 + {
2166 + struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
2167 + struct iphdr *iph;
2168 +- struct sk_buff *fp, *head = qp->q.fragments;
2169 ++ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
2170 ++ struct sk_buff **nextp; /* To build frag_list. */
2171 ++ struct rb_node *rbn;
2172 + int len;
2173 + int ihlen;
2174 + int err;
2175 +@@ -549,26 +530,27 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
2176 + goto out_fail;
2177 + }
2178 + /* Make the one we just received the head. */
2179 +- if (prev) {
2180 +- head = prev->next;
2181 +- fp = skb_clone(head, GFP_ATOMIC);
2182 ++ if (head != skb) {
2183 ++ fp = skb_clone(skb, GFP_ATOMIC);
2184 + if (!fp)
2185 + goto out_nomem;
2186 +-
2187 +- fp->next = head->next;
2188 +- if (!fp->next)
2189 ++ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
2190 ++ if (RB_EMPTY_NODE(&skb->rbnode))
2191 ++ FRAG_CB(prev_tail)->next_frag = fp;
2192 ++ else
2193 ++ rb_replace_node(&skb->rbnode, &fp->rbnode,
2194 ++ &qp->q.rb_fragments);
2195 ++ if (qp->q.fragments_tail == skb)
2196 + qp->q.fragments_tail = fp;
2197 +- prev->next = fp;
2198 +-
2199 +- skb_morph(head, qp->q.fragments);
2200 +- head->next = qp->q.fragments->next;
2201 +-
2202 +- consume_skb(qp->q.fragments);
2203 +- qp->q.fragments = head;
2204 ++ skb_morph(skb, head);
2205 ++ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
2206 ++ rb_replace_node(&head->rbnode, &skb->rbnode,
2207 ++ &qp->q.rb_fragments);
2208 ++ consume_skb(head);
2209 ++ head = skb;
2210 + }
2211 +
2212 +- WARN_ON(!head);
2213 +- WARN_ON(FRAG_CB(head)->offset != 0);
2214 ++ WARN_ON(head->ip_defrag_offset != 0);
2215 +
2216 + /* Allocate a new buffer for the datagram. */
2217 + ihlen = ip_hdrlen(head);
2218 +@@ -592,35 +574,61 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
2219 + clone = alloc_skb(0, GFP_ATOMIC);
2220 + if (!clone)
2221 + goto out_nomem;
2222 +- clone->next = head->next;
2223 +- head->next = clone;
2224 + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
2225 + skb_frag_list_init(head);
2226 + for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
2227 + plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
2228 + clone->len = clone->data_len = head->data_len - plen;
2229 +- head->data_len -= clone->len;
2230 +- head->len -= clone->len;
2231 ++ head->truesize += clone->truesize;
2232 + clone->csum = 0;
2233 + clone->ip_summed = head->ip_summed;
2234 + add_frag_mem_limit(qp->q.net, clone->truesize);
2235 ++ skb_shinfo(head)->frag_list = clone;
2236 ++ nextp = &clone->next;
2237 ++ } else {
2238 ++ nextp = &skb_shinfo(head)->frag_list;
2239 + }
2240 +
2241 +- skb_shinfo(head)->frag_list = head->next;
2242 + skb_push(head, head->data - skb_network_header(head));
2243 +
2244 +- for (fp=head->next; fp; fp = fp->next) {
2245 +- head->data_len += fp->len;
2246 +- head->len += fp->len;
2247 +- if (head->ip_summed != fp->ip_summed)
2248 +- head->ip_summed = CHECKSUM_NONE;
2249 +- else if (head->ip_summed == CHECKSUM_COMPLETE)
2250 +- head->csum = csum_add(head->csum, fp->csum);
2251 +- head->truesize += fp->truesize;
2252 ++ /* Traverse the tree in order, to build frag_list. */
2253 ++ fp = FRAG_CB(head)->next_frag;
2254 ++ rbn = rb_next(&head->rbnode);
2255 ++ rb_erase(&head->rbnode, &qp->q.rb_fragments);
2256 ++ while (rbn || fp) {
2257 ++ /* fp points to the next sk_buff in the current run;
2258 ++ * rbn points to the next run.
2259 ++ */
2260 ++ /* Go through the current run. */
2261 ++ while (fp) {
2262 ++ *nextp = fp;
2263 ++ nextp = &fp->next;
2264 ++ fp->prev = NULL;
2265 ++ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
2266 ++ fp->sk = NULL;
2267 ++ head->data_len += fp->len;
2268 ++ head->len += fp->len;
2269 ++ if (head->ip_summed != fp->ip_summed)
2270 ++ head->ip_summed = CHECKSUM_NONE;
2271 ++ else if (head->ip_summed == CHECKSUM_COMPLETE)
2272 ++ head->csum = csum_add(head->csum, fp->csum);
2273 ++ head->truesize += fp->truesize;
2274 ++ fp = FRAG_CB(fp)->next_frag;
2275 ++ }
2276 ++ /* Move to the next run. */
2277 ++ if (rbn) {
2278 ++ struct rb_node *rbnext = rb_next(rbn);
2279 ++
2280 ++ fp = rb_to_skb(rbn);
2281 ++ rb_erase(rbn, &qp->q.rb_fragments);
2282 ++ rbn = rbnext;
2283 ++ }
2284 + }
2285 + sub_frag_mem_limit(qp->q.net, head->truesize);
2286 +
2287 ++ *nextp = NULL;
2288 + head->next = NULL;
2289 ++ head->prev = NULL;
2290 + head->dev = dev;
2291 + head->tstamp = qp->q.stamp;
2292 + IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
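
The reassembly loop above flattens the tree of runs into the head skb's frag_list with a pointer-to-pointer tail (nextp): the outer walk follows rb_next() from run to run, the inner walk follows next_frag within a run, and every node is appended in O(1) without re-scanning for the tail. A small userspace sketch of that idiom under the same shape (next_in_run stands in for next_frag; all names are illustrative):

#include <stdio.h>

struct node {
        int val;
        struct node *next_in_run;       /* intra-run link, like next_frag */
        struct node *next;              /* flat output list, built below */
};

int main(void)
{
        /* Two runs, {1,2} and {3,4}, not yet linked to each other. */
        struct node n4 = { 4, NULL, NULL }, n3 = { 3, &n4, NULL };
        struct node n2 = { 2, NULL, NULL }, n1 = { 1, &n2, NULL };
        struct node *run_heads[] = { &n1, &n3, NULL };
        struct node *head = NULL, **nextp = &head;      /* tail link pointer */

        for (int i = 0; run_heads[i]; i++)              /* walk run by run */
                for (struct node *n = run_heads[i]; n; n = n->next_in_run) {
                        *nextp = n;     /* O(1) append, no tail re-scan */
                        nextp = &n->next;
                }
        *nextp = NULL;                                  /* terminate the list */

        for (struct node *n = head; n; n = n->next)
                printf("%d ", n->val);                  /* prints: 1 2 3 4 */
        printf("\n");
        return 0;
}
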
2293 +@@ -648,7 +656,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
2294 +
2295 + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
2296 + qp->q.fragments = NULL;
2297 ++ qp->q.rb_fragments = RB_ROOT;
2298 + qp->q.fragments_tail = NULL;
2299 ++ qp->q.last_run_head = NULL;
2300 + return 0;
2301 +
2302 + out_nomem:
2303 +@@ -656,7 +666,7 @@ out_nomem:
2304 + err = -ENOMEM;
2305 + goto out_fail;
2306 + out_oversize:
2307 +- net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
2308 ++ net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
2309 + out_fail:
2310 + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
2311 + return err;
2312 +@@ -734,25 +744,46 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
2313 + }
2314 + EXPORT_SYMBOL(ip_check_defrag);
2315 +
2316 ++unsigned int inet_frag_rbtree_purge(struct rb_root *root)
2317 ++{
2318 ++ struct rb_node *p = rb_first(root);
2319 ++ unsigned int sum = 0;
2320 ++
2321 ++ while (p) {
2322 ++ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
2323 ++
2324 ++ p = rb_next(p);
2325 ++ rb_erase(&skb->rbnode, root);
2326 ++ while (skb) {
2327 ++ struct sk_buff *next = FRAG_CB(skb)->next_frag;
2328 ++
2329 ++ sum += skb->truesize;
2330 ++ kfree_skb(skb);
2331 ++ skb = next;
2332 ++ }
2333 ++ }
2334 ++ return sum;
2335 ++}
2336 ++EXPORT_SYMBOL(inet_frag_rbtree_purge);
2337 ++
2338 + #ifdef CONFIG_SYSCTL
2339 +-static int zero;
2340 ++static int dist_min;
2341 +
2342 + static struct ctl_table ip4_frags_ns_ctl_table[] = {
2343 + {
2344 + .procname = "ipfrag_high_thresh",
2345 + .data = &init_net.ipv4.frags.high_thresh,
2346 +- .maxlen = sizeof(int),
2347 ++ .maxlen = sizeof(unsigned long),
2348 + .mode = 0644,
2349 +- .proc_handler = proc_dointvec_minmax,
2350 ++ .proc_handler = proc_doulongvec_minmax,
2351 + .extra1 = &init_net.ipv4.frags.low_thresh
2352 + },
2353 + {
2354 + .procname = "ipfrag_low_thresh",
2355 + .data = &init_net.ipv4.frags.low_thresh,
2356 +- .maxlen = sizeof(int),
2357 ++ .maxlen = sizeof(unsigned long),
2358 + .mode = 0644,
2359 +- .proc_handler = proc_dointvec_minmax,
2360 +- .extra1 = &zero,
2361 ++ .proc_handler = proc_doulongvec_minmax,
2362 + .extra2 = &init_net.ipv4.frags.high_thresh
2363 + },
2364 + {
2365 +@@ -781,7 +812,7 @@ static struct ctl_table ip4_frags_ctl_table[] = {
2366 + .maxlen = sizeof(int),
2367 + .mode = 0644,
2368 + .proc_handler = proc_dointvec_minmax,
2369 +- .extra1 = &zero
2370 ++ .extra1 = &dist_min,
2371 + },
2372 + { }
2373 + };
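
The sysctl entries above widen the backing storage for the fragment thresholds, and .maxlen and .proc_handler move in the same hunk: a handler that still treats the value as an int would read or write only part of the now-unsigned-long variable. The sketch below illustrates that coupling with a toy table entry; it is not the kernel's ctl_table machinery, and the printed results assume an LP64 little-endian machine.

#include <stdio.h>
#include <string.h>

/* Toy stand-in for a table entry: backing storage plus the size the
 * write handler is allowed to touch.
 */
struct entry {
        void *data;
        size_t maxlen;
};

static void write_value(struct entry *e, const void *src, size_t len)
{
        memcpy(e->data, src, len < e->maxlen ? len : e->maxlen);
}

int main(void)
{
        unsigned long thresh = 0;
        unsigned long wanted = 1UL << 32;       /* needs more than 32 bits */

        /* Stale: maxlen still describes an int, so only 4 bytes change. */
        struct entry stale = { &thresh, sizeof(int) };
        write_value(&stale, &wanted, sizeof(wanted));
        printf("maxlen = sizeof(int):           %lu\n", thresh);

        /* Fixed: maxlen matches the widened unsigned long. */
        thresh = 0;
        struct entry fixed = { &thresh, sizeof(unsigned long) };
        write_value(&fixed, &wanted, sizeof(wanted));
        printf("maxlen = sizeof(unsigned long): %lu\n", thresh);
        return 0;
}
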
2374 +@@ -853,6 +884,8 @@ static void __init ip4_frags_ctl_register(void)
2375 +
2376 + static int __net_init ipv4_frags_init_net(struct net *net)
2377 + {
2378 ++ int res;
2379 ++
2380 + /* Fragment cache limits.
2381 + *
2382 + * The fragment memory accounting code, (tries to) account for
2383 +@@ -876,15 +909,21 @@ static int __net_init ipv4_frags_init_net(struct net *net)
2384 + */
2385 + net->ipv4.frags.timeout = IP_FRAG_TIME;
2386 +
2387 +- inet_frags_init_net(&net->ipv4.frags);
2388 ++ net->ipv4.frags.f = &ip4_frags;
2389 +
2390 +- return ip4_frags_ns_ctl_register(net);
2391 ++ res = inet_frags_init_net(&net->ipv4.frags);
2392 ++ if (res < 0)
2393 ++ return res;
2394 ++ res = ip4_frags_ns_ctl_register(net);
2395 ++ if (res < 0)
2396 ++ inet_frags_exit_net(&net->ipv4.frags);
2397 ++ return res;
2398 + }
2399 +
2400 + static void __net_exit ipv4_frags_exit_net(struct net *net)
2401 + {
2402 + ip4_frags_ns_ctl_unregister(net);
2403 +- inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
2404 ++ inet_frags_exit_net(&net->ipv4.frags);
2405 + }
2406 +
2407 + static struct pernet_operations ip4_frags_ops = {
2408 +@@ -892,18 +931,50 @@ static struct pernet_operations ip4_frags_ops = {
2409 + .exit = ipv4_frags_exit_net,
2410 + };
2411 +
2412 ++
2413 ++static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
2414 ++{
2415 ++ return jhash2(data,
2416 ++ sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
2417 ++}
2418 ++
2419 ++static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
2420 ++{
2421 ++ const struct inet_frag_queue *fq = data;
2422 ++
2423 ++ return jhash2((const u32 *)&fq->key.v4,
2424 ++ sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
2425 ++}
2426 ++
2427 ++static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
2428 ++{
2429 ++ const struct frag_v4_compare_key *key = arg->key;
2430 ++ const struct inet_frag_queue *fq = ptr;
2431 ++
2432 ++ return !!memcmp(&fq->key, key, sizeof(*key));
2433 ++}
2434 ++
2435 ++static const struct rhashtable_params ip4_rhash_params = {
2436 ++ .head_offset = offsetof(struct inet_frag_queue, node),
2437 ++ .key_offset = offsetof(struct inet_frag_queue, key),
2438 ++ .key_len = sizeof(struct frag_v4_compare_key),
2439 ++ .hashfn = ip4_key_hashfn,
2440 ++ .obj_hashfn = ip4_obj_hashfn,
2441 ++ .obj_cmpfn = ip4_obj_cmpfn,
2442 ++ .automatic_shrinking = true,
2443 ++};
2444 ++
2445 + void __init ipfrag_init(void)
2446 + {
2447 +- ip4_frags_ctl_register();
2448 +- register_pernet_subsys(&ip4_frags_ops);
2449 +- ip4_frags.hashfn = ip4_hashfn;
2450 + ip4_frags.constructor = ip4_frag_init;
2451 + ip4_frags.destructor = ip4_frag_free;
2452 + ip4_frags.skb_free = NULL;
2453 + ip4_frags.qsize = sizeof(struct ipq);
2454 +- ip4_frags.match = ip4_frag_match;
2455 + ip4_frags.frag_expire = ip_expire;
2456 + ip4_frags.frags_cache_name = ip_frag_cache_name;
2457 ++ ip4_frags.rhash_params = ip4_rhash_params;
2458 + if (inet_frags_init(&ip4_frags))
2459 + panic("IP: failed to allocate ip4_frags cache\n");
2460 ++ ip4_frags_ctl_register();
2461 ++ register_pernet_subsys(&ip4_frags_ops);
2462 + }
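
With the move to a rhashtable, lookups above hash and compare the whole frag_v4_compare_key: ip4_key_hashfn()/ip4_obj_hashfn() feed the key to jhash2() as an array of 32-bit words, and ip4_obj_cmpfn() is a plain memcmp(). The userspace sketch below shows the word-by-word idea with a stand-in mixer; the struct layout and mix32() are assumptions for the example and deliberately not jhash2().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct v4_key {                 /* assumed layout, for illustration only */
        uint32_t saddr;
        uint32_t daddr;
        uint32_t user;
        uint32_t vif;
        uint16_t id;
        uint16_t protocol;
};

_Static_assert(sizeof(struct v4_key) % sizeof(uint32_t) == 0,
               "key must be a whole number of 32-bit words");

/* Stand-in mixer; the kernel feeds the words to jhash2() with a seed. */
static uint32_t mix32(uint32_t h, uint32_t w)
{
        h ^= w;
        h *= 0x9e3779b1u;
        return h ^ (h >> 16);
}

static uint32_t key_hash(const struct v4_key *key, uint32_t seed)
{
        const unsigned char *p = (const unsigned char *)key;
        size_t n = sizeof(*key) / sizeof(uint32_t);
        uint32_t h = seed, w;

        while (n--) {
                memcpy(&w, p, sizeof(w));       /* one 32-bit word at a time */
                p += sizeof(w);
                h = mix32(h, w);
        }
        return h;
}

int main(void)
{
        struct v4_key k = { .saddr = 0x0a000001, .daddr = 0x0a000002,
                            .id = 42, .protocol = 17 };

        printf("hash = %08x\n", (unsigned)key_hash(&k, 0x12345678));
        return 0;
}
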
2463 +diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
2464 +index 3abd9d7a3adf..b001ad668108 100644
2465 +--- a/net/ipv4/proc.c
2466 ++++ b/net/ipv4/proc.c
2467 +@@ -52,7 +52,6 @@
2468 + static int sockstat_seq_show(struct seq_file *seq, void *v)
2469 + {
2470 + struct net *net = seq->private;
2471 +- unsigned int frag_mem;
2472 + int orphans, sockets;
2473 +
2474 + local_bh_disable();
2475 +@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
2476 + sock_prot_inuse_get(net, &udplite_prot));
2477 + seq_printf(seq, "RAW: inuse %d\n",
2478 + sock_prot_inuse_get(net, &raw_prot));
2479 +- frag_mem = ip_frag_mem(net);
2480 +- seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
2481 ++ seq_printf(seq, "FRAG: inuse %u memory %lu\n",
2482 ++ atomic_read(&net->ipv4.frags.rhashtable.nelems),
2483 ++ frag_mem_limit(&net->ipv4.frags));
2484 + return 0;
2485 + }
2486 +
2487 +@@ -132,6 +132,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
2488 + SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
2489 + SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
2490 + SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
2491 ++ SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
2492 + SNMP_MIB_SENTINEL
2493 + };
2494 +
2495 +diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
2496 +index 5a9ae56e7868..664c84e47bab 100644
2497 +--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
2498 ++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
2499 +@@ -64,7 +64,6 @@ struct nf_ct_frag6_skb_cb
2500 + static struct inet_frags nf_frags;
2501 +
2502 + #ifdef CONFIG_SYSCTL
2503 +-static int zero;
2504 +
2505 + static struct ctl_table nf_ct_frag6_sysctl_table[] = {
2506 + {
2507 +@@ -77,18 +76,17 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
2508 + {
2509 + .procname = "nf_conntrack_frag6_low_thresh",
2510 + .data = &init_net.nf_frag.frags.low_thresh,
2511 +- .maxlen = sizeof(unsigned int),
2512 ++ .maxlen = sizeof(unsigned long),
2513 + .mode = 0644,
2514 +- .proc_handler = proc_dointvec_minmax,
2515 +- .extra1 = &zero,
2516 ++ .proc_handler = proc_doulongvec_minmax,
2517 + .extra2 = &init_net.nf_frag.frags.high_thresh
2518 + },
2519 + {
2520 + .procname = "nf_conntrack_frag6_high_thresh",
2521 + .data = &init_net.nf_frag.frags.high_thresh,
2522 +- .maxlen = sizeof(unsigned int),
2523 ++ .maxlen = sizeof(unsigned long),
2524 + .mode = 0644,
2525 +- .proc_handler = proc_dointvec_minmax,
2526 ++ .proc_handler = proc_doulongvec_minmax,
2527 + .extra1 = &init_net.nf_frag.frags.low_thresh
2528 + },
2529 + { }
2530 +@@ -153,23 +151,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
2531 + return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
2532 + }
2533 +
2534 +-static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
2535 +- const struct in6_addr *daddr)
2536 +-{
2537 +- net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
2538 +- return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
2539 +- (__force u32)id, nf_frags.rnd);
2540 +-}
2541 +-
2542 +-
2543 +-static unsigned int nf_hashfn(const struct inet_frag_queue *q)
2544 +-{
2545 +- const struct frag_queue *nq;
2546 +-
2547 +- nq = container_of(q, struct frag_queue, q);
2548 +- return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
2549 +-}
2550 +-
2551 + static void nf_skb_free(struct sk_buff *skb)
2552 + {
2553 + if (NFCT_FRAG6_CB(skb)->orig)
2554 +@@ -184,34 +165,26 @@ static void nf_ct_frag6_expire(unsigned long data)
2555 + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
2556 + net = container_of(fq->q.net, struct net, nf_frag.frags);
2557 +
2558 +- ip6_expire_frag_queue(net, fq, &nf_frags);
2559 ++ ip6_expire_frag_queue(net, fq);
2560 + }
2561 +
2562 + /* Creation primitives. */
2563 +-static inline struct frag_queue *fq_find(struct net *net, __be32 id,
2564 +- u32 user, struct in6_addr *src,
2565 +- struct in6_addr *dst, int iif, u8 ecn)
2566 ++static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
2567 ++ const struct ipv6hdr *hdr, int iif)
2568 + {
2569 ++ struct frag_v6_compare_key key = {
2570 ++ .id = id,
2571 ++ .saddr = hdr->saddr,
2572 ++ .daddr = hdr->daddr,
2573 ++ .user = user,
2574 ++ .iif = iif,
2575 ++ };
2576 + struct inet_frag_queue *q;
2577 +- struct ip6_create_arg arg;
2578 +- unsigned int hash;
2579 +-
2580 +- arg.id = id;
2581 +- arg.user = user;
2582 +- arg.src = src;
2583 +- arg.dst = dst;
2584 +- arg.iif = iif;
2585 +- arg.ecn = ecn;
2586 +-
2587 +- local_bh_disable();
2588 +- hash = nf_hash_frag(id, src, dst);
2589 +-
2590 +- q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
2591 +- local_bh_enable();
2592 +- if (IS_ERR_OR_NULL(q)) {
2593 +- inet_frag_maybe_warn_overflow(q, pr_fmt());
2594 ++
2595 ++ q = inet_frag_find(&net->nf_frag.frags, &key);
2596 ++ if (!q)
2597 + return NULL;
2598 +- }
2599 ++
2600 + return container_of(q, struct frag_queue, q);
2601 + }
2602 +
2603 +@@ -362,7 +335,7 @@ found:
2604 + return 0;
2605 +
2606 + discard_fq:
2607 +- inet_frag_kill(&fq->q, &nf_frags);
2608 ++ inet_frag_kill(&fq->q);
2609 + err:
2610 + return -1;
2611 + }
2612 +@@ -383,7 +356,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
2613 + int payload_len;
2614 + u8 ecn;
2615 +
2616 +- inet_frag_kill(&fq->q, &nf_frags);
2617 ++ inet_frag_kill(&fq->q);
2618 +
2619 + WARN_ON(head == NULL);
2620 + WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
2621 +@@ -454,6 +427,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
2622 + else if (head->ip_summed == CHECKSUM_COMPLETE)
2623 + head->csum = csum_add(head->csum, fp->csum);
2624 + head->truesize += fp->truesize;
2625 ++ fp->sk = NULL;
2626 + }
2627 + sub_frag_mem_limit(fq->q.net, head->truesize);
2628 +
2629 +@@ -472,6 +446,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
2630 + head->csum);
2631 +
2632 + fq->q.fragments = NULL;
2633 ++ fq->q.rb_fragments = RB_ROOT;
2634 + fq->q.fragments_tail = NULL;
2635 +
2636 + /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
2637 +@@ -601,9 +576,13 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use
2638 + hdr = ipv6_hdr(clone);
2639 + fhdr = (struct frag_hdr *)skb_transport_header(clone);
2640 +
2641 ++ if (clone->len - skb_network_offset(clone) < IPV6_MIN_MTU &&
2642 ++ fhdr->frag_off & htons(IP6_MF))
2643 ++ goto ret_orig;
2644 ++
2645 + skb_orphan(skb);
2646 +- fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
2647 +- skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
2648 ++ fq = fq_find(net, fhdr->identification, user, hdr,
2649 ++ skb->dev ? skb->dev->ifindex : 0);
2650 + if (fq == NULL) {
2651 + pr_debug("Can't find and can't create new queue\n");
2652 + goto ret_orig;
2653 +@@ -614,7 +593,7 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use
2654 + if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
2655 + spin_unlock_bh(&fq->q.lock);
2656 + pr_debug("Can't insert skb to queue\n");
2657 +- inet_frag_put(&fq->q, &nf_frags);
2658 ++ inet_frag_put(&fq->q);
2659 + goto ret_orig;
2660 + }
2661 +
2662 +@@ -626,7 +605,7 @@ struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 use
2663 + }
2664 + spin_unlock_bh(&fq->q.lock);
2665 +
2666 +- inet_frag_put(&fq->q, &nf_frags);
2667 ++ inet_frag_put(&fq->q);
2668 + return ret_skb;
2669 +
2670 + ret_orig:
2671 +@@ -650,18 +629,26 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_orig);
2672 +
2673 + static int nf_ct_net_init(struct net *net)
2674 + {
2675 ++ int res;
2676 ++
2677 + net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
2678 + net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
2679 + net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
2680 +- inet_frags_init_net(&net->nf_frag.frags);
2681 +-
2682 +- return nf_ct_frag6_sysctl_register(net);
2683 ++ net->nf_frag.frags.f = &nf_frags;
2684 ++
2685 ++ res = inet_frags_init_net(&net->nf_frag.frags);
2686 ++ if (res < 0)
2687 ++ return res;
2688 ++ res = nf_ct_frag6_sysctl_register(net);
2689 ++ if (res < 0)
2690 ++ inet_frags_exit_net(&net->nf_frag.frags);
2691 ++ return res;
2692 + }
2693 +
2694 + static void nf_ct_net_exit(struct net *net)
2695 + {
2696 + nf_ct_frags6_sysctl_unregister(net);
2697 +- inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
2698 ++ inet_frags_exit_net(&net->nf_frag.frags);
2699 + }
2700 +
2701 + static struct pernet_operations nf_ct_net_ops = {
2702 +@@ -673,14 +660,13 @@ int nf_ct_frag6_init(void)
2703 + {
2704 + int ret = 0;
2705 +
2706 +- nf_frags.hashfn = nf_hashfn;
2707 + nf_frags.constructor = ip6_frag_init;
2708 + nf_frags.destructor = NULL;
2709 + nf_frags.skb_free = nf_skb_free;
2710 + nf_frags.qsize = sizeof(struct frag_queue);
2711 +- nf_frags.match = ip6_frag_match;
2712 + nf_frags.frag_expire = nf_ct_frag6_expire;
2713 + nf_frags.frags_cache_name = nf_frags_cache_name;
2714 ++ nf_frags.rhash_params = ip6_rhash_params;
2715 + ret = inet_frags_init(&nf_frags);
2716 + if (ret)
2717 + goto out;
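
nf_ct_net_init() above now follows the same two-step pattern as ipv4_frags_init_net(): bring up the inet_frags state first, register the sysctls second, and unwind step one if step two fails, so a half-initialized namespace is never left behind. A minimal sketch of that unwind-on-failure shape (the frob_* names are placeholders, not kernel APIs):

#include <errno.h>
#include <stdio.h>

static int frob_core_init(void)   { return 0; }
static void frob_core_exit(void)  { }
static int frob_sysctl_init(void) { return -ENOMEM; }  /* simulate a failure */

static int frob_init(void)
{
        int res;

        res = frob_core_init();
        if (res < 0)
                return res;             /* nothing to unwind yet */

        res = frob_sysctl_init();
        if (res < 0)
                frob_core_exit();       /* undo step one on failure */
        return res;
}

int main(void)
{
        printf("frob_init() = %d\n", frob_init());      /* negative errno */
        return 0;
}
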
2718 +diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
2719 +index 679253d0af84..73e766e7bc37 100644
2720 +--- a/net/ipv6/proc.c
2721 ++++ b/net/ipv6/proc.c
2722 +@@ -33,7 +33,6 @@
2723 + static int sockstat6_seq_show(struct seq_file *seq, void *v)
2724 + {
2725 + struct net *net = seq->private;
2726 +- unsigned int frag_mem = ip6_frag_mem(net);
2727 +
2728 + seq_printf(seq, "TCP6: inuse %d\n",
2729 + sock_prot_inuse_get(net, &tcpv6_prot));
2730 +@@ -43,7 +42,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
2731 + sock_prot_inuse_get(net, &udplitev6_prot));
2732 + seq_printf(seq, "RAW6: inuse %d\n",
2733 + sock_prot_inuse_get(net, &rawv6_prot));
2734 +- seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
2735 ++ seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
2736 ++ atomic_read(&net->ipv6.frags.rhashtable.nelems),
2737 ++ frag_mem_limit(&net->ipv6.frags));
2738 + return 0;
2739 + }
2740 +
2741 +diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
2742 +index 58f2139ebb5e..ec917f58d105 100644
2743 +--- a/net/ipv6/reassembly.c
2744 ++++ b/net/ipv6/reassembly.c
2745 +@@ -79,94 +79,58 @@ static struct inet_frags ip6_frags;
2746 + static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
2747 + struct net_device *dev);
2748 +
2749 +-/*
2750 +- * callers should be careful not to use the hash value outside the ipfrag_lock
2751 +- * as doing so could race with ipfrag_hash_rnd being recalculated.
2752 +- */
2753 +-static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
2754 +- const struct in6_addr *daddr)
2755 +-{
2756 +- net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
2757 +- return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
2758 +- (__force u32)id, ip6_frags.rnd);
2759 +-}
2760 +-
2761 +-static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
2762 +-{
2763 +- const struct frag_queue *fq;
2764 +-
2765 +- fq = container_of(q, struct frag_queue, q);
2766 +- return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
2767 +-}
2768 +-
2769 +-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
2770 +-{
2771 +- const struct frag_queue *fq;
2772 +- const struct ip6_create_arg *arg = a;
2773 +-
2774 +- fq = container_of(q, struct frag_queue, q);
2775 +- return fq->id == arg->id &&
2776 +- fq->user == arg->user &&
2777 +- ipv6_addr_equal(&fq->saddr, arg->src) &&
2778 +- ipv6_addr_equal(&fq->daddr, arg->dst) &&
2779 +- (arg->iif == fq->iif ||
2780 +- !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
2781 +- IPV6_ADDR_LINKLOCAL)));
2782 +-}
2783 +-EXPORT_SYMBOL(ip6_frag_match);
2784 +-
2785 + void ip6_frag_init(struct inet_frag_queue *q, const void *a)
2786 + {
2787 + struct frag_queue *fq = container_of(q, struct frag_queue, q);
2788 +- const struct ip6_create_arg *arg = a;
2789 ++ const struct frag_v6_compare_key *key = a;
2790 +
2791 +- fq->id = arg->id;
2792 +- fq->user = arg->user;
2793 +- fq->saddr = *arg->src;
2794 +- fq->daddr = *arg->dst;
2795 +- fq->ecn = arg->ecn;
2796 ++ q->key.v6 = *key;
2797 ++ fq->ecn = 0;
2798 + }
2799 + EXPORT_SYMBOL(ip6_frag_init);
2800 +
2801 +-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
2802 +- struct inet_frags *frags)
2803 ++void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
2804 + {
2805 + struct net_device *dev = NULL;
2806 ++ struct sk_buff *head;
2807 +
2808 ++ rcu_read_lock();
2809 + spin_lock(&fq->q.lock);
2810 +
2811 + if (fq->q.flags & INET_FRAG_COMPLETE)
2812 + goto out;
2813 +
2814 +- inet_frag_kill(&fq->q, frags);
2815 ++ inet_frag_kill(&fq->q);
2816 +
2817 +- rcu_read_lock();
2818 + dev = dev_get_by_index_rcu(net, fq->iif);
2819 + if (!dev)
2820 +- goto out_rcu_unlock;
2821 ++ goto out;
2822 +
2823 + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
2824 +-
2825 +- if (inet_frag_evicting(&fq->q))
2826 +- goto out_rcu_unlock;
2827 +-
2828 + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
2829 +
2830 + /* Don't send error if the first segment did not arrive. */
2831 +- if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments)
2832 +- goto out_rcu_unlock;
2833 ++ head = fq->q.fragments;
2834 ++ if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
2835 ++ goto out;
2836 +
2837 + /* But use as source device on which LAST ARRIVED
2838 + * segment was received. And do not use fq->dev
2839 + * pointer directly, device might already disappeared.
2840 + */
2841 +- fq->q.fragments->dev = dev;
2842 +- icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
2843 +-out_rcu_unlock:
2844 +- rcu_read_unlock();
2845 ++ head->dev = dev;
2846 ++ skb_get(head);
2847 ++ spin_unlock(&fq->q.lock);
2848 ++
2849 ++ icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
2850 ++ kfree_skb(head);
2851 ++ goto out_rcu_unlock;
2852 ++
2853 + out:
2854 + spin_unlock(&fq->q.lock);
2855 +- inet_frag_put(&fq->q, frags);
2856 ++out_rcu_unlock:
2857 ++ rcu_read_unlock();
2858 ++ inet_frag_put(&fq->q);
2859 + }
2860 + EXPORT_SYMBOL(ip6_expire_frag_queue);
2861 +
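
The reworked ip6_expire_frag_queue() above takes an extra reference on the head skb with skb_get() so it can drop the queue spinlock before calling icmpv6_send(), then releases that reference with kfree_skb(); the queue itself is only released at the end via inet_frag_put(). A small single-threaded sketch of the hold-a-reference-across-unlock idea, using a plain counter rather than real skb refcounting (all names invented):

#include <stdio.h>

struct buf {
        int refcnt;             /* starts at 1: the queue's own reference */
        int data;
};

static void buf_get(struct buf *b)
{
        b->refcnt++;
}

static void buf_put(struct buf *b)
{
        if (--b->refcnt == 0)
                printf("freeing buf (data=%d)\n", b->data);
}

/* Lock stand-ins: this sketch is single-threaded on purpose. */
static void queue_lock(void)   { }
static void queue_unlock(void) { }

static void expire(struct buf *head)
{
        queue_lock();
        buf_get(head);          /* keep head alive once the lock is dropped */
        queue_unlock();

        /* Slow work (think: sending an ICMP error) done without the lock. */
        printf("reporting timeout for data=%d\n", head->data);

        buf_put(head);          /* drop the temporary reference */
}

int main(void)
{
        struct buf b = { .refcnt = 1, .data = 42 };

        expire(&b);
        buf_put(&b);            /* queue drops its reference: buf is freed */
        return 0;
}
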
2862 +@@ -178,31 +142,29 @@ static void ip6_frag_expire(unsigned long data)
2863 + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q);
2864 + net = container_of(fq->q.net, struct net, ipv6.frags);
2865 +
2866 +- ip6_expire_frag_queue(net, fq, &ip6_frags);
2867 ++ ip6_expire_frag_queue(net, fq);
2868 + }
2869 +
2870 + static struct frag_queue *
2871 +-fq_find(struct net *net, __be32 id, const struct in6_addr *src,
2872 +- const struct in6_addr *dst, int iif, u8 ecn)
2873 ++fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
2874 + {
2875 ++ struct frag_v6_compare_key key = {
2876 ++ .id = id,
2877 ++ .saddr = hdr->saddr,
2878 ++ .daddr = hdr->daddr,
2879 ++ .user = IP6_DEFRAG_LOCAL_DELIVER,
2880 ++ .iif = iif,
2881 ++ };
2882 + struct inet_frag_queue *q;
2883 +- struct ip6_create_arg arg;
2884 +- unsigned int hash;
2885 +
2886 +- arg.id = id;
2887 +- arg.user = IP6_DEFRAG_LOCAL_DELIVER;
2888 +- arg.src = src;
2889 +- arg.dst = dst;
2890 +- arg.iif = iif;
2891 +- arg.ecn = ecn;
2892 ++ if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
2893 ++ IPV6_ADDR_LINKLOCAL)))
2894 ++ key.iif = 0;
2895 +
2896 +- hash = inet6_hash_frag(id, src, dst);
2897 +-
2898 +- q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
2899 +- if (IS_ERR_OR_NULL(q)) {
2900 +- inet_frag_maybe_warn_overflow(q, pr_fmt());
2901 ++ q = inet_frag_find(&net->ipv6.frags, &key);
2902 ++ if (!q)
2903 + return NULL;
2904 +- }
2905 ++
2906 + return container_of(q, struct frag_queue, q);
2907 + }
2908 +
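
fq_find() above folds the old ip6_frag_match() rule into the lookup key itself: unless the destination address is multicast or link-local, key.iif is zeroed before the rhashtable lookup, so an exact byte-for-byte compare reproduces the previous "ignore iif" semantics. A tiny sketch of that canonicalize-before-exact-match idea (the struct and field names are invented for the example):

#include <stdio.h>
#include <string.h>

struct lookup_key {
        unsigned int id;
        unsigned int iif;       /* only meaningful for scoped destinations */
        int dst_is_scoped;      /* multicast or link-local destination? */
};

/* An exact-match table compares keys byte for byte, so the "ignore iif
 * unless the destination is scoped" rule is encoded by zeroing the field
 * before the lookup.
 */
static void canonicalize(struct lookup_key *k)
{
        if (!k->dst_is_scoped)
                k->iif = 0;
}

int main(void)
{
        struct lookup_key a = { .id = 7, .iif = 2, .dst_is_scoped = 0 };
        struct lookup_key b = { .id = 7, .iif = 5, .dst_is_scoped = 0 };

        canonicalize(&a);
        canonicalize(&b);
        printf("match: %d\n", !memcmp(&a, &b, sizeof(a)));      /* 1 */
        return 0;
}
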
2909 +@@ -359,7 +321,7 @@ found:
2910 + return -1;
2911 +
2912 + discard_fq:
2913 +- inet_frag_kill(&fq->q, &ip6_frags);
2914 ++ inet_frag_kill(&fq->q);
2915 + err:
2916 + IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
2917 + IPSTATS_MIB_REASMFAILS);
2918 +@@ -386,7 +348,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
2919 + int sum_truesize;
2920 + u8 ecn;
2921 +
2922 +- inet_frag_kill(&fq->q, &ip6_frags);
2923 ++ inet_frag_kill(&fq->q);
2924 +
2925 + ecn = ip_frag_ecn_table[fq->ecn];
2926 + if (unlikely(ecn == 0xff))
2927 +@@ -503,6 +465,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
2928 + IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
2929 + rcu_read_unlock();
2930 + fq->q.fragments = NULL;
2931 ++ fq->q.rb_fragments = RB_ROOT;
2932 + fq->q.fragments_tail = NULL;
2933 + return 1;
2934 +
2935 +@@ -524,6 +487,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
2936 + struct frag_queue *fq;
2937 + const struct ipv6hdr *hdr = ipv6_hdr(skb);
2938 + struct net *net = dev_net(skb_dst(skb)->dev);
2939 ++ int iif;
2940 +
2941 + if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
2942 + goto fail_hdr;
2943 +@@ -552,17 +516,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
2944 + return 1;
2945 + }
2946 +
2947 +- fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
2948 +- skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
2949 ++ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
2950 ++ fhdr->frag_off & htons(IP6_MF))
2951 ++ goto fail_hdr;
2952 ++
2953 ++ iif = skb->dev ? skb->dev->ifindex : 0;
2954 ++ fq = fq_find(net, fhdr->identification, hdr, iif);
2955 + if (fq) {
2956 + int ret;
2957 +
2958 + spin_lock(&fq->q.lock);
2959 +
2960 ++ fq->iif = iif;
2961 + ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
2962 +
2963 + spin_unlock(&fq->q.lock);
2964 +- inet_frag_put(&fq->q, &ip6_frags);
2965 ++ inet_frag_put(&fq->q);
2966 + return ret;
2967 + }
2968 +
2969 +@@ -583,24 +552,22 @@ static const struct inet6_protocol frag_protocol = {
2970 + };
2971 +
2972 + #ifdef CONFIG_SYSCTL
2973 +-static int zero;
2974 +
2975 + static struct ctl_table ip6_frags_ns_ctl_table[] = {
2976 + {
2977 + .procname = "ip6frag_high_thresh",
2978 + .data = &init_net.ipv6.frags.high_thresh,
2979 +- .maxlen = sizeof(int),
2980 ++ .maxlen = sizeof(unsigned long),
2981 + .mode = 0644,
2982 +- .proc_handler = proc_dointvec_minmax,
2983 ++ .proc_handler = proc_doulongvec_minmax,
2984 + .extra1 = &init_net.ipv6.frags.low_thresh
2985 + },
2986 + {
2987 + .procname = "ip6frag_low_thresh",
2988 + .data = &init_net.ipv6.frags.low_thresh,
2989 +- .maxlen = sizeof(int),
2990 ++ .maxlen = sizeof(unsigned long),
2991 + .mode = 0644,
2992 +- .proc_handler = proc_dointvec_minmax,
2993 +- .extra1 = &zero,
2994 ++ .proc_handler = proc_doulongvec_minmax,
2995 + .extra2 = &init_net.ipv6.frags.high_thresh
2996 + },
2997 + {
2998 +@@ -708,19 +675,27 @@ static void ip6_frags_sysctl_unregister(void)
2999 +
3000 + static int __net_init ipv6_frags_init_net(struct net *net)
3001 + {
3002 ++ int res;
3003 ++
3004 + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
3005 + net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
3006 + net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
3007 ++ net->ipv6.frags.f = &ip6_frags;
3008 +
3009 +- inet_frags_init_net(&net->ipv6.frags);
3010 ++ res = inet_frags_init_net(&net->ipv6.frags);
3011 ++ if (res < 0)
3012 ++ return res;
3013 +
3014 +- return ip6_frags_ns_sysctl_register(net);
3015 ++ res = ip6_frags_ns_sysctl_register(net);
3016 ++ if (res < 0)
3017 ++ inet_frags_exit_net(&net->ipv6.frags);
3018 ++ return res;
3019 + }
3020 +
3021 + static void __net_exit ipv6_frags_exit_net(struct net *net)
3022 + {
3023 + ip6_frags_ns_sysctl_unregister(net);
3024 +- inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
3025 ++ inet_frags_exit_net(&net->ipv6.frags);
3026 + }
3027 +
3028 + static struct pernet_operations ip6_frags_ops = {
3029 +@@ -728,14 +703,55 @@ static struct pernet_operations ip6_frags_ops = {
3030 + .exit = ipv6_frags_exit_net,
3031 + };
3032 +
3033 ++static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
3034 ++{
3035 ++ return jhash2(data,
3036 ++ sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
3037 ++}
3038 ++
3039 ++static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
3040 ++{
3041 ++ const struct inet_frag_queue *fq = data;
3042 ++
3043 ++ return jhash2((const u32 *)&fq->key.v6,
3044 ++ sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
3045 ++}
3046 ++
3047 ++static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
3048 ++{
3049 ++ const struct frag_v6_compare_key *key = arg->key;
3050 ++ const struct inet_frag_queue *fq = ptr;
3051 ++
3052 ++ return !!memcmp(&fq->key, key, sizeof(*key));
3053 ++}
3054 ++
3055 ++const struct rhashtable_params ip6_rhash_params = {
3056 ++ .head_offset = offsetof(struct inet_frag_queue, node),
3057 ++ .hashfn = ip6_key_hashfn,
3058 ++ .obj_hashfn = ip6_obj_hashfn,
3059 ++ .obj_cmpfn = ip6_obj_cmpfn,
3060 ++ .automatic_shrinking = true,
3061 ++};
3062 ++EXPORT_SYMBOL(ip6_rhash_params);
3063 ++
3064 + int __init ipv6_frag_init(void)
3065 + {
3066 + int ret;
3067 +
3068 +- ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
3069 ++ ip6_frags.constructor = ip6_frag_init;
3070 ++ ip6_frags.destructor = NULL;
3071 ++ ip6_frags.qsize = sizeof(struct frag_queue);
3072 ++ ip6_frags.frag_expire = ip6_frag_expire;
3073 ++ ip6_frags.frags_cache_name = ip6_frag_cache_name;
3074 ++ ip6_frags.rhash_params = ip6_rhash_params;
3075 ++ ret = inet_frags_init(&ip6_frags);
3076 + if (ret)
3077 + goto out;
3078 +
3079 ++ ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
3080 ++ if (ret)
3081 ++ goto err_protocol;
3082 ++
3083 + ret = ip6_frags_sysctl_register();
3084 + if (ret)
3085 + goto err_sysctl;
3086 +@@ -744,17 +760,6 @@ int __init ipv6_frag_init(void)
3087 + if (ret)
3088 + goto err_pernet;
3089 +
3090 +- ip6_frags.hashfn = ip6_hashfn;
3091 +- ip6_frags.constructor = ip6_frag_init;
3092 +- ip6_frags.destructor = NULL;
3093 +- ip6_frags.skb_free = NULL;
3094 +- ip6_frags.qsize = sizeof(struct frag_queue);
3095 +- ip6_frags.match = ip6_frag_match;
3096 +- ip6_frags.frag_expire = ip6_frag_expire;
3097 +- ip6_frags.frags_cache_name = ip6_frag_cache_name;
3098 +- ret = inet_frags_init(&ip6_frags);
3099 +- if (ret)
3100 +- goto err_pernet;
3101 + out:
3102 + return ret;
3103 +
3104 +@@ -762,6 +767,8 @@ err_pernet:
3105 + ip6_frags_sysctl_unregister();
3106 + err_sysctl:
3107 + inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
3108 ++err_protocol:
3109 ++ inet_frags_fini(&ip6_frags);
3110 + goto out;
3111 + }
3112 +