Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:4.6 commit in: /
Date: Wed, 27 Jul 2016 23:52:16
Message-Id: 1469663516.c8839f5a116f5e0d27d587a09b40e1cc668c9885.mpagano@gentoo
1 commit: c8839f5a116f5e0d27d587a09b40e1cc668c9885
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Wed Jul 27 23:51:56 2016 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Wed Jul 27 23:51:56 2016 +0000
6 URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=c8839f5a
7
8 Add BFQ patches for 4.6.X: http://algogroup.unimore.it/people/paolo/disk_sched/patches/4.6.0-v8/
9
10 0000_README | 16 +
11 ...oups-kconfig-build-bits-for-BFQ-v7r11-4.6.patch | 103 +
12 ...ntroduce-the-BFQ-v7r11-I-O-sched-for-4.6.patch1 | 7097 ++++++++++++++++++++
13 ...arly-Queue-Merge-EQM-to-BFQ-v7r11-for-4.6.patch | 1101 +++
14 ...rn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch2 | 6361 ++++++++++++++++++
15 5 files changed, 14678 insertions(+)
16
17 diff --git a/0000_README b/0000_README
18 index 67da565..9e42d11 100644
19 --- a/0000_README
20 +++ b/0000_README
21 @@ -91,6 +91,22 @@ Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
22 From: https://github.com/graysky2/kernel_gcc_patch/
23 Desc: Kernel patch enables gcc < v4.9 optimizations for additional CPUs.
24
25 +Patch: 5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.6.patch
26 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
27 +Desc: BFQ v7r11 patch 1 for 4.6: Build, cgroups and kconfig bits
28 +
29 +Patch: 5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.6.patch1
30 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
31 +Desc: BFQ v7r11 patch 2 for 4.6: BFQ Scheduler
32 +
33 +Patch: 5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.6.patch
34 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
35 +Desc: BFQ v7r11 patch 3 for 4.6: Early Queue Merge (EQM)
36 +
37 +Patch: 5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch2
38 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
39 +Desc: BFQ patch 4 for 4.6: Turn BFQ-v7r11 into BFQ-v8
40 +
41 Patch: 5010_enable-additional-cpu-optimizations-for-gcc-4.9.patch
42 From: https://github.com/graysky2/kernel_gcc_patch/
43 Desc: Kernel patch enables gcc >= v4.9 optimizations for additional CPUs.
44
45 diff --git a/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.6.patch b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.6.patch
46 new file mode 100644
47 index 0000000..ee3934f
48 --- /dev/null
49 +++ b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.6.patch
50 @@ -0,0 +1,103 @@
51 +From 4cf5d043709bfe73b4553272706cb5beb8072301 Mon Sep 17 00:00:00 2001
52 +From: Paolo Valente <paolo.valente@×××××××.it>
53 +Date: Tue, 7 Apr 2015 13:39:12 +0200
54 +Subject: [PATCH 1/4] block: cgroups, kconfig, build bits for BFQ-v7r11-4.6.0
55 +
56 +Update Kconfig.iosched and do the related Makefile changes to include
57 +kernel configuration options for BFQ. Also increase the number of
58 +policies supported by the blkio controller so that BFQ can add its
59 +own.
60 +
61 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
62 +Signed-off-by: Arianna Avanzini <avanzini@××××××.com>
63 +---
64 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
65 + block/Makefile | 1 +
66 + include/linux/blkdev.h | 2 +-
67 + 3 files changed, 34 insertions(+), 1 deletion(-)
68 +
69 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
70 +index 421bef9..0ee5f0f 100644
71 +--- a/block/Kconfig.iosched
72 ++++ b/block/Kconfig.iosched
73 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
74 + ---help---
75 + Enable group IO scheduling in CFQ.
76 +
77 ++config IOSCHED_BFQ
78 ++ tristate "BFQ I/O scheduler"
79 ++ default n
80 ++ ---help---
81 ++ The BFQ I/O scheduler tries to distribute bandwidth among
82 ++ all processes according to their weights.
83 ++ It aims at distributing the bandwidth as desired, independently of
84 ++ the disk parameters and with any workload. It also tries to
85 ++ guarantee low latency to interactive and soft real-time
86 ++ applications. If compiled built-in (saying Y here), BFQ can
87 ++ be configured to support hierarchical scheduling.
88 ++
89 ++config CGROUP_BFQIO
90 ++ bool "BFQ hierarchical scheduling support"
91 ++ depends on CGROUPS && IOSCHED_BFQ=y
92 ++ default n
93 ++ ---help---
94 ++ Enable hierarchical scheduling in BFQ, using the cgroups
95 ++ filesystem interface. The name of the subsystem will be
96 ++ bfqio.
97 ++
98 + choice
99 + prompt "Default I/O scheduler"
100 + default DEFAULT_CFQ
101 +@@ -52,6 +73,16 @@ choice
102 + config DEFAULT_CFQ
103 + bool "CFQ" if IOSCHED_CFQ=y
104 +
105 ++ config DEFAULT_BFQ
106 ++ bool "BFQ" if IOSCHED_BFQ=y
107 ++ help
108 ++ Selects BFQ as the default I/O scheduler which will be
109 ++ used by default for all block devices.
110 ++ The BFQ I/O scheduler aims at distributing the bandwidth
111 ++ as desired, independently of the disk parameters and with
112 ++ any workload. It also tries to guarantee low latency to
113 ++ interactive and soft real-time applications.
114 ++
115 + config DEFAULT_NOOP
116 + bool "No-op"
117 +
118 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
119 + string
120 + default "deadline" if DEFAULT_DEADLINE
121 + default "cfq" if DEFAULT_CFQ
122 ++ default "bfq" if DEFAULT_BFQ
123 + default "noop" if DEFAULT_NOOP
124 +
125 + endmenu
126 +diff --git a/block/Makefile b/block/Makefile
127 +index 9eda232..4a36683 100644
128 +--- a/block/Makefile
129 ++++ b/block/Makefile
130 +@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
131 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
132 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
133 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
134 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
135 +
136 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
137 + obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
138 +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
139 +index 669e419..119be87 100644
140 +--- a/include/linux/blkdev.h
141 ++++ b/include/linux/blkdev.h
142 +@@ -45,7 +45,7 @@ struct pr_ops;
143 + * Maximum number of blkcg policies allowed to be registered concurrently.
144 + * Defined here to simplify include dependency.
145 + */
146 +-#define BLKCG_MAX_POLS 2
147 ++#define BLKCG_MAX_POLS 3
148 +
149 + struct request;
150 + typedef void (rq_end_io_fn)(struct request *, int);
151 +--
152 +1.9.1
153 +
154
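Patch 1/4 above ends with the one-line bump of BLKCG_MAX_POLS from 2 to 3. Roughly speaking, blk-cgroup keeps registered policies in a fixed-size slot table bounded by this constant, so a third policy (BFQ's) cannot register until the limit grows. The standalone sketch below models that slot-table behaviour; it is an illustration of the idea only, with simplified stand-in types, not the kernel's blk-cgroup code.

/*
 * Illustrative model only (not kernel code): a fixed-size policy table like
 * the one bounded by BLKCG_MAX_POLS.  With only two slots, registering a
 * third policy (BFQ's) would fail; raising the limit to 3 makes it fit.
 */
#include <stdio.h>
#include <stddef.h>

#define BLKCG_MAX_POLS 3			/* was 2 before this patch */

struct blkcg_policy { const char *name; };	/* simplified stand-in */

static struct blkcg_policy *policy_slots[BLKCG_MAX_POLS];

static int policy_register(struct blkcg_policy *pol)
{
	size_t i;

	for (i = 0; i < BLKCG_MAX_POLS; i++) {
		if (!policy_slots[i]) {
			policy_slots[i] = pol;	/* claim the free slot */
			return 0;
		}
	}
	return -1;				/* table is full */
}

int main(void)
{
	struct blkcg_policy throttle = { "blk-throttle" };
	struct blkcg_policy cfq      = { "cfq" };
	struct blkcg_policy bfq      = { "bfq" };

	printf("throttle: %d\n", policy_register(&throttle));
	printf("cfq:      %d\n", policy_register(&cfq));
	printf("bfq:      %d\n", policy_register(&bfq)); /* would fail with a limit of 2 */
	return 0;
}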
155 diff --git a/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.6.patch1 b/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.6.patch1
156 new file mode 100644
157 index 0000000..c232a83
158 --- /dev/null
159 +++ b/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.6.patch1
160 @@ -0,0 +1,7097 @@
161 +From 75c0230fa4f82fb77e41cf60c06e22b5e07f7f97 Mon Sep 17 00:00:00 2001
162 +From: Paolo Valente <paolo.valente@×××××××.it>
163 +Date: Thu, 9 May 2013 19:10:02 +0200
164 +Subject: [PATCH 2/4] block: introduce the BFQ-v7r11 I/O sched for 4.6.0
165 +
166 +The general structure is borrowed from CFQ, as is much of the code for
167 +handling I/O contexts. Over time, several useful features have been
168 +ported from CFQ as well (details in the changelog in README.BFQ). A
169 +(bfq_)queue is associated to each task doing I/O on a device, and each
170 +time a scheduling decision has to be made a queue is selected and served
171 +until it expires.
172 +
173 + - Slices are given in the service domain: tasks are assigned
174 + budgets, measured in number of sectors. Once it gets the disk, a task
175 + must, however, consume its assigned budget within a configurable
176 + maximum time (by default, the maximum possible value of the
177 + budgets is automatically computed to comply with this timeout).
178 + This allows the desired latency vs "throughput boosting" tradeoff
179 + to be set.
180 +
181 + - Budgets are scheduled according to a variant of WF2Q+, implemented
182 + using an augmented rb-tree to take eligibility into account while
183 + preserving an O(log N) overall complexity.
184 +
185 + - A low-latency tunable is provided; if enabled, both interactive
186 + and soft real-time applications are guaranteed a very low latency.
187 +
188 + - Latency guarantees are also preserved in the presence of NCQ.
189 +
190 + - Also with flash-based devices, a high throughput is achieved
191 + while still preserving latency guarantees.
192 +
193 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
194 + cooperating-queue-merging and the preemption mechanisms present
195 + in CFQ. EQM is in fact a unified mechanism that tries to get a
196 + sequential read pattern, and hence a high throughput, with any
197 + set of processes performing interleaved I/O over a contiguous
198 + sequence of sectors.
199 +
200 + - BFQ supports full hierarchical scheduling, exporting a cgroups
201 + interface. Since each node has a full scheduler, each group can
202 + be assigned its own weight.
203 +
204 + - If the cgroups interface is not used, only I/O priorities can be
205 + assigned to processes, with ioprio values mapped to weights
206 + with the relation weight = IOPRIO_BE_NR - ioprio (see the sketch below).
207 +
208 + - ioprio classes are served in strict priority order, i.e., lower
209 + priority queues are not served as long as there are higher
210 + priority queues. Among queues in the same class the bandwidth is
211 + distributed in proportion to the weight of each queue. A very
212 + thin extra bandwidth is however guaranteed to the Idle class, to
213 + prevent it from starving.
214 +
215 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
216 +Signed-off-by: Arianna Avanzini <avanzini@××××××.com>
217 +---
218 + block/Kconfig.iosched | 6 +-
219 + block/bfq-cgroup.c | 1182 ++++++++++++++++
220 + block/bfq-ioc.c | 36 +
221 + block/bfq-iosched.c | 3754 +++++++++++++++++++++++++++++++++++++++++++++++++
222 + block/bfq-sched.c | 1200 ++++++++++++++++
223 + block/bfq.h | 801 +++++++++++
224 + 6 files changed, 6975 insertions(+), 4 deletions(-)
225 + create mode 100644 block/bfq-cgroup.c
226 + create mode 100644 block/bfq-ioc.c
227 + create mode 100644 block/bfq-iosched.c
228 + create mode 100644 block/bfq-sched.c
229 + create mode 100644 block/bfq.h
230 +
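As a concrete reading of the ioprio-to-weight relation mentioned in the list above, the toy program below prints the weight each best-effort ioprio level maps to under weight = IOPRIO_BE_NR - ioprio. This is only a sketch of the stated formula; IOPRIO_BE_NR is hard-coded to 8 here (its usual value in the mainline headers) so the example stays self-contained.

/* Sketch of the ioprio -> weight mapping described above (not BFQ code). */
#include <stdio.h>

#define IOPRIO_BE_NR 8	/* number of best-effort ioprio levels (assumed, as in mainline) */

int main(void)
{
	int ioprio;

	/* ioprio 0 is the highest best-effort priority, 7 the lowest */
	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d\n", ioprio, IOPRIO_BE_NR - ioprio);
	return 0;
}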
231 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
232 +index 0ee5f0f..f78cd1a 100644
233 +--- a/block/Kconfig.iosched
234 ++++ b/block/Kconfig.iosched
235 +@@ -51,14 +51,12 @@ config IOSCHED_BFQ
236 + applications. If compiled built-in (saying Y here), BFQ can
237 + be configured to support hierarchical scheduling.
238 +
239 +-config CGROUP_BFQIO
240 ++config BFQ_GROUP_IOSCHED
241 + bool "BFQ hierarchical scheduling support"
242 + depends on CGROUPS && IOSCHED_BFQ=y
243 + default n
244 + ---help---
245 +- Enable hierarchical scheduling in BFQ, using the cgroups
246 +- filesystem interface. The name of the subsystem will be
247 +- bfqio.
248 ++ Enable hierarchical scheduling in BFQ, using the blkio controller.
249 +
250 + choice
251 + prompt "Default I/O scheduler"
252 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
253 +new file mode 100644
254 +index 0000000..8610cd6
255 +--- /dev/null
256 ++++ b/block/bfq-cgroup.c
257 +@@ -0,0 +1,1182 @@
258 ++/*
259 ++ * BFQ: CGROUPS support.
260 ++ *
261 ++ * Based on ideas and code from CFQ:
262 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
263 ++ *
264 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
265 ++ * Paolo Valente <paolo.valente@×××××××.it>
266 ++ *
267 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
268 ++ *
269 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
270 ++ * file.
271 ++ */
272 ++
273 ++#ifdef CONFIG_BFQ_GROUP_IOSCHED
274 ++
275 ++/* bfqg stats flags */
276 ++enum bfqg_stats_flags {
277 ++ BFQG_stats_waiting = 0,
278 ++ BFQG_stats_idling,
279 ++ BFQG_stats_empty,
280 ++};
281 ++
282 ++#define BFQG_FLAG_FNS(name) \
283 ++static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
284 ++{ \
285 ++ stats->flags |= (1 << BFQG_stats_##name); \
286 ++} \
287 ++static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
288 ++{ \
289 ++ stats->flags &= ~(1 << BFQG_stats_##name); \
290 ++} \
291 ++static int bfqg_stats_##name(struct bfqg_stats *stats) \
292 ++{ \
293 ++ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
294 ++} \
295 ++
296 ++BFQG_FLAG_FNS(waiting)
297 ++BFQG_FLAG_FNS(idling)
298 ++BFQG_FLAG_FNS(empty)
299 ++#undef BFQG_FLAG_FNS
300 ++
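For readers unfamiliar with this token-pasting style, expanding BFQG_FLAG_FNS(waiting) by hand yields the three helpers below. The one-field struct bfqg_stats is a stand-in so the snippet compiles on its own; the real definition is added to bfq.h later in this patch.

/* Hand expansion of BFQG_FLAG_FNS(waiting) -- illustration only. */
#include <stdio.h>

enum bfqg_stats_flags { BFQG_stats_waiting = 0, BFQG_stats_idling, BFQG_stats_empty };

struct bfqg_stats { unsigned int flags; };	/* stand-in; only flags matters here */

static void bfqg_stats_mark_waiting(struct bfqg_stats *stats)
{
	stats->flags |= (1 << BFQG_stats_waiting);	/* set the "waiting" bit */
}

static void bfqg_stats_clear_waiting(struct bfqg_stats *stats)
{
	stats->flags &= ~(1 << BFQG_stats_waiting);	/* clear it */
}

static int bfqg_stats_waiting(struct bfqg_stats *stats)
{
	return (stats->flags & (1 << BFQG_stats_waiting)) != 0;	/* test it */
}

int main(void)
{
	struct bfqg_stats s = { 0 };

	bfqg_stats_mark_waiting(&s);
	printf("waiting: %d\n", bfqg_stats_waiting(&s));
	bfqg_stats_clear_waiting(&s);
	printf("waiting: %d\n", bfqg_stats_waiting(&s));
	return 0;
}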
301 ++/* This should be called with the queue_lock held. */
302 ++static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
303 ++{
304 ++ unsigned long long now;
305 ++
306 ++ if (!bfqg_stats_waiting(stats))
307 ++ return;
308 ++
309 ++ now = sched_clock();
310 ++ if (time_after64(now, stats->start_group_wait_time))
311 ++ blkg_stat_add(&stats->group_wait_time,
312 ++ now - stats->start_group_wait_time);
313 ++ bfqg_stats_clear_waiting(stats);
314 ++}
315 ++
316 ++/* This should be called with the queue_lock held. */
317 ++static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
318 ++ struct bfq_group *curr_bfqg)
319 ++{
320 ++ struct bfqg_stats *stats = &bfqg->stats;
321 ++
322 ++ if (bfqg_stats_waiting(stats))
323 ++ return;
324 ++ if (bfqg == curr_bfqg)
325 ++ return;
326 ++ stats->start_group_wait_time = sched_clock();
327 ++ bfqg_stats_mark_waiting(stats);
328 ++}
329 ++
330 ++/* This should be called with the queue_lock held. */
331 ++static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
332 ++{
333 ++ unsigned long long now;
334 ++
335 ++ if (!bfqg_stats_empty(stats))
336 ++ return;
337 ++
338 ++ now = sched_clock();
339 ++ if (time_after64(now, stats->start_empty_time))
340 ++ blkg_stat_add(&stats->empty_time,
341 ++ now - stats->start_empty_time);
342 ++ bfqg_stats_clear_empty(stats);
343 ++}
344 ++
345 ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
346 ++{
347 ++ blkg_stat_add(&bfqg->stats.dequeue, 1);
348 ++}
349 ++
350 ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
351 ++{
352 ++ struct bfqg_stats *stats = &bfqg->stats;
353 ++
354 ++ if (blkg_rwstat_total(&stats->queued))
355 ++ return;
356 ++
357 ++ /*
358 ++ * group is already marked empty. This can happen if bfqq got new
359 ++ * request in parent group and moved to this group while being added
360 ++ * to service tree. Just ignore the event and move on.
361 ++ */
362 ++ if (bfqg_stats_empty(stats))
363 ++ return;
364 ++
365 ++ stats->start_empty_time = sched_clock();
366 ++ bfqg_stats_mark_empty(stats);
367 ++}
368 ++
369 ++static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
370 ++{
371 ++ struct bfqg_stats *stats = &bfqg->stats;
372 ++
373 ++ if (bfqg_stats_idling(stats)) {
374 ++ unsigned long long now = sched_clock();
375 ++
376 ++ if (time_after64(now, stats->start_idle_time))
377 ++ blkg_stat_add(&stats->idle_time,
378 ++ now - stats->start_idle_time);
379 ++ bfqg_stats_clear_idling(stats);
380 ++ }
381 ++}
382 ++
383 ++static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
384 ++{
385 ++ struct bfqg_stats *stats = &bfqg->stats;
386 ++
387 ++ stats->start_idle_time = sched_clock();
388 ++ bfqg_stats_mark_idling(stats);
389 ++}
390 ++
391 ++static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
392 ++{
393 ++ struct bfqg_stats *stats = &bfqg->stats;
394 ++
395 ++ blkg_stat_add(&stats->avg_queue_size_sum,
396 ++ blkg_rwstat_total(&stats->queued));
397 ++ blkg_stat_add(&stats->avg_queue_size_samples, 1);
398 ++ bfqg_stats_update_group_wait_time(stats);
399 ++}
400 ++
401 ++static struct blkcg_policy blkcg_policy_bfq;
402 ++
403 ++/*
404 ++ * blk-cgroup policy-related handlers
405 ++ * The following functions help in converting between blk-cgroup
406 ++ * internal structures and BFQ-specific structures.
407 ++ */
408 ++
409 ++static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
410 ++{
411 ++ return pd ? container_of(pd, struct bfq_group, pd) : NULL;
412 ++}
413 ++
414 ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
415 ++{
416 ++ return pd_to_blkg(&bfqg->pd);
417 ++}
418 ++
419 ++static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
420 ++{
421 ++ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
422 ++ BUG_ON(!pd);
423 ++ return pd_to_bfqg(pd);
424 ++}
425 ++
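pd_to_bfqg() above is the usual container_of() idiom: struct bfq_group embeds its blkg_policy_data, so a pointer to the embedded member can be converted back into a pointer to the enclosing group. A minimal sketch with simplified types (container_of is re-derived locally so the snippet builds outside the kernel):

/* The container_of() idiom used by pd_to_bfqg() above, in miniature. */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct blkg_policy_data { int dummy; };		/* simplified stand-in */

struct bfq_group {
	int weight;
	struct blkg_policy_data pd;		/* embedded member */
};

static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
}

int main(void)
{
	struct bfq_group g = { .weight = 100 };
	struct blkg_policy_data *pd = &g.pd;	/* what blk-cgroup hands back */

	printf("weight via pd: %d\n", pd_to_bfqg(pd)->weight);
	return 0;
}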
426 ++/*
427 ++ * bfq_group handlers
428 ++ * The following functions help in navigating the bfq_group hierarchy
429 ++ * by allowing to find the parent of a bfq_group or the bfq_group
430 ++ * associated to a bfq_queue.
431 ++ */
432 ++
433 ++static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
434 ++{
435 ++ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
436 ++
437 ++ return pblkg ? blkg_to_bfqg(pblkg) : NULL;
438 ++}
439 ++
440 ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
441 ++{
442 ++ struct bfq_entity *group_entity = bfqq->entity.parent;
443 ++
444 ++ return group_entity ? container_of(group_entity, struct bfq_group,
445 ++ entity) :
446 ++ bfqq->bfqd->root_group;
447 ++}
448 ++
449 ++/*
450 ++ * The following two functions handle get and put of a bfq_group by
451 ++ * wrapping the related blk-cgroup hooks.
452 ++ */
453 ++
454 ++static void bfqg_get(struct bfq_group *bfqg)
455 ++{
456 ++ return blkg_get(bfqg_to_blkg(bfqg));
457 ++}
458 ++
459 ++static void bfqg_put(struct bfq_group *bfqg)
460 ++{
461 ++ return blkg_put(bfqg_to_blkg(bfqg));
462 ++}
463 ++
464 ++static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
465 ++ struct bfq_queue *bfqq,
466 ++ int rw)
467 ++{
468 ++ blkg_rwstat_add(&bfqg->stats.queued, rw, 1);
469 ++ bfqg_stats_end_empty_time(&bfqg->stats);
470 ++ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
471 ++ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
472 ++}
473 ++
474 ++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw)
475 ++{
476 ++ blkg_rwstat_add(&bfqg->stats.queued, rw, -1);
477 ++}
478 ++
479 ++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)
480 ++{
481 ++ blkg_rwstat_add(&bfqg->stats.merged, rw, 1);
482 ++}
483 ++
484 ++static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,
485 ++ uint64_t bytes, int rw)
486 ++{
487 ++ blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);
488 ++ blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);
489 ++ blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);
490 ++}
491 ++
492 ++static void bfqg_stats_update_completion(struct bfq_group *bfqg,
493 ++ uint64_t start_time, uint64_t io_start_time, int rw)
494 ++{
495 ++ struct bfqg_stats *stats = &bfqg->stats;
496 ++ unsigned long long now = sched_clock();
497 ++
498 ++ if (time_after64(now, io_start_time))
499 ++ blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
500 ++ if (time_after64(io_start_time, start_time))
501 ++ blkg_rwstat_add(&stats->wait_time, rw,
502 ++ io_start_time - start_time);
503 ++}
504 ++
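bfqg_stats_update_completion() splits a request's lifetime at io_start_time: the interval from start_time to io_start_time is charged to wait_time (time queued in the scheduler), and the interval from io_start_time to completion to service_time (time in the driver/device). A small arithmetic sketch with made-up timestamps standing in for sched_clock() values:

/* Sketch of the wait_time / service_time split done above (illustration only). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* invented sched_clock()-style timestamps, in nanoseconds */
	uint64_t start_time    = 1000000;	/* request entered the scheduler */
	uint64_t io_start_time = 1400000;	/* request dispatched to the device */
	uint64_t now           = 2100000;	/* completion time */

	uint64_t wait_time    = io_start_time > start_time ? io_start_time - start_time : 0;
	uint64_t service_time = now > io_start_time ? now - io_start_time : 0;

	printf("wait_time:    %llu ns\n", (unsigned long long)wait_time);
	printf("service_time: %llu ns\n", (unsigned long long)service_time);
	return 0;
}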
505 ++/* @stats = 0 */
506 ++static void bfqg_stats_reset(struct bfqg_stats *stats)
507 ++{
508 ++ if (!stats)
509 ++ return;
510 ++
511 ++ /* queued stats shouldn't be cleared */
512 ++ blkg_rwstat_reset(&stats->service_bytes);
513 ++ blkg_rwstat_reset(&stats->serviced);
514 ++ blkg_rwstat_reset(&stats->merged);
515 ++ blkg_rwstat_reset(&stats->service_time);
516 ++ blkg_rwstat_reset(&stats->wait_time);
517 ++ blkg_stat_reset(&stats->time);
518 ++ blkg_stat_reset(&stats->unaccounted_time);
519 ++ blkg_stat_reset(&stats->avg_queue_size_sum);
520 ++ blkg_stat_reset(&stats->avg_queue_size_samples);
521 ++ blkg_stat_reset(&stats->dequeue);
522 ++ blkg_stat_reset(&stats->group_wait_time);
523 ++ blkg_stat_reset(&stats->idle_time);
524 ++ blkg_stat_reset(&stats->empty_time);
525 ++}
526 ++
527 ++/* @to += @from */
528 ++static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)
529 ++{
530 ++ if (!to || !from)
531 ++ return;
532 ++
533 ++ /* queued stats shouldn't be cleared */
534 ++ blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);
535 ++ blkg_rwstat_add_aux(&to->serviced, &from->serviced);
536 ++ blkg_rwstat_add_aux(&to->merged, &from->merged);
537 ++ blkg_rwstat_add_aux(&to->service_time, &from->service_time);
538 ++ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
539 ++ blkg_stat_add_aux(&to->time, &from->time);
540 ++ blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
541 ++ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
542 ++ blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
543 ++ blkg_stat_add_aux(&to->dequeue, &from->dequeue);
544 ++ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
545 ++ blkg_stat_add_aux(&to->idle_time, &from->idle_time);
546 ++ blkg_stat_add_aux(&to->empty_time, &from->empty_time);
547 ++}
548 ++
549 ++/*
550 ++ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
551 ++ * recursive stats can still account for the amount used by this bfqg after
552 ++ * it's gone.
553 ++ */
554 ++static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
555 ++{
556 ++ struct bfq_group *parent;
557 ++
558 ++ if (!bfqg) /* root_group */
559 ++ return;
560 ++
561 ++ parent = bfqg_parent(bfqg);
562 ++
563 ++ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
564 ++
565 ++ if (unlikely(!parent))
566 ++ return;
567 ++
568 ++ bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);
569 ++ bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);
570 ++ bfqg_stats_reset(&bfqg->stats);
571 ++ bfqg_stats_reset(&bfqg->dead_stats);
572 ++}
573 ++
574 ++static void bfq_init_entity(struct bfq_entity *entity,
575 ++ struct bfq_group *bfqg)
576 ++{
577 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
578 ++
579 ++ entity->weight = entity->new_weight;
580 ++ entity->orig_weight = entity->new_weight;
581 ++ if (bfqq) {
582 ++ bfqq->ioprio = bfqq->new_ioprio;
583 ++ bfqq->ioprio_class = bfqq->new_ioprio_class;
584 ++ bfqg_get(bfqg);
585 ++ }
586 ++ entity->parent = bfqg->my_entity;
587 ++ entity->sched_data = &bfqg->sched_data;
588 ++}
589 ++
590 ++static void bfqg_stats_exit(struct bfqg_stats *stats)
591 ++{
592 ++ blkg_rwstat_exit(&stats->service_bytes);
593 ++ blkg_rwstat_exit(&stats->serviced);
594 ++ blkg_rwstat_exit(&stats->merged);
595 ++ blkg_rwstat_exit(&stats->service_time);
596 ++ blkg_rwstat_exit(&stats->wait_time);
597 ++ blkg_rwstat_exit(&stats->queued);
598 ++ blkg_stat_exit(&stats->sectors);
599 ++ blkg_stat_exit(&stats->time);
600 ++ blkg_stat_exit(&stats->unaccounted_time);
601 ++ blkg_stat_exit(&stats->avg_queue_size_sum);
602 ++ blkg_stat_exit(&stats->avg_queue_size_samples);
603 ++ blkg_stat_exit(&stats->dequeue);
604 ++ blkg_stat_exit(&stats->group_wait_time);
605 ++ blkg_stat_exit(&stats->idle_time);
606 ++ blkg_stat_exit(&stats->empty_time);
607 ++}
608 ++
609 ++static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
610 ++{
611 ++ if (blkg_rwstat_init(&stats->service_bytes, gfp) ||
612 ++ blkg_rwstat_init(&stats->serviced, gfp) ||
613 ++ blkg_rwstat_init(&stats->merged, gfp) ||
614 ++ blkg_rwstat_init(&stats->service_time, gfp) ||
615 ++ blkg_rwstat_init(&stats->wait_time, gfp) ||
616 ++ blkg_rwstat_init(&stats->queued, gfp) ||
617 ++ blkg_stat_init(&stats->sectors, gfp) ||
618 ++ blkg_stat_init(&stats->time, gfp) ||
619 ++ blkg_stat_init(&stats->unaccounted_time, gfp) ||
620 ++ blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
621 ++ blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
622 ++ blkg_stat_init(&stats->dequeue, gfp) ||
623 ++ blkg_stat_init(&stats->group_wait_time, gfp) ||
624 ++ blkg_stat_init(&stats->idle_time, gfp) ||
625 ++ blkg_stat_init(&stats->empty_time, gfp)) {
626 ++ bfqg_stats_exit(stats);
627 ++ return -ENOMEM;
628 ++ }
629 ++
630 ++ return 0;
631 ++}
632 ++
633 ++static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
634 ++{
635 ++ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
636 ++}
637 ++
638 ++static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
639 ++{
640 ++ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
641 ++}
642 ++
643 ++static void bfq_cpd_init(struct blkcg_policy_data *cpd)
644 ++{
645 ++ struct bfq_group_data *d = cpd_to_bfqgd(cpd);
646 ++
647 ++ d->weight = BFQ_DEFAULT_GRP_WEIGHT;
648 ++}
649 ++
650 ++static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
651 ++{
652 ++ struct bfq_group *bfqg;
653 ++
654 ++ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
655 ++ if (!bfqg)
656 ++ return NULL;
657 ++
658 ++ if (bfqg_stats_init(&bfqg->stats, gfp) ||
659 ++ bfqg_stats_init(&bfqg->dead_stats, gfp)) {
660 ++ kfree(bfqg);
661 ++ return NULL;
662 ++ }
663 ++
664 ++ return &bfqg->pd;
665 ++}
666 ++
667 ++static void bfq_group_set_parent(struct bfq_group *bfqg,
668 ++ struct bfq_group *parent)
669 ++{
670 ++ struct bfq_entity *entity;
671 ++
672 ++ BUG_ON(!parent);
673 ++ BUG_ON(!bfqg);
674 ++ BUG_ON(bfqg == parent);
675 ++
676 ++ entity = &bfqg->entity;
677 ++ entity->parent = parent->my_entity;
678 ++ entity->sched_data = &parent->sched_data;
679 ++}
680 ++
681 ++static void bfq_pd_init(struct blkg_policy_data *pd)
682 ++{
683 ++ struct blkcg_gq *blkg = pd_to_blkg(pd);
684 ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
685 ++ struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
686 ++ struct bfq_entity *entity = &bfqg->entity;
687 ++ struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
688 ++
689 ++ entity->orig_weight = entity->weight = entity->new_weight = d->weight;
690 ++ entity->my_sched_data = &bfqg->sched_data;
691 ++ bfqg->my_entity = entity; /*
692 ++ * the root_group's will be set to NULL
693 ++ * in bfq_init_queue()
694 ++ */
695 ++ bfqg->bfqd = bfqd;
696 ++ bfqg->active_entities = 0;
697 ++}
698 ++
699 ++static void bfq_pd_free(struct blkg_policy_data *pd)
700 ++{
701 ++ struct bfq_group *bfqg = pd_to_bfqg(pd);
702 ++
703 ++ bfqg_stats_exit(&bfqg->stats);
704 ++ bfqg_stats_exit(&bfqg->dead_stats);
705 ++
706 ++ return kfree(bfqg);
707 ++}
708 ++
709 ++/* offset delta from bfqg->stats to bfqg->dead_stats */
710 ++static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -
711 ++ offsetof(struct bfq_group, stats);
712 ++
713 ++/* to be used by recursive prfill, sums live and dead stats recursively */
714 ++static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
715 ++{
716 ++ u64 sum = 0;
717 ++
718 ++ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
719 ++ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
720 ++ off + dead_stats_off_delta);
721 ++ return sum;
722 ++}
723 ++
724 ++/* to be used by recursive prfill, sums live and dead rwstats recursively */
725 ++static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
726 ++ int off)
727 ++{
728 ++ struct blkg_rwstat a, b;
729 ++
730 ++ a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
731 ++ b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
732 ++ off + dead_stats_off_delta);
733 ++ blkg_rwstat_add_aux(&a, &b);
734 ++ return a;
735 ++}
736 ++
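The dead_stats_off_delta trick above works because stats and dead_stats are two copies of the same structure embedded in struct bfq_group, so the distance between them is a compile-time constant: adding it to the offset of a live counter gives the offset of the matching dead counter. A self-contained sketch of the same idea with simplified types:

/* Illustration of the offsetof-delta trick used for dead_stats above. */
#include <stdio.h>
#include <stddef.h>

struct bfqg_stats { unsigned long time; unsigned long sectors; };	/* simplified */

struct bfq_group {
	struct bfqg_stats stats;	/* live stats */
	struct bfqg_stats dead_stats;	/* stats inherited from removed children */
};

/* same constant as dead_stats_off_delta above, for the simplified types */
static const size_t dead_stats_off_delta =
	offsetof(struct bfq_group, dead_stats) - offsetof(struct bfq_group, stats);

/* sum the live and dead instances of one counter, given the live offset */
static unsigned long read_live_plus_dead(const struct bfq_group *g, size_t live_off)
{
	unsigned long live = *(const unsigned long *)((const char *)g + live_off);
	unsigned long dead = *(const unsigned long *)
			((const char *)g + live_off + dead_stats_off_delta);

	return live + dead;
}

int main(void)
{
	struct bfq_group g = { .stats = { .time = 10 }, .dead_stats = { .time = 32 } };
	size_t time_off = offsetof(struct bfq_group, stats) +
			  offsetof(struct bfqg_stats, time);

	printf("time, live+dead: %lu\n", read_live_plus_dead(&g, time_off));
	return 0;
}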
737 ++static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
738 ++{
739 ++ struct bfq_group *bfqg = pd_to_bfqg(pd);
740 ++
741 ++ bfqg_stats_reset(&bfqg->stats);
742 ++ bfqg_stats_reset(&bfqg->dead_stats);
743 ++}
744 ++
745 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
746 ++ struct blkcg *blkcg)
747 ++{
748 ++ struct request_queue *q = bfqd->queue;
749 ++ struct bfq_group *bfqg = NULL, *parent;
750 ++ struct bfq_entity *entity = NULL;
751 ++
752 ++ assert_spin_locked(bfqd->queue->queue_lock);
753 ++
754 ++ /* avoid lookup for the common case where there's no blkcg */
755 ++ if (blkcg == &blkcg_root) {
756 ++ bfqg = bfqd->root_group;
757 ++ } else {
758 ++ struct blkcg_gq *blkg;
759 ++
760 ++ blkg = blkg_lookup_create(blkcg, q);
761 ++ if (!IS_ERR(blkg))
762 ++ bfqg = blkg_to_bfqg(blkg);
763 ++ else /* fallback to root_group */
764 ++ bfqg = bfqd->root_group;
765 ++ }
766 ++
767 ++ BUG_ON(!bfqg);
768 ++
769 ++ /*
770 ++ * Update chain of bfq_groups as we might be handling a leaf group
771 ++ * which, along with some of its relatives, has not been hooked yet
772 ++ * to the private hierarchy of BFQ.
773 ++ */
774 ++ entity = &bfqg->entity;
775 ++ for_each_entity(entity) {
776 ++ bfqg = container_of(entity, struct bfq_group, entity);
777 ++ BUG_ON(!bfqg);
778 ++ if (bfqg != bfqd->root_group) {
779 ++ parent = bfqg_parent(bfqg);
780 ++ if (!parent)
781 ++ parent = bfqd->root_group;
782 ++ BUG_ON(!parent);
783 ++ bfq_group_set_parent(bfqg, parent);
784 ++ }
785 ++ }
786 ++
787 ++ return bfqg;
788 ++}
789 ++
790 ++/**
791 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
792 ++ * @bfqd: queue descriptor.
793 ++ * @bfqq: the queue to move.
794 ++ * @entity: @bfqq's entity.
795 ++ * @bfqg: the group to move to.
796 ++ *
797 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
798 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
799 ++ *
800 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
801 ++ * not disappear (by now this just means that we are called under
802 ++ * rcu_read_lock()).
803 ++ */
804 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
805 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
806 ++{
807 ++ int busy, resume;
808 ++
809 ++ busy = bfq_bfqq_busy(bfqq);
810 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
811 ++
812 ++ BUG_ON(resume && !entity->on_st);
813 ++ BUG_ON(busy && !resume && entity->on_st &&
814 ++ bfqq != bfqd->in_service_queue);
815 ++
816 ++ if (busy) {
817 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
818 ++
819 ++ if (!resume)
820 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
821 ++ else
822 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
823 ++ } else if (entity->on_st)
824 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
825 ++ bfqg_put(bfqq_group(bfqq));
826 ++
827 ++ /*
828 ++ * Here we use a reference to bfqg. We don't need a refcounter
829 ++ * as the cgroup reference will not be dropped, so that its
830 ++ * destroy() callback will not be invoked.
831 ++ */
832 ++ entity->parent = bfqg->my_entity;
833 ++ entity->sched_data = &bfqg->sched_data;
834 ++ bfqg_get(bfqg);
835 ++
836 ++ if (busy) {
837 ++ if (resume)
838 ++ bfq_activate_bfqq(bfqd, bfqq);
839 ++ }
840 ++
841 ++ if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
842 ++ bfq_schedule_dispatch(bfqd);
843 ++}
844 ++
845 ++/**
846 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
847 ++ * @bfqd: the queue descriptor.
848 ++ * @bic: the bic to move.
849 ++ * @blkcg: the blk-cgroup to move to.
850 ++ *
851 ++ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
852 ++ * has to make sure that the reference to cgroup is valid across the call.
853 ++ *
854 ++ * NOTE: an alternative approach might have been to store the current
855 ++ * cgroup in bfqq and getting a reference to it, reducing the lookup
856 ++ * time here, at the price of slightly more complex code.
857 ++ */
858 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
859 ++ struct bfq_io_cq *bic,
860 ++ struct blkcg *blkcg)
861 ++{
862 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
863 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
864 ++ struct bfq_group *bfqg;
865 ++ struct bfq_entity *entity;
866 ++
867 ++ lockdep_assert_held(bfqd->queue->queue_lock);
868 ++
869 ++ bfqg = bfq_find_alloc_group(bfqd, blkcg);
870 ++ if (async_bfqq) {
871 ++ entity = &async_bfqq->entity;
872 ++
873 ++ if (entity->sched_data != &bfqg->sched_data) {
874 ++ bic_set_bfqq(bic, NULL, 0);
875 ++ bfq_log_bfqq(bfqd, async_bfqq,
876 ++ "bic_change_group: %p %d",
877 ++ async_bfqq, atomic_read(&async_bfqq->ref));
878 ++ bfq_put_queue(async_bfqq);
879 ++ }
880 ++ }
881 ++
882 ++ if (sync_bfqq) {
883 ++ entity = &sync_bfqq->entity;
884 ++ if (entity->sched_data != &bfqg->sched_data)
885 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
886 ++ }
887 ++
888 ++ return bfqg;
889 ++}
890 ++
891 ++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
892 ++{
893 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
894 ++ struct blkcg *blkcg;
895 ++ struct bfq_group *bfqg = NULL;
896 ++ uint64_t id;
897 ++
898 ++ rcu_read_lock();
899 ++ blkcg = bio_blkcg(bio);
900 ++ id = blkcg->css.serial_nr;
901 ++ rcu_read_unlock();
902 ++
903 ++ /*
904 ++ * Check whether blkcg has changed. The condition may trigger
905 ++ * spuriously on a newly created cic but there's no harm.
906 ++ */
907 ++ if (unlikely(!bfqd) || likely(bic->blkcg_id == id))
908 ++ return;
909 ++
910 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);
911 ++ BUG_ON(!bfqg);
912 ++ bic->blkcg_id = id;
913 ++}
914 ++
915 ++/**
916 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
917 ++ * @st: the service tree being flushed.
918 ++ */
919 ++static void bfq_flush_idle_tree(struct bfq_service_tree *st)
920 ++{
921 ++ struct bfq_entity *entity = st->first_idle;
922 ++
923 ++ for (; entity ; entity = st->first_idle)
924 ++ __bfq_deactivate_entity(entity, 0);
925 ++}
926 ++
927 ++/**
928 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
929 ++ * @bfqd: the device data structure with the root group.
930 ++ * @entity: the entity to move.
931 ++ */
932 ++static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
933 ++ struct bfq_entity *entity)
934 ++{
935 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
936 ++
937 ++ BUG_ON(!bfqq);
938 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
939 ++ return;
940 ++}
941 ++
942 ++/**
943 ++ * bfq_reparent_active_entities - move to the root group all active
944 ++ * entities.
945 ++ * @bfqd: the device data structure with the root group.
946 ++ * @bfqg: the group to move from.
947 ++ * @st: the service tree with the entities.
948 ++ *
949 ++ * Needs queue_lock to be taken and reference to be valid over the call.
950 ++ */
951 ++static void bfq_reparent_active_entities(struct bfq_data *bfqd,
952 ++ struct bfq_group *bfqg,
953 ++ struct bfq_service_tree *st)
954 ++{
955 ++ struct rb_root *active = &st->active;
956 ++ struct bfq_entity *entity = NULL;
957 ++
958 ++ if (!RB_EMPTY_ROOT(&st->active))
959 ++ entity = bfq_entity_of(rb_first(active));
960 ++
961 ++ for (; entity ; entity = bfq_entity_of(rb_first(active)))
962 ++ bfq_reparent_leaf_entity(bfqd, entity);
963 ++
964 ++ if (bfqg->sched_data.in_service_entity)
965 ++ bfq_reparent_leaf_entity(bfqd,
966 ++ bfqg->sched_data.in_service_entity);
967 ++
968 ++ return;
969 ++}
970 ++
971 ++/**
972 ++ * bfq_destroy_group - destroy @bfqg.
973 ++ * @bfqg: the group being destroyed.
974 ++ *
975 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
976 ++ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic
977 ++ */
978 ++static void bfq_pd_offline(struct blkg_policy_data *pd)
979 ++{
980 ++ struct bfq_service_tree *st;
981 ++ struct bfq_group *bfqg;
982 ++ struct bfq_data *bfqd;
983 ++ struct bfq_entity *entity;
984 ++ int i;
985 ++
986 ++ BUG_ON(!pd);
987 ++ bfqg = pd_to_bfqg(pd);
988 ++ BUG_ON(!bfqg);
989 ++ bfqd = bfqg->bfqd;
990 ++ BUG_ON(bfqd && !bfqd->root_group);
991 ++
992 ++ entity = bfqg->my_entity;
993 ++
994 ++ if (!entity) /* root group */
995 ++ return;
996 ++
997 ++ /*
998 ++ * Empty all service_trees belonging to this group before
999 ++ * deactivating the group itself.
1000 ++ */
1001 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
1002 ++ BUG_ON(!bfqg->sched_data.service_tree);
1003 ++ st = bfqg->sched_data.service_tree + i;
1004 ++ /*
1005 ++ * The idle tree may still contain bfq_queues belonging
1006 ++ * to exited task because they never migrated to a different
1007 ++ * cgroup from the one being destroyed now. No one else
1008 ++ * can access them so it's safe to act without any lock.
1009 ++ */
1010 ++ bfq_flush_idle_tree(st);
1011 ++
1012 ++ /*
1013 ++ * It may happen that some queues are still active
1014 ++ * (busy) upon group destruction (if the corresponding
1015 ++ * processes have been forced to terminate). We move
1016 ++ * all the leaf entities corresponding to these queues
1017 ++ * to the root_group.
1018 ++ * Also, it may happen that the group has an entity
1019 ++ * in service, which is disconnected from the active
1020 ++ * tree: it must be moved, too.
1021 ++ * There is no need to put the sync queues, as the
1022 ++ * scheduler has taken no reference.
1023 ++ */
1024 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
1025 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
1026 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
1027 ++ }
1028 ++ BUG_ON(bfqg->sched_data.next_in_service);
1029 ++ BUG_ON(bfqg->sched_data.in_service_entity);
1030 ++
1031 ++ __bfq_deactivate_entity(entity, 0);
1032 ++ bfq_put_async_queues(bfqd, bfqg);
1033 ++ BUG_ON(entity->tree);
1034 ++
1035 ++ bfqg_stats_xfer_dead(bfqg);
1036 ++}
1037 ++
1038 ++static void bfq_end_wr_async(struct bfq_data *bfqd)
1039 ++{
1040 ++ struct blkcg_gq *blkg;
1041 ++
1042 ++ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
1043 ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
1044 ++
1045 ++ bfq_end_wr_async_queues(bfqd, bfqg);
1046 ++ }
1047 ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
1048 ++}
1049 ++
1050 ++static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
1051 ++ struct cftype *cftype)
1052 ++{
1053 ++ struct blkcg *blkcg = css_to_blkcg(css);
1054 ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
1055 ++ int ret = -EINVAL;
1056 ++
1057 ++ spin_lock_irq(&blkcg->lock);
1058 ++ ret = bfqgd->weight;
1059 ++ spin_unlock_irq(&blkcg->lock);
1060 ++
1061 ++ return ret;
1062 ++}
1063 ++
1064 ++static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v)
1065 ++{
1066 ++ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
1067 ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
1068 ++
1069 ++ spin_lock_irq(&blkcg->lock);
1070 ++ seq_printf(sf, "%u\n", bfqgd->weight);
1071 ++ spin_unlock_irq(&blkcg->lock);
1072 ++
1073 ++ return 0;
1074 ++}
1075 ++
1076 ++static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
1077 ++ struct cftype *cftype,
1078 ++ u64 val)
1079 ++{
1080 ++ struct blkcg *blkcg = css_to_blkcg(css);
1081 ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
1082 ++ struct blkcg_gq *blkg;
1083 ++ int ret = -EINVAL;
1084 ++
1085 ++ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
1086 ++ return ret;
1087 ++
1088 ++ ret = 0;
1089 ++ spin_lock_irq(&blkcg->lock);
1090 ++ bfqgd->weight = (unsigned short)val;
1091 ++ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
1092 ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
1093 ++ if (!bfqg)
1094 ++ continue;
1095 ++ /*
1096 ++ * Setting the prio_changed flag of the entity
1097 ++ * to 1 with new_weight == weight would re-set
1098 ++ * the value of the weight to its ioprio mapping.
1099 ++ * Set the flag only if necessary.
1100 ++ */
1101 ++ if ((unsigned short)val != bfqg->entity.new_weight) {
1102 ++ bfqg->entity.new_weight = (unsigned short)val;
1103 ++ /*
1104 ++ * Make sure that the above new value has been
1105 ++ * stored in bfqg->entity.new_weight before
1106 ++ * setting the prio_changed flag. In fact,
1107 ++ * this flag may be read asynchronously (in
1108 ++ * critical sections protected by a different
1109 ++ * lock than that held here), and finding this
1110 ++ * flag set may cause the execution of the code
1111 ++ * for updating parameters whose value may
1112 ++ * depend also on bfqg->entity.new_weight (in
1113 ++ * __bfq_entity_update_weight_prio).
1114 ++ * This barrier makes sure that the new value
1115 ++ * of bfqg->entity.new_weight is correctly
1116 ++ * seen in that code.
1117 ++ */
1118 ++ smp_wmb();
1119 ++ bfqg->entity.prio_changed = 1;
1120 ++ }
1121 ++ }
1122 ++ spin_unlock_irq(&blkcg->lock);
1123 ++
1124 ++ return ret;
1125 ++}
1126 ++
1127 ++static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of,
1128 ++ char *buf, size_t nbytes,
1129 ++ loff_t off)
1130 ++{
1131 ++ /* First unsigned long found in the file is used */
1132 ++ return bfqio_cgroup_weight_write(of_css(of), NULL,
1133 ++ simple_strtoull(strim(buf), NULL, 0));
1134 ++}
1135 ++
1136 ++static int bfqg_print_stat(struct seq_file *sf, void *v)
1137 ++{
1138 ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
1139 ++ &blkcg_policy_bfq, seq_cft(sf)->private, false);
1140 ++ return 0;
1141 ++}
1142 ++
1143 ++static int bfqg_print_rwstat(struct seq_file *sf, void *v)
1144 ++{
1145 ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
1146 ++ &blkcg_policy_bfq, seq_cft(sf)->private, true);
1147 ++ return 0;
1148 ++}
1149 ++
1150 ++static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
1151 ++ struct blkg_policy_data *pd, int off)
1152 ++{
1153 ++ u64 sum = bfqg_stat_pd_recursive_sum(pd, off);
1154 ++
1155 ++ return __blkg_prfill_u64(sf, pd, sum);
1156 ++}
1157 ++
1158 ++static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
1159 ++ struct blkg_policy_data *pd, int off)
1160 ++{
1161 ++ struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);
1162 ++
1163 ++ return __blkg_prfill_rwstat(sf, pd, &sum);
1164 ++}
1165 ++
1166 ++static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
1167 ++{
1168 ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1169 ++ bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
1170 ++ seq_cft(sf)->private, false);
1171 ++ return 0;
1172 ++}
1173 ++
1174 ++static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
1175 ++{
1176 ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1177 ++ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
1178 ++ seq_cft(sf)->private, true);
1179 ++ return 0;
1180 ++}
1181 ++
1182 ++static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
1183 ++ struct blkg_policy_data *pd, int off)
1184 ++{
1185 ++ struct bfq_group *bfqg = pd_to_bfqg(pd);
1186 ++ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
1187 ++ u64 v = 0;
1188 ++
1189 ++ if (samples) {
1190 ++ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
1191 ++ v = div64_u64(v, samples);
1192 ++ }
1193 ++ __blkg_prfill_u64(sf, pd, v);
1194 ++ return 0;
1195 ++}
1196 ++
1197 ++/* print avg_queue_size */
1198 ++static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
1199 ++{
1200 ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
1201 ++ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
1202 ++ 0, false);
1203 ++ return 0;
1204 ++}
1205 ++
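The exported bfq.avg_queue_size value produced by the two functions above is a plain sample mean: avg_queue_size_sum accumulates the number of queued requests at each sampling point, avg_queue_size_samples counts those points, and the report is sum / samples (or 0 when there are no samples). In isolation, with made-up numbers:

/* The sample-mean computation performed above, in isolation (illustration only). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sum = 57, samples = 19;		/* invented accumulated values */
	uint64_t avg = samples ? sum / samples : 0;	/* div64_u64() in the kernel code */

	printf("avg_queue_size = %llu\n", (unsigned long long)avg);
	return 0;
}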
1206 ++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
1207 ++{
1208 ++ int ret;
1209 ++
1210 ++ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
1211 ++ if (ret)
1212 ++ return NULL;
1213 ++
1214 ++ return blkg_to_bfqg(bfqd->queue->root_blkg);
1215 ++}
1216 ++
1217 ++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
1218 ++{
1219 ++ struct bfq_group_data *bgd;
1220 ++
1221 ++ bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
1222 ++ if (!bgd)
1223 ++ return NULL;
1224 ++ return &bgd->pd;
1225 ++}
1226 ++
1227 ++static void bfq_cpd_free(struct blkcg_policy_data *cpd)
1228 ++{
1229 ++ kfree(cpd_to_bfqgd(cpd));
1230 ++}
1231 ++
1232 ++static struct cftype bfqio_files_dfl[] = {
1233 ++ {
1234 ++ .name = "weight",
1235 ++ .flags = CFTYPE_NOT_ON_ROOT,
1236 ++ .seq_show = bfqio_cgroup_weight_read_dfl,
1237 ++ .write = bfqio_cgroup_weight_write_dfl,
1238 ++ },
1239 ++ {} /* terminate */
1240 ++};
1241 ++
1242 ++static struct cftype bfqio_files[] = {
1243 ++ {
1244 ++ .name = "bfq.weight",
1245 ++ .read_u64 = bfqio_cgroup_weight_read,
1246 ++ .write_u64 = bfqio_cgroup_weight_write,
1247 ++ },
1248 ++ /* statistics, cover only the tasks in the bfqg */
1249 ++ {
1250 ++ .name = "bfq.time",
1251 ++ .private = offsetof(struct bfq_group, stats.time),
1252 ++ .seq_show = bfqg_print_stat,
1253 ++ },
1254 ++ {
1255 ++ .name = "bfq.sectors",
1256 ++ .private = offsetof(struct bfq_group, stats.sectors),
1257 ++ .seq_show = bfqg_print_stat,
1258 ++ },
1259 ++ {
1260 ++ .name = "bfq.io_service_bytes",
1261 ++ .private = offsetof(struct bfq_group, stats.service_bytes),
1262 ++ .seq_show = bfqg_print_rwstat,
1263 ++ },
1264 ++ {
1265 ++ .name = "bfq.io_serviced",
1266 ++ .private = offsetof(struct bfq_group, stats.serviced),
1267 ++ .seq_show = bfqg_print_rwstat,
1268 ++ },
1269 ++ {
1270 ++ .name = "bfq.io_service_time",
1271 ++ .private = offsetof(struct bfq_group, stats.service_time),
1272 ++ .seq_show = bfqg_print_rwstat,
1273 ++ },
1274 ++ {
1275 ++ .name = "bfq.io_wait_time",
1276 ++ .private = offsetof(struct bfq_group, stats.wait_time),
1277 ++ .seq_show = bfqg_print_rwstat,
1278 ++ },
1279 ++ {
1280 ++ .name = "bfq.io_merged",
1281 ++ .private = offsetof(struct bfq_group, stats.merged),
1282 ++ .seq_show = bfqg_print_rwstat,
1283 ++ },
1284 ++ {
1285 ++ .name = "bfq.io_queued",
1286 ++ .private = offsetof(struct bfq_group, stats.queued),
1287 ++ .seq_show = bfqg_print_rwstat,
1288 ++ },
1289 ++
1290 ++ /* the same statistics which cover the bfqg and its descendants */
1291 ++ {
1292 ++ .name = "bfq.time_recursive",
1293 ++ .private = offsetof(struct bfq_group, stats.time),
1294 ++ .seq_show = bfqg_print_stat_recursive,
1295 ++ },
1296 ++ {
1297 ++ .name = "bfq.sectors_recursive",
1298 ++ .private = offsetof(struct bfq_group, stats.sectors),
1299 ++ .seq_show = bfqg_print_stat_recursive,
1300 ++ },
1301 ++ {
1302 ++ .name = "bfq.io_service_bytes_recursive",
1303 ++ .private = offsetof(struct bfq_group, stats.service_bytes),
1304 ++ .seq_show = bfqg_print_rwstat_recursive,
1305 ++ },
1306 ++ {
1307 ++ .name = "bfq.io_serviced_recursive",
1308 ++ .private = offsetof(struct bfq_group, stats.serviced),
1309 ++ .seq_show = bfqg_print_rwstat_recursive,
1310 ++ },
1311 ++ {
1312 ++ .name = "bfq.io_service_time_recursive",
1313 ++ .private = offsetof(struct bfq_group, stats.service_time),
1314 ++ .seq_show = bfqg_print_rwstat_recursive,
1315 ++ },
1316 ++ {
1317 ++ .name = "bfq.io_wait_time_recursive",
1318 ++ .private = offsetof(struct bfq_group, stats.wait_time),
1319 ++ .seq_show = bfqg_print_rwstat_recursive,
1320 ++ },
1321 ++ {
1322 ++ .name = "bfq.io_merged_recursive",
1323 ++ .private = offsetof(struct bfq_group, stats.merged),
1324 ++ .seq_show = bfqg_print_rwstat_recursive,
1325 ++ },
1326 ++ {
1327 ++ .name = "bfq.io_queued_recursive",
1328 ++ .private = offsetof(struct bfq_group, stats.queued),
1329 ++ .seq_show = bfqg_print_rwstat_recursive,
1330 ++ },
1331 ++ {
1332 ++ .name = "bfq.avg_queue_size",
1333 ++ .seq_show = bfqg_print_avg_queue_size,
1334 ++ },
1335 ++ {
1336 ++ .name = "bfq.group_wait_time",
1337 ++ .private = offsetof(struct bfq_group, stats.group_wait_time),
1338 ++ .seq_show = bfqg_print_stat,
1339 ++ },
1340 ++ {
1341 ++ .name = "bfq.idle_time",
1342 ++ .private = offsetof(struct bfq_group, stats.idle_time),
1343 ++ .seq_show = bfqg_print_stat,
1344 ++ },
1345 ++ {
1346 ++ .name = "bfq.empty_time",
1347 ++ .private = offsetof(struct bfq_group, stats.empty_time),
1348 ++ .seq_show = bfqg_print_stat,
1349 ++ },
1350 ++ {
1351 ++ .name = "bfq.dequeue",
1352 ++ .private = offsetof(struct bfq_group, stats.dequeue),
1353 ++ .seq_show = bfqg_print_stat,
1354 ++ },
1355 ++ {
1356 ++ .name = "bfq.unaccounted_time",
1357 ++ .private = offsetof(struct bfq_group, stats.unaccounted_time),
1358 ++ .seq_show = bfqg_print_stat,
1359 ++ },
1360 ++ { } /* terminate */
1361 ++};
1362 ++
1363 ++static struct blkcg_policy blkcg_policy_bfq = {
1364 ++ .dfl_cftypes = bfqio_files_dfl,
1365 ++ .legacy_cftypes = bfqio_files,
1366 ++
1367 ++ .pd_alloc_fn = bfq_pd_alloc,
1368 ++ .pd_init_fn = bfq_pd_init,
1369 ++ .pd_offline_fn = bfq_pd_offline,
1370 ++ .pd_free_fn = bfq_pd_free,
1371 ++ .pd_reset_stats_fn = bfq_pd_reset_stats,
1372 ++
1373 ++ .cpd_alloc_fn = bfq_cpd_alloc,
1374 ++ .cpd_init_fn = bfq_cpd_init,
1375 ++ .cpd_bind_fn = bfq_cpd_init,
1376 ++ .cpd_free_fn = bfq_cpd_free,
1377 ++
1378 ++};
1379 ++
1380 ++#else
1381 ++
1382 ++static void bfq_init_entity(struct bfq_entity *entity,
1383 ++ struct bfq_group *bfqg)
1384 ++{
1385 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
1386 ++ entity->weight = entity->new_weight;
1387 ++ entity->orig_weight = entity->new_weight;
1388 ++ if (bfqq) {
1389 ++ bfqq->ioprio = bfqq->new_ioprio;
1390 ++ bfqq->ioprio_class = bfqq->new_ioprio_class;
1391 ++ }
1392 ++ entity->sched_data = &bfqg->sched_data;
1393 ++}
1394 ++
1395 ++static struct bfq_group *
1396 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
1397 ++{
1398 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
1399 ++ return bfqd->root_group;
1400 ++}
1401 ++
1402 ++static void bfq_bfqq_move(struct bfq_data *bfqd,
1403 ++ struct bfq_queue *bfqq,
1404 ++ struct bfq_entity *entity,
1405 ++ struct bfq_group *bfqg)
1406 ++{
1407 ++}
1408 ++
1409 ++static void bfq_end_wr_async(struct bfq_data *bfqd)
1410 ++{
1411 ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
1412 ++}
1413 ++
1414 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
1415 ++{
1416 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
1417 ++}
1418 ++
1419 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
1420 ++ struct blkcg *blkcg)
1421 ++{
1422 ++ return bfqd->root_group;
1423 ++}
1424 ++
1425 ++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
1426 ++{
1427 ++ struct bfq_group *bfqg;
1428 ++ int i;
1429 ++
1430 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1431 ++ if (!bfqg)
1432 ++ return NULL;
1433 ++
1434 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1435 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1436 ++
1437 ++ return bfqg;
1438 ++}
1439 ++#endif
1440 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1441 +new file mode 100644
1442 +index 0000000..fb7bb8f
1443 +--- /dev/null
1444 ++++ b/block/bfq-ioc.c
1445 +@@ -0,0 +1,36 @@
1446 ++/*
1447 ++ * BFQ: I/O context handling.
1448 ++ *
1449 ++ * Based on ideas and code from CFQ:
1450 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1451 ++ *
1452 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1453 ++ * Paolo Valente <paolo.valente@×××××××.it>
1454 ++ *
1455 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1456 ++ */
1457 ++
1458 ++/**
1459 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1460 ++ * @icq: the iocontext queue.
1461 ++ */
1462 ++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1463 ++{
1464 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
1465 ++ return container_of(icq, struct bfq_io_cq, icq);
1466 ++}
1467 ++
1468 ++/**
1469 ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1470 ++ * @bfqd: the lookup key.
1471 ++ * @ioc: the io_context of the process doing I/O.
1472 ++ *
1473 ++ * Queue lock must be held.
1474 ++ */
1475 ++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1476 ++ struct io_context *ioc)
1477 ++{
1478 ++ if (ioc)
1479 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1480 ++ return NULL;
1481 ++}
1482 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1483 +new file mode 100644
1484 +index 0000000..f9787a6
1485 +--- /dev/null
1486 ++++ b/block/bfq-iosched.c
1487 +@@ -0,0 +1,3754 @@
1488 ++/*
1489 ++ * Budget Fair Queueing (BFQ) disk scheduler.
1490 ++ *
1491 ++ * Based on ideas and code from CFQ:
1492 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1493 ++ *
1494 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1495 ++ * Paolo Valente <paolo.valente@×××××××.it>
1496 ++ *
1497 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1498 ++ *
1499 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
1500 ++ * file.
1501 ++ *
1502 ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on
1503 ++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
1504 ++ * measured in number of sectors, to processes instead of time slices. The
1505 ++ * device is not granted to the in-service process for a given time slice,
1506 ++ * but until it has exhausted its assigned budget. This change from the time
1507 ++ * to the service domain allows BFQ to distribute the device throughput
1508 ++ * among processes as desired, without any distortion due to ZBR, workload
1509 ++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
1510 ++ * called B-WF2Q+, to schedule processes according to their budgets. More
1511 ++ * precisely, BFQ schedules queues associated to processes. Thanks to the
1512 ++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
1513 ++ * I/O-bound processes issuing sequential requests (to boost the
1514 ++ * throughput), and yet guarantee a low latency to interactive and soft
1515 ++ * real-time applications.
1516 ++ *
1517 ++ * BFQ is described in [1], where also a reference to the initial, more
1518 ++ * theoretical paper on BFQ can be found. The interested reader can find
1519 ++ * in the latter paper full details on the main algorithm, as well as
1520 ++ * formulas of the guarantees and formal proofs of all the properties.
1521 ++ * With respect to the version of BFQ presented in these papers, this
1522 ++ * implementation adds a few more heuristics, such as the one that
1523 ++ * guarantees a low latency to soft real-time applications, and a
1524 ++ * hierarchical extension based on H-WF2Q+.
1525 ++ *
1526 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1527 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1528 ++ * complexity derives from the one introduced with EEVDF in [3].
1529 ++ *
1530 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1531 ++ * with the BFQ Disk I/O Scheduler'',
1532 ++ * Proceedings of the 5th Annual International Systems and Storage
1533 ++ * Conference (SYSTOR '12), June 2012.
1534 ++ *
1535 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1536 ++ *
1537 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1538 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1539 ++ * Oct 1997.
1540 ++ *
1541 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1542 ++ *
1543 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1544 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
1545 ++ * Resource Allocation,'' technical report.
1546 ++ *
1547 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1548 ++ */
1549 ++#include <linux/module.h>
1550 ++#include <linux/slab.h>
1551 ++#include <linux/blkdev.h>
1552 ++#include <linux/cgroup.h>
1553 ++#include <linux/elevator.h>
1554 ++#include <linux/jiffies.h>
1555 ++#include <linux/rbtree.h>
1556 ++#include <linux/ioprio.h>
1557 ++#include "bfq.h"
1558 ++#include "blk.h"
1559 ++
1560 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1561 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1562 ++
1563 ++/* Maximum backwards seek, in KiB. */
1564 ++static const int bfq_back_max = 16 * 1024;
1565 ++
1566 ++/* Penalty of a backwards seek, in number of sectors. */
1567 ++static const int bfq_back_penalty = 2;
1568 ++
1569 ++/* Idling period duration, in jiffies. */
1570 ++static int bfq_slice_idle = HZ / 125;
1571 ++
1572 ++/* Minimum number of assigned budgets for which stats are safe to compute. */
1573 ++static const int bfq_stats_min_budgets = 194;
1574 ++
1575 ++/* Default maximum budget values, in sectors and number of requests. */
1576 ++static const int bfq_default_max_budget = 16 * 1024;
1577 ++static const int bfq_max_budget_async_rq = 4;
1578 ++
1579 ++/*
1580 ++ * Async to sync throughput distribution is controlled as follows:
1581 ++ * when an async request is served, the entity is charged the number
1582 ++ * of sectors of the request, multiplied by the factor below
1583 ++ */
1584 ++static const int bfq_async_charge_factor = 10;
1585 ++
1586 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
1587 ++static const int bfq_timeout_sync = HZ / 8;
1588 ++static int bfq_timeout_async = HZ / 25;
1589 ++
1590 ++struct kmem_cache *bfq_pool;
1591 ++
1592 ++/* Below this threshold (in ms), we consider thinktime immediate. */
1593 ++#define BFQ_MIN_TT 2
1594 ++
1595 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
1596 ++#define BFQ_HW_QUEUE_THRESHOLD 4
1597 ++#define BFQ_HW_QUEUE_SAMPLES 32
1598 ++
1599 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1600 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1601 ++
1602 ++/* Min samples used for peak rate estimation (for autotuning). */
1603 ++#define BFQ_PEAK_RATE_SAMPLES 32
1604 ++
1605 ++/* Shift used for peak rate fixed precision calculations. */
1606 ++#define BFQ_RATE_SHIFT 16
1607 ++
1608 ++/*
1609 ++ * By default, BFQ computes the duration of the weight raising for
1610 ++ * interactive applications automatically, using the following formula:
1611 ++ * duration = (R / r) * T, where r is the peak rate of the device, and
1612 ++ * R and T are two reference parameters.
1613 ++ * In particular, R is the peak rate of the reference device (see below),
1614 ++ * and T is a reference time: given the systems that are likely to be
1615 ++ * installed on the reference device according to its speed class, T is
1616 ++ * about the maximum time needed, under BFQ and while reading two files in
1617 ++ * parallel, to load typical large applications on these systems.
1618 ++ * In practice, the slower/faster the device at hand is, the more/less it
1619 ++ * takes to load applications with respect to the reference device.
1620 ++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
1621 ++ * applications.
1622 ++ *
1623 ++ * BFQ uses four different reference pairs (R, T), depending on:
1624 ++ * . whether the device is rotational or non-rotational;
1625 ++ * . whether the device is slow, such as old or portable HDDs, as well as
1626 ++ * SD cards, or fast, such as newer HDDs and SSDs.
1627 ++ *
1628 ++ * The device's speed class is dynamically (re)detected in
1629 ++ * bfq_update_peak_rate() every time the estimated peak rate is updated.
1630 ++ *
1631 ++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
1632 ++ * are the reference values for a slow/fast rotational device, whereas
1633 ++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
1634 ++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the
1635 ++ * thresholds used to switch between speed classes.
1636 ++ * Both the reference peak rates and the thresholds are measured in
1637 ++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
1638 ++ */
1639 ++static int R_slow[2] = {1536, 10752};
1640 ++static int R_fast[2] = {17415, 34791};
1641 ++/*
1642 ++ * To improve readability, a conversion function is used to initialize the
1643 ++ * following arrays, which means that they can be initialized only inside a
1644 ++ * function.
1645 ++ */
1646 ++static int T_slow[2];
1647 ++static int T_fast[2];
1648 ++static int device_speed_thresh[2];
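/*
 * Illustrative sketch of the duration formula above, with hypothetical
 * numbers: if the reference pair for the detected speed class is (R, T)
 * and the estimated peak rate r of the device at hand is half of R, then
 * duration = (R / r) * T = 2 * T, i.e., interactive weight raising lasts
 * twice as long as on the reference device; a device twice as fast as
 * the reference one gets duration = T / 2 instead.
 */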
1649 ++
1650 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1651 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1652 ++
1653 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1654 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1655 ++
1656 ++static void bfq_schedule_dispatch(struct bfq_data *bfqd);
1657 ++
1658 ++#include "bfq-ioc.c"
1659 ++#include "bfq-sched.c"
1660 ++#include "bfq-cgroup.c"
1661 ++
1662 ++#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
1663 ++#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
1664 ++
1665 ++#define bfq_sample_valid(samples) ((samples) > 80)
1666 ++
1667 ++/*
1668 ++ * We regard a request as SYNC if it is either a read or has the SYNC bit
1669 ++ * set (in which case it could also be a direct WRITE).
1670 ++ */
1671 ++static int bfq_bio_sync(struct bio *bio)
1672 ++{
1673 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1674 ++ return 1;
1675 ++
1676 ++ return 0;
1677 ++}
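/*
 * Example of the rule above: a read is always treated as sync, and so is
 * a write carrying REQ_SYNC (e.g. a direct write); a plain buffered
 * writeback request, which is neither a read nor marked REQ_SYNC, is the
 * typical case classified as async.
 */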
1678 ++
1679 ++/*
1680 ++ * Scheduler run of queue, if there are requests pending and no one in the
1681 ++ * driver that will restart queueing.
1682 ++ */
1683 ++static void bfq_schedule_dispatch(struct bfq_data *bfqd)
1684 ++{
1685 ++ if (bfqd->queued != 0) {
1686 ++ bfq_log(bfqd, "schedule dispatch");
1687 ++ kblockd_schedule_work(&bfqd->unplug_work);
1688 ++ }
1689 ++}
1690 ++
1691 ++/*
1692 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1693 ++ * We choose the request that is closest to the head right now. Distance
1694 ++ * behind the head is penalized and only allowed to a certain extent.
1695 ++ */
1696 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
1697 ++ struct request *rq1,
1698 ++ struct request *rq2,
1699 ++ sector_t last)
1700 ++{
1701 ++ sector_t s1, s2, d1 = 0, d2 = 0;
1702 ++ unsigned long back_max;
1703 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1704 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1705 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1706 ++
1707 ++ if (!rq1 || rq1 == rq2)
1708 ++ return rq2;
1709 ++ if (!rq2)
1710 ++ return rq1;
1711 ++
1712 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1713 ++ return rq1;
1714 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1715 ++ return rq2;
1716 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1717 ++ return rq1;
1718 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1719 ++ return rq2;
1720 ++
1721 ++ s1 = blk_rq_pos(rq1);
1722 ++ s2 = blk_rq_pos(rq2);
1723 ++
1724 ++ /*
1725 ++ * By definition, 1KiB is 2 sectors.
1726 ++ */
1727 ++ back_max = bfqd->bfq_back_max * 2;
1728 ++
1729 ++ /*
1730 ++ * Strict one way elevator _except_ in the case where we allow
1731 ++ * short backward seeks which are biased as twice the cost of a
1732 ++ * similar forward seek.
1733 ++ */
1734 ++ if (s1 >= last)
1735 ++ d1 = s1 - last;
1736 ++ else if (s1 + back_max >= last)
1737 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
1738 ++ else
1739 ++ wrap |= BFQ_RQ1_WRAP;
1740 ++
1741 ++ if (s2 >= last)
1742 ++ d2 = s2 - last;
1743 ++ else if (s2 + back_max >= last)
1744 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
1745 ++ else
1746 ++ wrap |= BFQ_RQ2_WRAP;
1747 ++
1748 ++ /* Found required data */
1749 ++
1750 ++ /*
1751 ++ * By doing switch() on the bit mask "wrap" we avoid having to
1752 ++ * check two variables for all permutations: --> faster!
1753 ++ */
1754 ++ switch (wrap) {
1755 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1756 ++ if (d1 < d2)
1757 ++ return rq1;
1758 ++ else if (d2 < d1)
1759 ++ return rq2;
1760 ++ else {
1761 ++ if (s1 >= s2)
1762 ++ return rq1;
1763 ++ else
1764 ++ return rq2;
1765 ++ }
1766 ++
1767 ++ case BFQ_RQ2_WRAP:
1768 ++ return rq1;
1769 ++ case BFQ_RQ1_WRAP:
1770 ++ return rq2;
1771 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1772 ++ default:
1773 ++ /*
1774 ++ * Since both rqs are wrapped,
1775 ++ * start with the one that's further behind head
1776 ++ * (--> only *one* back seek required),
1777 ++ * since back seek takes more time than forward.
1778 ++ */
1779 ++ if (s1 <= s2)
1780 ++ return rq1;
1781 ++ else
1782 ++ return rq2;
1783 ++ }
1784 ++}
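/*
 * Numerical sketch of the backward-seek logic above, with hypothetical
 * positions: with the default bfq_back_max of 16384 KiB (32768 sectors
 * after the KiB-to-sector conversion in the function) and
 * bfq_back_penalty = 2, a request sitting 1000 sectors behind the head
 * gets an effective distance of 1000 * 2 = 2000 sectors, so it loses to
 * any forward request closer than 2000 sectors, while a request lying
 * more than 32768 sectors behind is marked as wrapped and loses to any
 * non-wrapped request.
 */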
1785 ++
1786 ++/*
1787 ++ * Tell whether there are active queues or groups with differentiated weights.
1788 ++ */
1789 ++static bool bfq_differentiated_weights(struct bfq_data *bfqd)
1790 ++{
1791 ++ /*
1792 ++ * For weights to differ, at least one of the trees must contain
1793 ++ * at least two nodes.
1794 ++ */
1795 ++ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
1796 ++ (bfqd->queue_weights_tree.rb_node->rb_left ||
1797 ++ bfqd->queue_weights_tree.rb_node->rb_right)
1798 ++#ifdef CONFIG_BFQ_GROUP_IOSCHED
1799 ++ ) ||
1800 ++ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
1801 ++ (bfqd->group_weights_tree.rb_node->rb_left ||
1802 ++ bfqd->group_weights_tree.rb_node->rb_right)
1803 ++#endif
1804 ++ );
1805 ++}
1806 ++
1807 ++/*
1808 ++ * The following function returns true if every queue must receive the
1809 ++ * same share of the throughput (this condition is used when deciding
1810 ++ * whether idling may be disabled, see the comments in the function
1811 ++ * bfq_bfqq_may_idle()).
1812 ++ *
1813 ++ * Such a scenario occurs when:
1814 ++ * 1) all active queues have the same weight,
1815 ++ * 2) all active groups at the same level in the groups tree have the same
1816 ++ * weight,
1817 ++ * 3) all active groups at the same level in the groups tree have the same
1818 ++ * number of children.
1819 ++ *
1820 ++ * Unfortunately, keeping the necessary state for evaluating exactly the
1821 ++ * above symmetry conditions would be quite complex and time-consuming.
1822 ++ * Therefore this function evaluates, instead, the following stronger
1823 ++ * sub-conditions, for which it is much easier to maintain the needed
1824 ++ * state:
1825 ++ * 1) all active queues have the same weight,
1826 ++ * 2) all active groups have the same weight,
1827 ++ * 3) all active groups have at most one active child each.
1828 ++ * In particular, the last two conditions are always true if hierarchical
1829 ++ * support and the cgroups interface are not enabled, thus no state needs
1830 ++ * to be maintained in this case.
1831 ++ */
1832 ++static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
1833 ++{
1834 ++ return
1835 ++#ifdef CONFIG_BFQ_GROUP_IOSCHED
1836 ++ !bfqd->active_numerous_groups &&
1837 ++#endif
1838 ++ !bfq_differentiated_weights(bfqd);
1839 ++}
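/*
 * Example: if two active queues have weights 100 and 200, the
 * queue_weights_tree contains two counter nodes, so
 * bfq_differentiated_weights() returns true and the scenario is not
 * symmetric; idling may then still be needed to preserve the guarantees
 * of the heavier queue. If instead all active queues share one weight
 * (and, with cgroups enabled, the analogous conditions hold for groups),
 * the tree degenerates to a single node and the scenario is treated as
 * symmetric.
 */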
1840 ++
1841 ++/*
1842 ++ * If the weight-counter tree passed as input contains no counter for
1843 ++ * the weight of the input entity, then add that counter; otherwise just
1844 ++ * increment the existing counter.
1845 ++ *
1846 ++ * Note that weight-counter trees contain few nodes in mostly symmetric
1847 ++ * scenarios. For example, if all queues have the same weight, then the
1848 ++ * weight-counter tree for the queues may contain at most one node.
1849 ++ * This holds even if low_latency is on, because weight-raised queues
1850 ++ * are not inserted in the tree.
1851 ++ * In most scenarios, the rate at which nodes are created/destroyed
1852 ++ * should be low too.
1853 ++ */
1854 ++static void bfq_weights_tree_add(struct bfq_data *bfqd,
1855 ++ struct bfq_entity *entity,
1856 ++ struct rb_root *root)
1857 ++{
1858 ++ struct rb_node **new = &(root->rb_node), *parent = NULL;
1859 ++
1860 ++ /*
1861 ++ * Do not insert if the entity is already associated with a
1862 ++ * counter, which happens if:
1863 ++ * 1) the entity is associated with a queue,
1864 ++ * 2) a request arrival has caused the queue to become both
1865 ++ * non-weight-raised, and hence change its weight, and
1866 ++ * backlogged; in this respect, each of the two events
1867 ++ * causes an invocation of this function,
1868 ++ * 3) this is the invocation of this function caused by the
1869 ++ * second event. This second invocation is actually useless,
1870 ++ * and we handle this fact by exiting immediately. More
1871 ++ * efficient or clearer solutions might possibly be adopted.
1872 ++ */
1873 ++ if (entity->weight_counter)
1874 ++ return;
1875 ++
1876 ++ while (*new) {
1877 ++ struct bfq_weight_counter *__counter = container_of(*new,
1878 ++ struct bfq_weight_counter,
1879 ++ weights_node);
1880 ++ parent = *new;
1881 ++
1882 ++ if (entity->weight == __counter->weight) {
1883 ++ entity->weight_counter = __counter;
1884 ++ goto inc_counter;
1885 ++ }
1886 ++ if (entity->weight < __counter->weight)
1887 ++ new = &((*new)->rb_left);
1888 ++ else
1889 ++ new = &((*new)->rb_right);
1890 ++ }
1891 ++
1892 ++ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
1893 ++ GFP_ATOMIC);
1894 ++ entity->weight_counter->weight = entity->weight;
1895 ++ rb_link_node(&entity->weight_counter->weights_node, parent, new);
1896 ++ rb_insert_color(&entity->weight_counter->weights_node, root);
1897 ++
1898 ++inc_counter:
1899 ++ entity->weight_counter->num_active++;
1900 ++}
1901 ++
1902 ++/*
1903 ++ * Decrement the weight counter associated with the entity, and, if the
1904 ++ * counter reaches 0, remove the counter from the tree.
1905 ++ * See the comments to the function bfq_weights_tree_add() for considerations
1906 ++ * about overhead.
1907 ++ */
1908 ++static void bfq_weights_tree_remove(struct bfq_data *bfqd,
1909 ++ struct bfq_entity *entity,
1910 ++ struct rb_root *root)
1911 ++{
1912 ++ if (!entity->weight_counter)
1913 ++ return;
1914 ++
1915 ++ BUG_ON(RB_EMPTY_ROOT(root));
1916 ++ BUG_ON(entity->weight_counter->weight != entity->weight);
1917 ++
1918 ++ BUG_ON(!entity->weight_counter->num_active);
1919 ++ entity->weight_counter->num_active--;
1920 ++ if (entity->weight_counter->num_active > 0)
1921 ++ goto reset_entity_pointer;
1922 ++
1923 ++ rb_erase(&entity->weight_counter->weights_node, root);
1924 ++ kfree(entity->weight_counter);
1925 ++
1926 ++reset_entity_pointer:
1927 ++ entity->weight_counter = NULL;
1928 ++}
1929 ++
1930 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1931 ++ struct bfq_queue *bfqq,
1932 ++ struct request *last)
1933 ++{
1934 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
1935 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
1936 ++ struct request *next = NULL, *prev = NULL;
1937 ++
1938 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1939 ++
1940 ++ if (rbprev)
1941 ++ prev = rb_entry_rq(rbprev);
1942 ++
1943 ++ if (rbnext)
1944 ++ next = rb_entry_rq(rbnext);
1945 ++ else {
1946 ++ rbnext = rb_first(&bfqq->sort_list);
1947 ++ if (rbnext && rbnext != &last->rb_node)
1948 ++ next = rb_entry_rq(rbnext);
1949 ++ }
1950 ++
1951 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1952 ++}
1953 ++
1954 ++/* see the definition of bfq_async_charge_factor for details */
1955 ++static unsigned long bfq_serv_to_charge(struct request *rq,
1956 ++ struct bfq_queue *bfqq)
1957 ++{
1958 ++ return blk_rq_sectors(rq) *
1959 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
1960 ++ bfq_async_charge_factor));
1961 ++}
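/*
 * Charging example with a hypothetical request: a sync request of 8
 * sectors is charged exactly 8 sectors of budget, whereas an async
 * request of the same size from a non-weight-raised queue
 * (wr_coeff == 1) is charged 8 * (1 + bfq_async_charge_factor) = 88
 * sectors, which skews the throughput distribution in favour of sync
 * I/O.
 */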
1962 ++
1963 ++/**
1964 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
1965 ++ * @bfqd: the device data the queue belongs to.
1966 ++ * @bfqq: the queue to update.
1967 ++ *
1968 ++ * If the first request of a queue changes we make sure that the queue
1969 ++ * has enough budget to serve at least its first request (if the
1970 ++ * request has grown). We do this because, if the queue does not have enough
1971 ++ * budget for its first request, it has to go through two dispatch
1972 ++ * rounds to actually get it dispatched.
1973 ++ */
1974 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
1975 ++ struct bfq_queue *bfqq)
1976 ++{
1977 ++ struct bfq_entity *entity = &bfqq->entity;
1978 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1979 ++ struct request *next_rq = bfqq->next_rq;
1980 ++ unsigned long new_budget;
1981 ++
1982 ++ if (!next_rq)
1983 ++ return;
1984 ++
1985 ++ if (bfqq == bfqd->in_service_queue)
1986 ++ /*
1987 ++ * In order not to break guarantees, budgets cannot be
1988 ++ * changed after an entity has been selected.
1989 ++ */
1990 ++ return;
1991 ++
1992 ++ BUG_ON(entity->tree != &st->active);
1993 ++ BUG_ON(entity == entity->sched_data->in_service_entity);
1994 ++
1995 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
1996 ++ bfq_serv_to_charge(next_rq, bfqq));
1997 ++ if (entity->budget != new_budget) {
1998 ++ entity->budget = new_budget;
1999 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
2000 ++ new_budget);
2001 ++ bfq_activate_bfqq(bfqd, bfqq);
2002 ++ }
2003 ++}
2004 ++
2005 ++static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
2006 ++{
2007 ++ u64 dur;
2008 ++
2009 ++ if (bfqd->bfq_wr_max_time > 0)
2010 ++ return bfqd->bfq_wr_max_time;
2011 ++
2012 ++ dur = bfqd->RT_prod;
2013 ++ do_div(dur, bfqd->peak_rate);
2014 ++
2015 ++ return dur;
2016 ++}
2017 ++
2018 ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
2019 ++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2020 ++{
2021 ++ struct bfq_queue *item;
2022 ++ struct hlist_node *n;
2023 ++
2024 ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
2025 ++ hlist_del_init(&item->burst_list_node);
2026 ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
2027 ++ bfqd->burst_size = 1;
2028 ++}
2029 ++
2030 ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
2031 ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2032 ++{
2033 ++ /* Increment burst size to take bfqq into account as well */
2034 ++ bfqd->burst_size++;
2035 ++
2036 ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
2037 ++ struct bfq_queue *pos, *bfqq_item;
2038 ++ struct hlist_node *n;
2039 ++
2040 ++ /*
2041 ++ * Enough queues have been activated shortly after each
2042 ++ * other to consider this burst as large.
2043 ++ */
2044 ++ bfqd->large_burst = true;
2045 ++
2046 ++ /*
2047 ++ * We can now mark all queues in the burst list as
2048 ++ * belonging to a large burst.
2049 ++ */
2050 ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
2051 ++ burst_list_node)
2052 ++ bfq_mark_bfqq_in_large_burst(bfqq_item);
2053 ++ bfq_mark_bfqq_in_large_burst(bfqq);
2054 ++
2055 ++ /*
2056 ++ * From now on, and until the current burst finishes, any
2057 ++ * new queue being activated shortly after the last queue
2058 ++ * was inserted in the burst can be immediately marked as
2059 ++ * belonging to a large burst. So the burst list is not
2060 ++ * needed any more. Remove it.
2061 ++ */
2062 ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
2063 ++ burst_list_node)
2064 ++ hlist_del_init(&pos->burst_list_node);
2065 ++ } else /* burst not yet large: add bfqq to the burst list */
2066 ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
2067 ++}
2068 ++
2069 ++/*
2070 ++ * If many queues happen to become active shortly after each other, then,
2071 ++ * to help the processes associated with these queues get their job done as
2072 ++ * soon as possible, it is usually better to not grant either weight-raising
2073 ++ * or device idling to these queues. In this comment we describe, firstly,
2074 ++ * the reasons why this fact holds, and, secondly, the next function, which
2075 ++ * implements the main steps needed to properly mark these queues so that
2076 ++ * they can then be treated in a different way.
2077 ++ *
2078 ++ * As for the terminology, we say that a queue becomes active, i.e.,
2079 ++ * switches from idle to backlogged, either when it is created (as a
2080 ++ * consequence of the arrival of an I/O request), or, if already existing,
2081 ++ * when a new request for the queue arrives while the queue is idle.
2082 ++ * Bursts of activations, i.e., activations of different queues occurring
2083 ++ * shortly after each other, are typically caused by services or applications
2084 ++ * that spawn or reactivate many parallel threads/processes. Examples are
2085 ++ * systemd during boot or git grep.
2086 ++ *
2087 ++ * These services or applications benefit mostly from a high throughput:
2088 ++ * the quicker the requests of the activated queues are cumulatively served,
2089 ++ * the sooner the target job of these queues gets completed. As a consequence,
2090 ++ * weight-raising any of these queues, which also implies idling the device
2091 ++ * for it, is almost always counterproductive: in most cases it just lowers
2092 ++ * throughput.
2093 ++ *
2094 ++ * On the other hand, a burst of activations may also be caused by the start
2095 ++ * of an application that does not consist of many parallel I/O-bound
2096 ++ * threads. In fact, with a complex application, the burst may be just a
2097 ++ * consequence of the fact that several processes need to be executed to
2098 ++ * start up the application. To start an application as quickly as possible,
2099 ++ * the best thing to do is to privilege the I/O related to the application
2100 ++ * with respect to all other I/O. Therefore, the best strategy to start an
2101 ++ * application that causes a burst of activations as quickly as possible is
2102 ++ * to weight-raise all the queues activated during the burst. This is the
2103 ++ * exact opposite of the best strategy for the other type of bursts.
2104 ++ *
2105 ++ * In the end, to take the best action for each of the two cases, the two
2106 ++ * types of bursts need to be distinguished. Fortunately, this seems
2107 ++ * relatively easy to do, by looking at the sizes of the bursts. In
2108 ++ * particular, we found a threshold such that bursts larger than that
2109 ++ * threshold are apparently caused only by services or commands such as
2110 ++ * systemd or git grep. For brevity, hereafter we simply call these bursts
2111 ++ * 'large'. BFQ *does not* weight-raise queues whose activations occur
2112 ++ * in a large burst. In addition, for each of these queues BFQ performs or
2113 ++ * does not perform idling depending on which choice boosts the throughput
2114 ++ * most. The exact choice depends on the device and request pattern at
2115 ++ * hand.
2116 ++ *
2117 ++ * Turning back to the next function, it implements all the steps needed
2118 ++ * to detect the occurrence of a large burst and to properly mark all the
2119 ++ * queues belonging to it (so that they can then be treated in a different
2120 ++ * way). This goal is achieved by maintaining a special "burst list" that
2121 ++ * holds, temporarily, the queues that belong to the burst in progress. The
2122 ++ * list is then used to mark these queues as belonging to a large burst if
2123 ++ * the burst does become large. The main steps are the following.
2124 ++ *
2125 ++ * . when the very first queue is activated, the queue is inserted into the
2126 ++ * list (as it could be the first queue in a possible burst)
2127 ++ *
2128 ++ * . if the current burst has not yet become large, and a queue Q that does
2129 ++ * not yet belong to the burst is activated shortly after the last time
2130 ++ * at which a new queue entered the burst list, then the function appends
2131 ++ * Q to the burst list
2132 ++ *
2133 ++ * . if, as a consequence of the previous step, the burst size reaches
2134 ++ * the large-burst threshold, then
2135 ++ *
2136 ++ * . all the queues in the burst list are marked as belonging to a
2137 ++ * large burst
2138 ++ *
2139 ++ * . the burst list is deleted; in fact, the burst list already served
2140 ++ * its purpose (temporarily keeping track of the queues in a burst,
2141 ++ * so as to be able to mark them as belonging to a large burst in the
2142 ++ * previous sub-step), and now is not needed any more
2143 ++ *
2144 ++ * . the device enters a large-burst mode
2145 ++ *
2146 ++ * . if a queue Q that does not belong to the burst is activated while
2147 ++ * the device is in large-burst mode and shortly after the last time
2148 ++ * at which a queue either entered the burst list or was marked as
2149 ++ * belonging to the current large burst, then Q is immediately marked
2150 ++ * as belonging to a large burst.
2151 ++ *
2152 ++ * . if a queue Q that does not belong to the burst is activated a while
2153 ++ * later, i.e., not shortly after, the last time at which a queue
2154 ++ * either entered the burst list or was marked as belonging to the
2155 ++ * current large burst, then the current burst is deemed as finished and:
2156 ++ *
2157 ++ * . the large-burst mode is reset if set
2158 ++ *
2159 ++ * . the burst list is emptied
2160 ++ *
2161 ++ * . Q is inserted in the burst list, as Q may be the first queue
2162 ++ * in a possible new burst (then the burst list contains just Q
2163 ++ * after this step).
2164 ++ */
2165 ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2166 ++ bool idle_for_long_time)
2167 ++{
2168 ++ /*
2169 ++ * If bfqq happened to be activated in a burst, but has been idle
2170 ++ * for at least as long as an interactive queue, then we assume
2171 ++ * that, in the overall I/O initiated in the burst, the I/O
2172 ++ * associated to bfqq is finished. So bfqq does not need to be
2173 ++ * treated as a queue belonging to a burst anymore. Accordingly,
2174 ++ * we reset bfqq's in_large_burst flag if set, and remove bfqq
2175 ++ * from the burst list if it's there. We do not, however, decrement
2176 ++ * burst_size, because the fact that bfqq does not need to belong
2177 ++ * to the burst list any more does not invalidate the fact that
2178 ++ * bfqq may have been activated during the current burst.
2179 ++ */
2180 ++ if (idle_for_long_time) {
2181 ++ hlist_del_init(&bfqq->burst_list_node);
2182 ++ bfq_clear_bfqq_in_large_burst(bfqq);
2183 ++ }
2184 ++
2185 ++ /*
2186 ++ * If bfqq is already in the burst list or is part of a large
2187 ++ * burst, then there is nothing else to do.
2188 ++ */
2189 ++ if (!hlist_unhashed(&bfqq->burst_list_node) ||
2190 ++ bfq_bfqq_in_large_burst(bfqq))
2191 ++ return;
2192 ++
2193 ++ /*
2194 ++ * If bfqq's activation happens late enough, then the current
2195 ++ * burst is finished, and related data structures must be reset.
2196 ++ *
2197 ++ * In this respect, consider the special case where bfqq is the very
2198 ++ * first queue being activated. In this case, last_ins_in_burst is
2199 ++ * not yet significant when we get here. But it is easy to verify
2200 ++ * that, whether or not the following condition is true, bfqq will
2201 ++ * end up being inserted into the burst list. In particular the
2202 ++ * list will happen to contain only bfqq. And this is exactly what
2203 ++ * has to happen, as bfqq may be the first queue in a possible
2204 ++ * burst.
2205 ++ */
2206 ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst +
2207 ++ bfqd->bfq_burst_interval)) {
2208 ++ bfqd->large_burst = false;
2209 ++ bfq_reset_burst_list(bfqd, bfqq);
2210 ++ return;
2211 ++ }
2212 ++
2213 ++ /*
2214 ++ * If we get here, then bfqq is being activated shortly after the
2215 ++ * last queue. So, if the current burst is also large, we can mark
2216 ++ * bfqq as belonging to this large burst immediately.
2217 ++ */
2218 ++ if (bfqd->large_burst) {
2219 ++ bfq_mark_bfqq_in_large_burst(bfqq);
2220 ++ return;
2221 ++ }
2222 ++
2223 ++ /*
2224 ++ * If we get here, then a large-burst state has not yet been
2225 ++ * reached, but bfqq is being activated shortly after the last
2226 ++ * queue. Then we add bfqq to the burst.
2227 ++ */
2228 ++ bfq_add_to_burst(bfqd, bfqq);
2229 ++}
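/*
 * Walkthrough of the burst handling above, with a hypothetical
 * large-burst threshold of 4: queues A, B and C activated within
 * bfq_burst_interval of one another are simply collected in the burst
 * list; when a fourth queue D is activated shortly after C, the
 * threshold is reached, so A, B, C and D are all marked as being in a
 * large burst, the list is emptied and large_burst is set, and any
 * further queue activated shortly afterwards is marked immediately. A
 * queue activated much later instead resets the state and becomes the
 * only element of a new burst list.
 */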
2230 ++
2231 ++static void bfq_add_request(struct request *rq)
2232 ++{
2233 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2234 ++ struct bfq_entity *entity = &bfqq->entity;
2235 ++ struct bfq_data *bfqd = bfqq->bfqd;
2236 ++ struct request *next_rq, *prev;
2237 ++ unsigned long old_wr_coeff = bfqq->wr_coeff;
2238 ++ bool interactive = false;
2239 ++
2240 ++ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
2241 ++ bfqq->queued[rq_is_sync(rq)]++;
2242 ++ bfqd->queued++;
2243 ++
2244 ++ elv_rb_add(&bfqq->sort_list, rq);
2245 ++
2246 ++ /*
2247 ++ * Check if this request is a better next-serve candidate.
2248 ++ */
2249 ++ prev = bfqq->next_rq;
2250 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
2251 ++ BUG_ON(!next_rq);
2252 ++ bfqq->next_rq = next_rq;
2253 ++
2254 ++ if (!bfq_bfqq_busy(bfqq)) {
2255 ++ bool soft_rt, in_burst,
2256 ++ idle_for_long_time = time_is_before_jiffies(
2257 ++ bfqq->budget_timeout +
2258 ++ bfqd->bfq_wr_min_idle_time);
2259 ++
2260 ++#ifdef CONFIG_BFQ_GROUP_IOSCHED
2261 ++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
2262 ++ rq->cmd_flags);
2263 ++#endif
2264 ++ if (bfq_bfqq_sync(bfqq)) {
2265 ++ bool already_in_burst =
2266 ++ !hlist_unhashed(&bfqq->burst_list_node) ||
2267 ++ bfq_bfqq_in_large_burst(bfqq);
2268 ++ bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
2269 ++ /*
2270 ++ * If bfqq was not already in the current burst,
2271 ++ * then, at this point, bfqq either has been
2272 ++ * added to the current burst or has caused the
2273 ++ * current burst to terminate. In particular, in
2274 ++ * the second case, bfqq has become the first
2275 ++ * queue in a possible new burst.
2276 ++ * In both cases last_ins_in_burst needs to be
2277 ++ * moved forward.
2278 ++ */
2279 ++ if (!already_in_burst)
2280 ++ bfqd->last_ins_in_burst = jiffies;
2281 ++ }
2282 ++
2283 ++ in_burst = bfq_bfqq_in_large_burst(bfqq);
2284 ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
2285 ++ !in_burst &&
2286 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
2287 ++ interactive = !in_burst && idle_for_long_time;
2288 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
2289 ++ bfq_serv_to_charge(next_rq, bfqq));
2290 ++
2291 ++ if (!bfq_bfqq_IO_bound(bfqq)) {
2292 ++ if (time_before(jiffies,
2293 ++ RQ_BIC(rq)->ttime.last_end_request +
2294 ++ bfqd->bfq_slice_idle)) {
2295 ++ bfqq->requests_within_timer++;
2296 ++ if (bfqq->requests_within_timer >=
2297 ++ bfqd->bfq_requests_within_timer)
2298 ++ bfq_mark_bfqq_IO_bound(bfqq);
2299 ++ } else
2300 ++ bfqq->requests_within_timer = 0;
2301 ++ }
2302 ++
2303 ++ if (!bfqd->low_latency)
2304 ++ goto add_bfqq_busy;
2305 ++
2306 ++ /*
2307 ++ * If the queue:
2308 ++ * - is not being boosted,
2309 ++ * - has been idle for enough time,
2310 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
2311 ++ * inherently shared, or it is not shared and its
2312 ++ * requests have not been redirected to a shared queue)
2313 ++ * start a weight-raising period.
2314 ++ */
2315 ++ if (old_wr_coeff == 1 && (interactive || soft_rt) &&
2316 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {
2317 ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
2318 ++ if (interactive)
2319 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2320 ++ else
2321 ++ bfqq->wr_cur_max_time =
2322 ++ bfqd->bfq_wr_rt_max_time;
2323 ++ bfq_log_bfqq(bfqd, bfqq,
2324 ++ "wrais starting at %lu, rais_max_time %u",
2325 ++ jiffies,
2326 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
2327 ++ } else if (old_wr_coeff > 1) {
2328 ++ if (interactive)
2329 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2330 ++ else if (in_burst ||
2331 ++ (bfqq->wr_cur_max_time ==
2332 ++ bfqd->bfq_wr_rt_max_time &&
2333 ++ !soft_rt)) {
2334 ++ bfqq->wr_coeff = 1;
2335 ++ bfq_log_bfqq(bfqd, bfqq,
2336 ++ "wrais ending at %lu, rais_max_time %u",
2337 ++ jiffies,
2338 ++ jiffies_to_msecs(bfqq->
2339 ++ wr_cur_max_time));
2340 ++ } else if (time_before(
2341 ++ bfqq->last_wr_start_finish +
2342 ++ bfqq->wr_cur_max_time,
2343 ++ jiffies +
2344 ++ bfqd->bfq_wr_rt_max_time) &&
2345 ++ soft_rt) {
2346 ++ /*
2347 ++ *
2348 ++ * The remaining weight-raising time is lower
2349 ++ * than bfqd->bfq_wr_rt_max_time, which means
2350 ++ * that the application is enjoying weight
2351 ++ * raising either because it was deemed soft-rt
2352 ++ * in the near past, or because it was deemed
2353 ++ * interactive long ago.
2354 ++ * In both cases, resetting now the current
2355 ++ * remaining weight-raising time for the
2356 ++ * application to the weight-raising duration
2357 ++ * for soft rt applications would not cause any
2358 ++ * latency increase for the application (as the
2359 ++ * new duration would be higher than the
2360 ++ * remaining time).
2361 ++ *
2362 ++ * In addition, the application is now meeting
2363 ++ * the requirements for being deemed soft rt.
2364 ++ * In the end we can correctly and safely
2365 ++ * (re)charge the weight-raising duration for
2366 ++ * the application with the weight-raising
2367 ++ * duration for soft rt applications.
2368 ++ *
2369 ++ * In particular, doing this recharge now, i.e.,
2370 ++ * before the weight-raising period for the
2371 ++ * application finishes, reduces the probability
2372 ++ * of the following negative scenario:
2373 ++ * 1) the weight of a soft rt application is
2374 ++ * raised at startup (as for any newly
2375 ++ * created application),
2376 ++ * 2) since the application is not interactive,
2377 ++ * at a certain time weight-raising is
2378 ++ * stopped for the application,
2379 ++ * 3) at that time the application happens to
2380 ++ * still have pending requests, and hence
2381 ++ * is destined to not have a chance to be
2382 ++ * deemed soft rt before these requests are
2383 ++ * completed (see the comments to the
2384 ++ * function bfq_bfqq_softrt_next_start()
2385 ++ * for details on soft rt detection),
2386 ++ * 4) these pending requests experience a high
2387 ++ * latency because the application is not
2388 ++ * weight-raised while they are pending.
2389 ++ */
2390 ++ bfqq->last_wr_start_finish = jiffies;
2391 ++ bfqq->wr_cur_max_time =
2392 ++ bfqd->bfq_wr_rt_max_time;
2393 ++ }
2394 ++ }
2395 ++ if (old_wr_coeff != bfqq->wr_coeff)
2396 ++ entity->prio_changed = 1;
2397 ++add_bfqq_busy:
2398 ++ bfqq->last_idle_bklogged = jiffies;
2399 ++ bfqq->service_from_backlogged = 0;
2400 ++ bfq_clear_bfqq_softrt_update(bfqq);
2401 ++ bfq_add_bfqq_busy(bfqd, bfqq);
2402 ++ } else {
2403 ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
2404 ++ time_is_before_jiffies(
2405 ++ bfqq->last_wr_start_finish +
2406 ++ bfqd->bfq_wr_min_inter_arr_async)) {
2407 ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
2408 ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
2409 ++
2410 ++ bfqd->wr_busy_queues++;
2411 ++ entity->prio_changed = 1;
2412 ++ bfq_log_bfqq(bfqd, bfqq,
2413 ++ "non-idle wrais starting at %lu, rais_max_time %u",
2414 ++ jiffies,
2415 ++ jiffies_to_msecs(bfqq->wr_cur_max_time));
2416 ++ }
2417 ++ if (prev != bfqq->next_rq)
2418 ++ bfq_updated_next_req(bfqd, bfqq);
2419 ++ }
2420 ++
2421 ++ if (bfqd->low_latency &&
2422 ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
2423 ++ bfqq->last_wr_start_finish = jiffies;
2424 ++}
2425 ++
2426 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
2427 ++ struct bio *bio)
2428 ++{
2429 ++ struct task_struct *tsk = current;
2430 ++ struct bfq_io_cq *bic;
2431 ++ struct bfq_queue *bfqq;
2432 ++
2433 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
2434 ++ if (!bic)
2435 ++ return NULL;
2436 ++
2437 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2438 ++ if (bfqq)
2439 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
2440 ++
2441 ++ return NULL;
2442 ++}
2443 ++
2444 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
2445 ++{
2446 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2447 ++
2448 ++ bfqd->rq_in_driver++;
2449 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
2450 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
2451 ++ (long long unsigned)bfqd->last_position);
2452 ++}
2453 ++
2454 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
2455 ++{
2456 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2457 ++
2458 ++ BUG_ON(bfqd->rq_in_driver == 0);
2459 ++ bfqd->rq_in_driver--;
2460 ++}
2461 ++
2462 ++static void bfq_remove_request(struct request *rq)
2463 ++{
2464 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2465 ++ struct bfq_data *bfqd = bfqq->bfqd;
2466 ++ const int sync = rq_is_sync(rq);
2467 ++
2468 ++ if (bfqq->next_rq == rq) {
2469 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
2470 ++ bfq_updated_next_req(bfqd, bfqq);
2471 ++ }
2472 ++
2473 ++ if (rq->queuelist.prev != &rq->queuelist)
2474 ++ list_del_init(&rq->queuelist);
2475 ++ BUG_ON(bfqq->queued[sync] == 0);
2476 ++ bfqq->queued[sync]--;
2477 ++ bfqd->queued--;
2478 ++ elv_rb_del(&bfqq->sort_list, rq);
2479 ++
2480 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2481 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
2482 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2483 ++ /*
2484 ++ * Remove queue from request-position tree as it is empty.
2485 ++ */
2486 ++ if (bfqq->pos_root) {
2487 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
2488 ++ bfqq->pos_root = NULL;
2489 ++ }
2490 ++ }
2491 ++
2492 ++ if (rq->cmd_flags & REQ_META) {
2493 ++ BUG_ON(bfqq->meta_pending == 0);
2494 ++ bfqq->meta_pending--;
2495 ++ }
2496 ++#ifdef CONFIG_BFQ_GROUP_IOSCHED
2497 ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
2498 ++#endif
2499 ++}
2500 ++
2501 ++static int bfq_merge(struct request_queue *q, struct request **req,
2502 ++ struct bio *bio)
2503 ++{
2504 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2505 ++ struct request *__rq;
2506 ++
2507 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
2508 ++ if (__rq && elv_rq_merge_ok(__rq, bio)) {
2509 ++ *req = __rq;
2510 ++ return ELEVATOR_FRONT_MERGE;
2511 ++ }
2512 ++
2513 ++ return ELEVATOR_NO_MERGE;
2514 ++}
2515 ++
2516 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
2517 ++ int type)
2518 ++{
2519 ++ if (type == ELEVATOR_FRONT_MERGE &&
2520 ++ rb_prev(&req->rb_node) &&
2521 ++ blk_rq_pos(req) <
2522 ++ blk_rq_pos(container_of(rb_prev(&req->rb_node),
2523 ++ struct request, rb_node))) {
2524 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
2525 ++ struct bfq_data *bfqd = bfqq->bfqd;
2526 ++ struct request *prev, *next_rq;
2527 ++
2528 ++ /* Reposition request in its sort_list */
2529 ++ elv_rb_del(&bfqq->sort_list, req);
2530 ++ elv_rb_add(&bfqq->sort_list, req);
2531 ++ /* Choose next request to be served for bfqq */
2532 ++ prev = bfqq->next_rq;
2533 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
2534 ++ bfqd->last_position);
2535 ++ BUG_ON(!next_rq);
2536 ++ bfqq->next_rq = next_rq;
2537 ++ }
2538 ++}
2539 ++
2540 ++#ifdef CONFIG_BFQ_GROUP_IOSCHED
2541 ++static void bfq_bio_merged(struct request_queue *q, struct request *req,
2542 ++ struct bio *bio)
2543 ++{
2544 ++ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw);
2545 ++}
2546 ++#endif
2547 ++
2548 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
2549 ++ struct request *next)
2550 ++{
2551 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
2552 ++
2553 ++ /*
2554 ++ * If next and rq belong to the same bfq_queue and next is older
2555 ++ * than rq, then reposition rq in the fifo (by substituting next
2556 ++ * with rq). Otherwise, if next and rq belong to different
2557 ++ * bfq_queues, never reposition rq: in fact, we would have to