1 |
commit: 543cfadc9443b9cfdbfea73dfcd2b7eb82dec66e |
2 |
Author: Mike Pagano <mpagano <AT> gentoo <DOT> org> |
3 |
AuthorDate: Thu Jul 28 00:03:33 2016 +0000 |
4 |
Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org> |
5 |
CommitDate: Thu Jul 28 00:03:33 2016 +0000 |
6 |
URL: https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=543cfadc |
7 |
|
8 |
BFQ patches for 4.7. See http://algogroup.unimore.it/people/paolo/disk_sched/patches/4.7.0-v8/ |
9 |
|
10 |
0000_README | 16 + |
11 |
...oups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch | 103 + |
12 |
...ntroduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1 | 7097 ++++++++++++++++++++ |
13 |
...arly-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch | 1101 +++ |
14 |
...rn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1 | 6361 ++++++++++++++++++ |
15 |
5 files changed, 14678 insertions(+) |
16 |
|
17 |
diff --git a/0000_README b/0000_README |
18 |
index 0530209..1b5179e 100644 |
19 |
--- a/0000_README |
20 |
+++ b/0000_README |
21 |
@@ -67,6 +67,22 @@ Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch |
22 |
From: https://github.com/graysky2/kernel_gcc_patch/ |
23 |
Desc: Kernel patch enables gcc < v4.9 optimizations for additional CPUs. |
24 |
|
25 |
+Patch: 5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch |
26 |
+From: http://algo.ing.unimo.it/people/paolo/disk_sched/ |
27 |
+Desc: BFQ v7r11 patch 1 for 4.7: Build, cgroups and kconfig bits |
28 |
+ |
29 |
+Patch: 5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1 |
30 |
+From: http://algo.ing.unimo.it/people/paolo/disk_sched/ |
31 |
+Desc: BFQ v7r11 patch 2 for 4.7: BFQ Scheduler |
32 |
+ |
33 |
+Patch: 5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch |
34 |
+From: http://algo.ing.unimo.it/people/paolo/disk_sched/ |
35 |
+Desc: BFQ v7r11 patch 3 for 4.7: Early Queue Merge (EQM) |
36 |
+ |
37 |
+Patch: 5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1 |
38 |
+From: http://algo.ing.unimo.it/people/paolo/disk_sched/ |
39 |
+Desc: BFQ v7r11 patch 4 for 4.7: Turn BFQ v7r11 into BFQ v8 |
40 |
+ |
41 |
Patch: 5010_enable-additional-cpu-optimizations-for-gcc-4.9.patch |
42 |
From: https://github.com/graysky2/kernel_gcc_patch/ |
43 |
Desc: Kernel patch enables gcc >= v4.9 optimizations for additional CPUs. |
44 |
|
45 |
diff --git a/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch |
46 |
new file mode 100644 |
47 |
index 0000000..45d0b07 |
48 |
--- /dev/null |
49 |
+++ b/5001_block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.7.patch |
50 |
@@ -0,0 +1,103 @@ |
51 |
+From feb58b4dd1e8fd895f28ba4c759e92febe316cb2 Mon Sep 17 00:00:00 2001 |
52 |
+From: Paolo Valente <paolo.valente@×××××××.it> |
53 |
+Date: Tue, 7 Apr 2015 13:39:12 +0200 |
54 |
+Subject: [PATCH 1/4] block: cgroups, kconfig, build bits for BFQ-v7r11-4.7.0 |
55 |
+ |
56 |
+Update Kconfig.iosched and do the related Makefile changes to include |
57 |
+kernel configuration options for BFQ. Also increase the number of |
58 |
+policies supported by the blkio controller so that BFQ can add its |
59 |
+own. |
60 |
+ |
61 |
+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
62 |
+Signed-off-by: Arianna Avanzini <avanzini@××××××.com> |
63 |
+--- |
64 |
+ block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++ |
65 |
+ block/Makefile | 1 + |
66 |
+ include/linux/blkdev.h | 2 +- |
67 |
+ 3 files changed, 34 insertions(+), 1 deletion(-) |
68 |
+ |
69 |
+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched |
70 |
+index 421bef9..0ee5f0f 100644 |
71 |
+--- a/block/Kconfig.iosched |
72 |
++++ b/block/Kconfig.iosched |
73 |
+@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED |
74 |
+ ---help--- |
75 |
+ Enable group IO scheduling in CFQ. |
76 |
+ |
77 |
++config IOSCHED_BFQ |
78 |
++ tristate "BFQ I/O scheduler" |
79 |
++ default n |
80 |
++ ---help--- |
81 |
++ The BFQ I/O scheduler tries to distribute bandwidth among |
82 |
++ all processes according to their weights. |
83 |
++ It aims at distributing the bandwidth as desired, independently of |
84 |
++ the disk parameters and with any workload. It also tries to |
85 |
++ guarantee low latency to interactive and soft real-time |
86 |
++ applications. If compiled built-in (saying Y here), BFQ can |
87 |
++ be configured to support hierarchical scheduling. |
88 |
++ |
89 |
++config CGROUP_BFQIO |
90 |
++ bool "BFQ hierarchical scheduling support" |
91 |
++ depends on CGROUPS && IOSCHED_BFQ=y |
92 |
++ default n |
93 |
++ ---help--- |
94 |
++ Enable hierarchical scheduling in BFQ, using the cgroups |
95 |
++ filesystem interface. The name of the subsystem will be |
96 |
++ bfqio. |
97 |
++ |
98 |
+ choice |
99 |
+ prompt "Default I/O scheduler" |
100 |
+ default DEFAULT_CFQ |
101 |
+@@ -52,6 +73,16 @@ choice |
102 |
+ config DEFAULT_CFQ |
103 |
+ bool "CFQ" if IOSCHED_CFQ=y |
104 |
+ |
105 |
++ config DEFAULT_BFQ |
106 |
++ bool "BFQ" if IOSCHED_BFQ=y |
107 |
++ help |
108 |
++ Selects BFQ as the default I/O scheduler which will be |
109 |
++ used by default for all block devices. |
110 |
++ The BFQ I/O scheduler aims at distributing the bandwidth |
111 |
++ as desired, independently of the disk parameters and with |
112 |
++ any workload. It also tries to guarantee low latency to |
113 |
++ interactive and soft real-time applications. |
114 |
++ |
115 |
+ config DEFAULT_NOOP |
116 |
+ bool "No-op" |
117 |
+ |
118 |
+@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED |
119 |
+ string |
120 |
+ default "deadline" if DEFAULT_DEADLINE |
121 |
+ default "cfq" if DEFAULT_CFQ |
122 |
++ default "bfq" if DEFAULT_BFQ |
123 |
+ default "noop" if DEFAULT_NOOP |
124 |
+ |
125 |
+ endmenu |
126 |
+diff --git a/block/Makefile b/block/Makefile |
127 |
+index 9eda232..4a36683 100644 |
128 |
+--- a/block/Makefile |
129 |
++++ b/block/Makefile |
130 |
+@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o |
131 |
+ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o |
132 |
+ obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o |
133 |
+ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o |
134 |
++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o |
135 |
+ |
136 |
+ obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o |
137 |
+ obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o |
138 |
+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h |
139 |
+index 3d9cf32..8d862a0 100644 |
140 |
+--- a/include/linux/blkdev.h |
141 |
++++ b/include/linux/blkdev.h |
142 |
+@@ -45,7 +45,7 @@ struct pr_ops; |
143 |
+ * Maximum number of blkcg policies allowed to be registered concurrently. |
144 |
+ * Defined here to simplify include dependency. |
145 |
+ */ |
146 |
+-#define BLKCG_MAX_POLS 2 |
147 |
++#define BLKCG_MAX_POLS 3 |
148 |
+ |
149 |
+ struct request; |
150 |
+ typedef void (rq_end_io_fn)(struct request *, int); |
151 |
+-- |
152 |
+1.9.1 |
153 |
+ |
154 |
|
155 |
diff --git a/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1 b/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1 |
156 |
new file mode 100644 |
157 |
index 0000000..8a67a4b |
158 |
--- /dev/null |
159 |
+++ b/5002_block-introduce-the-BFQ-v7r11-I-O-sched-for-4.7.patch1 |
160 |
@@ -0,0 +1,7097 @@ |
161 |
+From 1f07b3f666e6da78d10e62cfb9696242e5b3005e Mon Sep 17 00:00:00 2001 |
162 |
+From: Paolo Valente <paolo.valente@×××××××.it> |
163 |
+Date: Thu, 9 May 2013 19:10:02 +0200 |
164 |
+Subject: [PATCH 2/4] block: introduce the BFQ-v7r11 I/O sched for 4.7.0 |
165 |
+ |
166 |
+The general structure is borrowed from CFQ, as much of the code for |
167 |
+handling I/O contexts. Over time, several useful features have been |
168 |
+ported from CFQ as well (details in the changelog in README.BFQ). A |
169 |
+(bfq_)queue is associated to each task doing I/O on a device, and each |
170 |
+time a scheduling decision has to be made a queue is selected and served |
171 |
+until it expires. |
172 |
+ |
173 |
+ - Slices are given in the service domain: tasks are assigned |
174 |
+ budgets, measured in number of sectors. Once got the disk, a task |
175 |
+ must however consume its assigned budget within a configurable |
176 |
+ maximum time (by default, the maximum possible value of the |
177 |
+ budgets is automatically computed to comply with this timeout). |
178 |
+ This allows the desired latency vs "throughput boosting" tradeoff |
179 |
+ to be set. |
180 |
+ |
181 |
+ - Budgets are scheduled according to a variant of WF2Q+, implemented |
182 |
+ using an augmented rb-tree to take eligibility into account while |
183 |
+ preserving an O(log N) overall complexity. |
184 |
+ |
185 |
+ - A low-latency tunable is provided; if enabled, both interactive |
186 |
+ and soft real-time applications are guaranteed a very low latency. |
187 |
+ |
188 |
+ - Latency guarantees are preserved also in the presence of NCQ. |
189 |
+ |
190 |
+ - Also with flash-based devices, a high throughput is achieved |
191 |
+ while still preserving latency guarantees. |
192 |
+ |
193 |
+ - BFQ features Early Queue Merge (EQM), a sort of fusion of the |
194 |
+ cooperating-queue-merging and the preemption mechanisms present |
195 |
+ in CFQ. EQM is in fact a unified mechanism that tries to get a |
196 |
+ sequential read pattern, and hence a high throughput, with any |
197 |
+ set of processes performing interleaved I/O over a contiguous |
198 |
+ sequence of sectors. |
199 |
+ |
200 |
+ - BFQ supports full hierarchical scheduling, exporting a cgroups |
201 |
+ interface. Since each node has a full scheduler, each group can |
202 |
+ be assigned its own weight. |
203 |
+ |
204 |
+ - If the cgroups interface is not used, only I/O priorities can be |
205 |
+ assigned to processes, with ioprio values mapped to weights |
206 |
+ with the relation weight = IOPRIO_BE_NR - ioprio. |
207 |
+ |
208 |
+ - ioprio classes are served in strict priority order, i.e., lower |
209 |
+ priority queues are not served as long as there are higher |
210 |
+ priority queues. Among queues in the same class the bandwidth is |
211 |
+ distributed in proportion to the weight of each queue. A very |
212 |
+ thin extra bandwidth is however guaranteed to the Idle class, to |
213 |
+ prevent it from starving. |
214 |
+ |
215 |
+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
216 |
+Signed-off-by: Arianna Avanzini <avanzini@××××××.com> |
217 |
+--- |
218 |
+ block/Kconfig.iosched | 6 +- |
219 |
+ block/bfq-cgroup.c | 1182 ++++++++++++++++ |
220 |
+ block/bfq-ioc.c | 36 + |
221 |
+ block/bfq-iosched.c | 3754 +++++++++++++++++++++++++++++++++++++++++++++++++ |
222 |
+ block/bfq-sched.c | 1200 ++++++++++++++++ |
223 |
+ block/bfq.h | 801 +++++++++++ |
224 |
+ 6 files changed, 6975 insertions(+), 4 deletions(-) |
225 |
+ create mode 100644 block/bfq-cgroup.c |
226 |
+ create mode 100644 block/bfq-ioc.c |
227 |
+ create mode 100644 block/bfq-iosched.c |
228 |
+ create mode 100644 block/bfq-sched.c |
229 |
+ create mode 100644 block/bfq.h |
230 |
+ |
231 |
+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched |
232 |
+index 0ee5f0f..f78cd1a 100644 |
233 |
+--- a/block/Kconfig.iosched |
234 |
++++ b/block/Kconfig.iosched |
235 |
+@@ -51,14 +51,12 @@ config IOSCHED_BFQ |
236 |
+ applications. If compiled built-in (saying Y here), BFQ can |
237 |
+ be configured to support hierarchical scheduling. |
238 |
+ |
239 |
+-config CGROUP_BFQIO |
240 |
++config BFQ_GROUP_IOSCHED |
241 |
+ bool "BFQ hierarchical scheduling support" |
242 |
+ depends on CGROUPS && IOSCHED_BFQ=y |
243 |
+ default n |
244 |
+ ---help--- |
245 |
+- Enable hierarchical scheduling in BFQ, using the cgroups |
246 |
+- filesystem interface. The name of the subsystem will be |
247 |
+- bfqio. |
248 |
++ Enable hierarchical scheduling in BFQ, using the blkio controller. |
249 |
+ |
250 |
+ choice |
251 |
+ prompt "Default I/O scheduler" |
252 |
+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c |
253 |
+new file mode 100644 |
254 |
+index 0000000..8610cd6 |
255 |
+--- /dev/null |
256 |
++++ b/block/bfq-cgroup.c |
257 |
+@@ -0,0 +1,1182 @@ |
258 |
++/* |
259 |
++ * BFQ: CGROUPS support. |
260 |
++ * |
261 |
++ * Based on ideas and code from CFQ: |
262 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
263 |
++ * |
264 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
265 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
266 |
++ * |
267 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
268 |
++ * |
269 |
++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ |
270 |
++ * file. |
271 |
++ */ |
272 |
++ |
273 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
274 |
++ |
275 |
++/* bfqg stats flags */ |
276 |
++enum bfqg_stats_flags { |
277 |
++ BFQG_stats_waiting = 0, |
278 |
++ BFQG_stats_idling, |
279 |
++ BFQG_stats_empty, |
280 |
++}; |
281 |
++ |
282 |
++#define BFQG_FLAG_FNS(name) \ |
283 |
++static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ |
284 |
++{ \ |
285 |
++ stats->flags |= (1 << BFQG_stats_##name); \ |
286 |
++} \ |
287 |
++static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ |
288 |
++{ \ |
289 |
++ stats->flags &= ~(1 << BFQG_stats_##name); \ |
290 |
++} \ |
291 |
++static int bfqg_stats_##name(struct bfqg_stats *stats) \ |
292 |
++{ \ |
293 |
++ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ |
294 |
++} \ |
295 |
++ |
296 |
++BFQG_FLAG_FNS(waiting) |
297 |
++BFQG_FLAG_FNS(idling) |
298 |
++BFQG_FLAG_FNS(empty) |
299 |
++#undef BFQG_FLAG_FNS |
300 |
++ |
301 |
++/* This should be called with the queue_lock held. */ |
302 |
++static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) |
303 |
++{ |
304 |
++ unsigned long long now; |
305 |
++ |
306 |
++ if (!bfqg_stats_waiting(stats)) |
307 |
++ return; |
308 |
++ |
309 |
++ now = sched_clock(); |
310 |
++ if (time_after64(now, stats->start_group_wait_time)) |
311 |
++ blkg_stat_add(&stats->group_wait_time, |
312 |
++ now - stats->start_group_wait_time); |
313 |
++ bfqg_stats_clear_waiting(stats); |
314 |
++} |
315 |
++ |
316 |
++/* This should be called with the queue_lock held. */ |
317 |
++static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, |
318 |
++ struct bfq_group *curr_bfqg) |
319 |
++{ |
320 |
++ struct bfqg_stats *stats = &bfqg->stats; |
321 |
++ |
322 |
++ if (bfqg_stats_waiting(stats)) |
323 |
++ return; |
324 |
++ if (bfqg == curr_bfqg) |
325 |
++ return; |
326 |
++ stats->start_group_wait_time = sched_clock(); |
327 |
++ bfqg_stats_mark_waiting(stats); |
328 |
++} |
329 |
++ |
330 |
++/* This should be called with the queue_lock held. */ |
331 |
++static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) |
332 |
++{ |
333 |
++ unsigned long long now; |
334 |
++ |
335 |
++ if (!bfqg_stats_empty(stats)) |
336 |
++ return; |
337 |
++ |
338 |
++ now = sched_clock(); |
339 |
++ if (time_after64(now, stats->start_empty_time)) |
340 |
++ blkg_stat_add(&stats->empty_time, |
341 |
++ now - stats->start_empty_time); |
342 |
++ bfqg_stats_clear_empty(stats); |
343 |
++} |
344 |
++ |
345 |
++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) |
346 |
++{ |
347 |
++ blkg_stat_add(&bfqg->stats.dequeue, 1); |
348 |
++} |
349 |
++ |
350 |
++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) |
351 |
++{ |
352 |
++ struct bfqg_stats *stats = &bfqg->stats; |
353 |
++ |
354 |
++ if (blkg_rwstat_total(&stats->queued)) |
355 |
++ return; |
356 |
++ |
357 |
++ /* |
358 |
++ * group is already marked empty. This can happen if bfqq got new |
359 |
++ * request in parent group and moved to this group while being added |
360 |
++ * to service tree. Just ignore the event and move on. |
361 |
++ */ |
362 |
++ if (bfqg_stats_empty(stats)) |
363 |
++ return; |
364 |
++ |
365 |
++ stats->start_empty_time = sched_clock(); |
366 |
++ bfqg_stats_mark_empty(stats); |
367 |
++} |
368 |
++ |
369 |
++static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) |
370 |
++{ |
371 |
++ struct bfqg_stats *stats = &bfqg->stats; |
372 |
++ |
373 |
++ if (bfqg_stats_idling(stats)) { |
374 |
++ unsigned long long now = sched_clock(); |
375 |
++ |
376 |
++ if (time_after64(now, stats->start_idle_time)) |
377 |
++ blkg_stat_add(&stats->idle_time, |
378 |
++ now - stats->start_idle_time); |
379 |
++ bfqg_stats_clear_idling(stats); |
380 |
++ } |
381 |
++} |
382 |
++ |
383 |
++static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) |
384 |
++{ |
385 |
++ struct bfqg_stats *stats = &bfqg->stats; |
386 |
++ |
387 |
++ stats->start_idle_time = sched_clock(); |
388 |
++ bfqg_stats_mark_idling(stats); |
389 |
++} |
390 |
++ |
391 |
++static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) |
392 |
++{ |
393 |
++ struct bfqg_stats *stats = &bfqg->stats; |
394 |
++ |
395 |
++ blkg_stat_add(&stats->avg_queue_size_sum, |
396 |
++ blkg_rwstat_total(&stats->queued)); |
397 |
++ blkg_stat_add(&stats->avg_queue_size_samples, 1); |
398 |
++ bfqg_stats_update_group_wait_time(stats); |
399 |
++} |
400 |
++ |
401 |
++static struct blkcg_policy blkcg_policy_bfq; |
402 |
++ |
403 |
++/* |
404 |
++ * blk-cgroup policy-related handlers |
405 |
++ * The following functions help in converting between blk-cgroup |
406 |
++ * internal structures and BFQ-specific structures. |
407 |
++ */ |
408 |
++ |
409 |
++static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) |
410 |
++{ |
411 |
++ return pd ? container_of(pd, struct bfq_group, pd) : NULL; |
412 |
++} |
413 |
++ |
414 |
++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) |
415 |
++{ |
416 |
++ return pd_to_blkg(&bfqg->pd); |
417 |
++} |
418 |
++ |
419 |
++static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) |
420 |
++{ |
421 |
++ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); |
422 |
++ BUG_ON(!pd); |
423 |
++ return pd_to_bfqg(pd); |
424 |
++} |
425 |
++ |
426 |
++/* |
427 |
++ * bfq_group handlers |
428 |
++ * The following functions help in navigating the bfq_group hierarchy |
429 |
++ * by allowing to find the parent of a bfq_group or the bfq_group |
430 |
++ * associated to a bfq_queue. |
431 |
++ */ |
432 |
++ |
433 |
++static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) |
434 |
++{ |
435 |
++ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; |
436 |
++ |
437 |
++ return pblkg ? blkg_to_bfqg(pblkg) : NULL; |
438 |
++} |
439 |
++ |
440 |
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) |
441 |
++{ |
442 |
++ struct bfq_entity *group_entity = bfqq->entity.parent; |
443 |
++ |
444 |
++ return group_entity ? container_of(group_entity, struct bfq_group, |
445 |
++ entity) : |
446 |
++ bfqq->bfqd->root_group; |
447 |
++} |
448 |
++ |
449 |
++/* |
450 |
++ * The following two functions handle get and put of a bfq_group by |
451 |
++ * wrapping the related blk-cgroup hooks. |
452 |
++ */ |
453 |
++ |
454 |
++static void bfqg_get(struct bfq_group *bfqg) |
455 |
++{ |
456 |
++ return blkg_get(bfqg_to_blkg(bfqg)); |
457 |
++} |
458 |
++ |
459 |
++static void bfqg_put(struct bfq_group *bfqg) |
460 |
++{ |
461 |
++ return blkg_put(bfqg_to_blkg(bfqg)); |
462 |
++} |
463 |
++ |
464 |
++static void bfqg_stats_update_io_add(struct bfq_group *bfqg, |
465 |
++ struct bfq_queue *bfqq, |
466 |
++ int rw) |
467 |
++{ |
468 |
++ blkg_rwstat_add(&bfqg->stats.queued, rw, 1); |
469 |
++ bfqg_stats_end_empty_time(&bfqg->stats); |
470 |
++ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) |
471 |
++ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); |
472 |
++} |
473 |
++ |
474 |
++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) |
475 |
++{ |
476 |
++ blkg_rwstat_add(&bfqg->stats.queued, rw, -1); |
477 |
++} |
478 |
++ |
479 |
++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) |
480 |
++{ |
481 |
++ blkg_rwstat_add(&bfqg->stats.merged, rw, 1); |
482 |
++} |
483 |
++ |
484 |
++static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, |
485 |
++ uint64_t bytes, int rw) |
486 |
++{ |
487 |
++ blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); |
488 |
++ blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); |
489 |
++ blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); |
490 |
++} |
491 |
++ |
492 |
++static void bfqg_stats_update_completion(struct bfq_group *bfqg, |
493 |
++ uint64_t start_time, uint64_t io_start_time, int rw) |
494 |
++{ |
495 |
++ struct bfqg_stats *stats = &bfqg->stats; |
496 |
++ unsigned long long now = sched_clock(); |
497 |
++ |
498 |
++ if (time_after64(now, io_start_time)) |
499 |
++ blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); |
500 |
++ if (time_after64(io_start_time, start_time)) |
501 |
++ blkg_rwstat_add(&stats->wait_time, rw, |
502 |
++ io_start_time - start_time); |
503 |
++} |
504 |
++ |
505 |
++/* @stats = 0 */ |
506 |
++static void bfqg_stats_reset(struct bfqg_stats *stats) |
507 |
++{ |
508 |
++ if (!stats) |
509 |
++ return; |
510 |
++ |
511 |
++ /* queued stats shouldn't be cleared */ |
512 |
++ blkg_rwstat_reset(&stats->service_bytes); |
513 |
++ blkg_rwstat_reset(&stats->serviced); |
514 |
++ blkg_rwstat_reset(&stats->merged); |
515 |
++ blkg_rwstat_reset(&stats->service_time); |
516 |
++ blkg_rwstat_reset(&stats->wait_time); |
517 |
++ blkg_stat_reset(&stats->time); |
518 |
++ blkg_stat_reset(&stats->unaccounted_time); |
519 |
++ blkg_stat_reset(&stats->avg_queue_size_sum); |
520 |
++ blkg_stat_reset(&stats->avg_queue_size_samples); |
521 |
++ blkg_stat_reset(&stats->dequeue); |
522 |
++ blkg_stat_reset(&stats->group_wait_time); |
523 |
++ blkg_stat_reset(&stats->idle_time); |
524 |
++ blkg_stat_reset(&stats->empty_time); |
525 |
++} |
526 |
++ |
527 |
++/* @to += @from */ |
528 |
++static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) |
529 |
++{ |
530 |
++ if (!to || !from) |
531 |
++ return; |
532 |
++ |
533 |
++ /* queued stats shouldn't be merged */ |
534 |
++ blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); |
535 |
++ blkg_rwstat_add_aux(&to->serviced, &from->serviced); |
536 |
++ blkg_rwstat_add_aux(&to->merged, &from->merged); |
537 |
++ blkg_rwstat_add_aux(&to->service_time, &from->service_time); |
538 |
++ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); |
539 |
++ blkg_stat_add_aux(&to->time, &from->time); |
540 |
++ blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); |
541 |
++ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); |
542 |
++ blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); |
543 |
++ blkg_stat_add_aux(&to->dequeue, &from->dequeue); |
544 |
++ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); |
545 |
++ blkg_stat_add_aux(&to->idle_time, &from->idle_time); |
546 |
++ blkg_stat_add_aux(&to->empty_time, &from->empty_time); |
547 |
++} |
548 |
++ |
549 |
++/* |
550 |
++ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' |
551 |
++ * recursive stats can still account for the amount used by this bfqg after |
552 |
++ * it's gone. |
553 |
++ */ |
554 |
++static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) |
555 |
++{ |
556 |
++ struct bfq_group *parent; |
557 |
++ |
558 |
++ if (!bfqg) /* root_group */ |
559 |
++ return; |
560 |
++ |
561 |
++ parent = bfqg_parent(bfqg); |
562 |
++ |
563 |
++ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); |
564 |
++ |
565 |
++ if (unlikely(!parent)) |
566 |
++ return; |
567 |
++ |
568 |
++ bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); |
569 |
++ bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); |
570 |
++ bfqg_stats_reset(&bfqg->stats); |
571 |
++ bfqg_stats_reset(&bfqg->dead_stats); |
572 |
++} |
573 |
++ |
574 |
++static void bfq_init_entity(struct bfq_entity *entity, |
575 |
++ struct bfq_group *bfqg) |
576 |
++{ |
577 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
578 |
++ |
579 |
++ entity->weight = entity->new_weight; |
580 |
++ entity->orig_weight = entity->new_weight; |
581 |
++ if (bfqq) { |
582 |
++ bfqq->ioprio = bfqq->new_ioprio; |
583 |
++ bfqq->ioprio_class = bfqq->new_ioprio_class; |
584 |
++ bfqg_get(bfqg); |
585 |
++ } |
586 |
++ entity->parent = bfqg->my_entity; |
587 |
++ entity->sched_data = &bfqg->sched_data; |
588 |
++} |
589 |
++ |
590 |
++static void bfqg_stats_exit(struct bfqg_stats *stats) |
591 |
++{ |
592 |
++ blkg_rwstat_exit(&stats->service_bytes); |
593 |
++ blkg_rwstat_exit(&stats->serviced); |
594 |
++ blkg_rwstat_exit(&stats->merged); |
595 |
++ blkg_rwstat_exit(&stats->service_time); |
596 |
++ blkg_rwstat_exit(&stats->wait_time); |
597 |
++ blkg_rwstat_exit(&stats->queued); |
598 |
++ blkg_stat_exit(&stats->sectors); |
599 |
++ blkg_stat_exit(&stats->time); |
600 |
++ blkg_stat_exit(&stats->unaccounted_time); |
601 |
++ blkg_stat_exit(&stats->avg_queue_size_sum); |
602 |
++ blkg_stat_exit(&stats->avg_queue_size_samples); |
603 |
++ blkg_stat_exit(&stats->dequeue); |
604 |
++ blkg_stat_exit(&stats->group_wait_time); |
605 |
++ blkg_stat_exit(&stats->idle_time); |
606 |
++ blkg_stat_exit(&stats->empty_time); |
607 |
++} |
608 |
++ |
609 |
++static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) |
610 |
++{ |
611 |
++ if (blkg_rwstat_init(&stats->service_bytes, gfp) || |
612 |
++ blkg_rwstat_init(&stats->serviced, gfp) || |
613 |
++ blkg_rwstat_init(&stats->merged, gfp) || |
614 |
++ blkg_rwstat_init(&stats->service_time, gfp) || |
615 |
++ blkg_rwstat_init(&stats->wait_time, gfp) || |
616 |
++ blkg_rwstat_init(&stats->queued, gfp) || |
617 |
++ blkg_stat_init(&stats->sectors, gfp) || |
618 |
++ blkg_stat_init(&stats->time, gfp) || |
619 |
++ blkg_stat_init(&stats->unaccounted_time, gfp) || |
620 |
++ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || |
621 |
++ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || |
622 |
++ blkg_stat_init(&stats->dequeue, gfp) || |
623 |
++ blkg_stat_init(&stats->group_wait_time, gfp) || |
624 |
++ blkg_stat_init(&stats->idle_time, gfp) || |
625 |
++ blkg_stat_init(&stats->empty_time, gfp)) { |
626 |
++ bfqg_stats_exit(stats); |
627 |
++ return -ENOMEM; |
628 |
++ } |
629 |
++ |
630 |
++ return 0; |
631 |
++} |
632 |
++ |
633 |
++static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) |
634 |
++ { |
635 |
++ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; |
636 |
++ } |
637 |
++ |
638 |
++static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) |
639 |
++{ |
640 |
++ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); |
641 |
++} |
642 |
++ |
643 |
++static void bfq_cpd_init(struct blkcg_policy_data *cpd) |
644 |
++{ |
645 |
++ struct bfq_group_data *d = cpd_to_bfqgd(cpd); |
646 |
++ |
647 |
++ d->weight = BFQ_DEFAULT_GRP_WEIGHT; |
648 |
++} |
649 |
++ |
650 |
++static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) |
651 |
++{ |
652 |
++ struct bfq_group *bfqg; |
653 |
++ |
654 |
++ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); |
655 |
++ if (!bfqg) |
656 |
++ return NULL; |
657 |
++ |
658 |
++ if (bfqg_stats_init(&bfqg->stats, gfp) || |
659 |
++ bfqg_stats_init(&bfqg->dead_stats, gfp)) { |
660 |
++ kfree(bfqg); |
661 |
++ return NULL; |
662 |
++ } |
663 |
++ |
664 |
++ return &bfqg->pd; |
665 |
++} |
666 |
++ |
667 |
++static void bfq_group_set_parent(struct bfq_group *bfqg, |
668 |
++ struct bfq_group *parent) |
669 |
++{ |
670 |
++ struct bfq_entity *entity; |
671 |
++ |
672 |
++ BUG_ON(!parent); |
673 |
++ BUG_ON(!bfqg); |
674 |
++ BUG_ON(bfqg == parent); |
675 |
++ |
676 |
++ entity = &bfqg->entity; |
677 |
++ entity->parent = parent->my_entity; |
678 |
++ entity->sched_data = &parent->sched_data; |
679 |
++} |
680 |
++ |
681 |
++static void bfq_pd_init(struct blkg_policy_data *pd) |
682 |
++{ |
683 |
++ struct blkcg_gq *blkg = pd_to_blkg(pd); |
684 |
++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); |
685 |
++ struct bfq_data *bfqd = blkg->q->elevator->elevator_data; |
686 |
++ struct bfq_entity *entity = &bfqg->entity; |
687 |
++ struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); |
688 |
++ |
689 |
++ entity->orig_weight = entity->weight = entity->new_weight = d->weight; |
690 |
++ entity->my_sched_data = &bfqg->sched_data; |
691 |
++ bfqg->my_entity = entity; /* |
692 |
++ * the root_group's will be set to NULL |
693 |
++ * in bfq_init_queue() |
694 |
++ */ |
695 |
++ bfqg->bfqd = bfqd; |
696 |
++ bfqg->active_entities = 0; |
697 |
++} |
698 |
++ |
699 |
++static void bfq_pd_free(struct blkg_policy_data *pd) |
700 |
++{ |
701 |
++ struct bfq_group *bfqg = pd_to_bfqg(pd); |
702 |
++ |
703 |
++ bfqg_stats_exit(&bfqg->stats); |
704 |
++ bfqg_stats_exit(&bfqg->dead_stats); |
705 |
++ |
706 |
++ return kfree(bfqg); |
707 |
++} |
708 |
++ |
709 |
++/* offset delta from bfqg->stats to bfqg->dead_stats */ |
710 |
++static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - |
711 |
++ offsetof(struct bfq_group, stats); |
712 |
++ |
713 |
++/* to be used by recursive prfill, sums live and dead stats recursively */ |
714 |
++static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) |
715 |
++{ |
716 |
++ u64 sum = 0; |
717 |
++ |
718 |
++ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); |
719 |
++ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, |
720 |
++ off + dead_stats_off_delta); |
721 |
++ return sum; |
722 |
++} |
723 |
++ |
724 |
++/* to be used by recursive prfill, sums live and dead rwstats recursively */ |
725 |
++static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, |
726 |
++ int off) |
727 |
++{ |
728 |
++ struct blkg_rwstat a, b; |
729 |
++ |
730 |
++ a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); |
731 |
++ b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, |
732 |
++ off + dead_stats_off_delta); |
733 |
++ blkg_rwstat_add_aux(&a, &b); |
734 |
++ return a; |
735 |
++} |
736 |
++ |
737 |
++static void bfq_pd_reset_stats(struct blkg_policy_data *pd) |
738 |
++{ |
739 |
++ struct bfq_group *bfqg = pd_to_bfqg(pd); |
740 |
++ |
741 |
++ bfqg_stats_reset(&bfqg->stats); |
742 |
++ bfqg_stats_reset(&bfqg->dead_stats); |
743 |
++} |
744 |
++ |
745 |
++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
746 |
++ struct blkcg *blkcg) |
747 |
++{ |
748 |
++ struct request_queue *q = bfqd->queue; |
749 |
++ struct bfq_group *bfqg = NULL, *parent; |
750 |
++ struct bfq_entity *entity = NULL; |
751 |
++ |
752 |
++ assert_spin_locked(bfqd->queue->queue_lock); |
753 |
++ |
754 |
++ /* avoid lookup for the common case where there's no blkcg */ |
755 |
++ if (blkcg == &blkcg_root) { |
756 |
++ bfqg = bfqd->root_group; |
757 |
++ } else { |
758 |
++ struct blkcg_gq *blkg; |
759 |
++ |
760 |
++ blkg = blkg_lookup_create(blkcg, q); |
761 |
++ if (!IS_ERR(blkg)) |
762 |
++ bfqg = blkg_to_bfqg(blkg); |
763 |
++ else /* fallback to root_group */ |
764 |
++ bfqg = bfqd->root_group; |
765 |
++ } |
766 |
++ |
767 |
++ BUG_ON(!bfqg); |
768 |
++ |
769 |
++ /* |
770 |
++ * Update chain of bfq_groups as we might be handling a leaf group |
771 |
++ * which, along with some of its relatives, has not been hooked yet |
772 |
++ * to the private hierarchy of BFQ. |
773 |
++ */ |
774 |
++ entity = &bfqg->entity; |
775 |
++ for_each_entity(entity) { |
776 |
++ bfqg = container_of(entity, struct bfq_group, entity); |
777 |
++ BUG_ON(!bfqg); |
778 |
++ if (bfqg != bfqd->root_group) { |
779 |
++ parent = bfqg_parent(bfqg); |
780 |
++ if (!parent) |
781 |
++ parent = bfqd->root_group; |
782 |
++ BUG_ON(!parent); |
783 |
++ bfq_group_set_parent(bfqg, parent); |
784 |
++ } |
785 |
++ } |
786 |
++ |
787 |
++ return bfqg; |
788 |
++} |
789 |
++ |
790 |
++/** |
791 |
++ * bfq_bfqq_move - migrate @bfqq to @bfqg. |
792 |
++ * @bfqd: queue descriptor. |
793 |
++ * @bfqq: the queue to move. |
794 |
++ * @entity: @bfqq's entity. |
795 |
++ * @bfqg: the group to move to. |
796 |
++ * |
797 |
++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating |
798 |
++ * it on the new one. Avoid putting the entity on the old group idle tree. |
799 |
++ * |
800 |
++ * Must be called under the queue lock; the cgroup owning @bfqg must |
801 |
++ * not disappear (by now this just means that we are called under |
802 |
++ * rcu_read_lock()). |
803 |
++ */ |
804 |
++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
805 |
++ struct bfq_entity *entity, struct bfq_group *bfqg) |
806 |
++{ |
807 |
++ int busy, resume; |
808 |
++ |
809 |
++ busy = bfq_bfqq_busy(bfqq); |
810 |
++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); |
811 |
++ |
812 |
++ BUG_ON(resume && !entity->on_st); |
813 |
++ BUG_ON(busy && !resume && entity->on_st && |
814 |
++ bfqq != bfqd->in_service_queue); |
815 |
++ |
816 |
++ if (busy) { |
817 |
++ BUG_ON(atomic_read(&bfqq->ref) < 2); |
818 |
++ |
819 |
++ if (!resume) |
820 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 0); |
821 |
++ else |
822 |
++ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
823 |
++ } else if (entity->on_st) |
824 |
++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); |
825 |
++ bfqg_put(bfqq_group(bfqq)); |
826 |
++ |
827 |
++ /* |
828 |
++ * Here we use a reference to bfqg. We don't need a refcounter |
829 |
++ * as the cgroup reference will not be dropped, so that its |
830 |
++ * destroy() callback will not be invoked. |
831 |
++ */ |
832 |
++ entity->parent = bfqg->my_entity; |
833 |
++ entity->sched_data = &bfqg->sched_data; |
834 |
++ bfqg_get(bfqg); |
835 |
++ |
836 |
++ if (busy) { |
837 |
++ if (resume) |
838 |
++ bfq_activate_bfqq(bfqd, bfqq); |
839 |
++ } |
840 |
++ |
841 |
++ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) |
842 |
++ bfq_schedule_dispatch(bfqd); |
843 |
++} |
844 |
++ |
845 |
++/** |
846 |
++ * __bfq_bic_change_cgroup - move @bic to @blkcg. |
847 |
++ * @bfqd: the queue descriptor. |
848 |
++ * @bic: the bic to move. |
849 |
++ * @blkcg: the blk-cgroup to move to. |
850 |
++ * |
851 |
++ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller |
852 |
++ * has to make sure that the reference to @blkcg is valid across the call. |
853 |
++ * |
854 |
++ * NOTE: an alternative approach might have been to store the current |
855 |
++ * cgroup in bfqq and getting a reference to it, reducing the lookup |
856 |
++ * time here, at the price of slightly more complex code. |
857 |
++ */ |
858 |
++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, |
859 |
++ struct bfq_io_cq *bic, |
860 |
++ struct blkcg *blkcg) |
861 |
++{ |
862 |
++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); |
863 |
++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); |
864 |
++ struct bfq_group *bfqg; |
865 |
++ struct bfq_entity *entity; |
866 |
++ |
867 |
++ lockdep_assert_held(bfqd->queue->queue_lock); |
868 |
++ |
869 |
++ bfqg = bfq_find_alloc_group(bfqd, blkcg); |
870 |
++ if (async_bfqq) { |
871 |
++ entity = &async_bfqq->entity; |
872 |
++ |
873 |
++ if (entity->sched_data != &bfqg->sched_data) { |
874 |
++ bic_set_bfqq(bic, NULL, 0); |
875 |
++ bfq_log_bfqq(bfqd, async_bfqq, |
876 |
++ "bic_change_group: %p %d", |
877 |
++ async_bfqq, atomic_read(&async_bfqq->ref)); |
878 |
++ bfq_put_queue(async_bfqq); |
879 |
++ } |
880 |
++ } |
881 |
++ |
882 |
++ if (sync_bfqq) { |
883 |
++ entity = &sync_bfqq->entity; |
884 |
++ if (entity->sched_data != &bfqg->sched_data) |
885 |
++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); |
886 |
++ } |
887 |
++ |
888 |
++ return bfqg; |
889 |
++} |
890 |
++ |
891 |
++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) |
892 |
++{ |
893 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
894 |
++ struct blkcg *blkcg; |
895 |
++ struct bfq_group *bfqg = NULL; |
896 |
++ uint64_t id; |
897 |
++ |
898 |
++ rcu_read_lock(); |
899 |
++ blkcg = bio_blkcg(bio); |
900 |
++ id = blkcg->css.serial_nr; |
901 |
++ rcu_read_unlock(); |
902 |
++ |
903 |
++ /* |
904 |
++ * Check whether blkcg has changed. The condition may trigger |
905 |
++ * spuriously on a newly created bic but there's no harm. |
906 |
++ */ |
907 |
++ if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) |
908 |
++ return; |
909 |
++ |
910 |
++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); |
911 |
++ BUG_ON(!bfqg); |
912 |
++ bic->blkcg_id = id; |
913 |
++} |
914 |
++ |
915 |
++/** |
916 |
++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. |
917 |
++ * @st: the service tree being flushed. |
918 |
++ */ |
919 |
++static void bfq_flush_idle_tree(struct bfq_service_tree *st) |
920 |
++{ |
921 |
++ struct bfq_entity *entity = st->first_idle; |
922 |
++ |
923 |
++ for (; entity ; entity = st->first_idle) |
924 |
++ __bfq_deactivate_entity(entity, 0); |
925 |
++} |
926 |
++ |
927 |
++/** |
928 |
++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. |
929 |
++ * @bfqd: the device data structure with the root group. |
930 |
++ * @entity: the entity to move. |
931 |
++ */ |
932 |
++static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, |
933 |
++ struct bfq_entity *entity) |
934 |
++{ |
935 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
936 |
++ |
937 |
++ BUG_ON(!bfqq); |
938 |
++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); |
939 |
++ return; |
940 |
++} |
941 |
++ |
942 |
++/** |
943 |
++ * bfq_reparent_active_entities - move to the root group all active |
944 |
++ * entities. |
945 |
++ * @bfqd: the device data structure with the root group. |
946 |
++ * @bfqg: the group to move from. |
947 |
++ * @st: the service tree with the entities. |
948 |
++ * |
949 |
++ * Needs queue_lock to be taken and reference to be valid over the call. |
950 |
++ */ |
951 |
++static void bfq_reparent_active_entities(struct bfq_data *bfqd, |
952 |
++ struct bfq_group *bfqg, |
953 |
++ struct bfq_service_tree *st) |
954 |
++{ |
955 |
++ struct rb_root *active = &st->active; |
956 |
++ struct bfq_entity *entity = NULL; |
957 |
++ |
958 |
++ if (!RB_EMPTY_ROOT(&st->active)) |
959 |
++ entity = bfq_entity_of(rb_first(active)); |
960 |
++ |
961 |
++ for (; entity ; entity = bfq_entity_of(rb_first(active))) |
962 |
++ bfq_reparent_leaf_entity(bfqd, entity); |
963 |
++ |
964 |
++ if (bfqg->sched_data.in_service_entity) |
965 |
++ bfq_reparent_leaf_entity(bfqd, |
966 |
++ bfqg->sched_data.in_service_entity); |
967 |
++ |
968 |
++ return; |
969 |
++} |
970 |
++ |
971 |
++/** |
972 |
++ * bfq_pd_offline - deactivate the group associated with @pd. |
973 |
++ * @pd: policy data of the group being destroyed. |
974 |
++ * |
975 |
++ * Destroy @bfqg, making sure that it is not referenced from its parent. |
976 |
++ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic |
977 |
++ */ |
978 |
++static void bfq_pd_offline(struct blkg_policy_data *pd) |
979 |
++{ |
980 |
++ struct bfq_service_tree *st; |
981 |
++ struct bfq_group *bfqg; |
982 |
++ struct bfq_data *bfqd; |
983 |
++ struct bfq_entity *entity; |
984 |
++ int i; |
985 |
++ |
986 |
++ BUG_ON(!pd); |
987 |
++ bfqg = pd_to_bfqg(pd); |
988 |
++ BUG_ON(!bfqg); |
989 |
++ bfqd = bfqg->bfqd; |
990 |
++ BUG_ON(bfqd && !bfqd->root_group); |
991 |
++ |
992 |
++ entity = bfqg->my_entity; |
993 |
++ |
994 |
++ if (!entity) /* root group */ |
995 |
++ return; |
996 |
++ |
997 |
++ /* |
998 |
++ * Empty all service_trees belonging to this group before |
999 |
++ * deactivating the group itself. |
1000 |
++ */ |
1001 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { |
1002 |
++ BUG_ON(!bfqg->sched_data.service_tree); |
1003 |
++ st = bfqg->sched_data.service_tree + i; |
1004 |
++ /* |
1005 |
++ * The idle tree may still contain bfq_queues belonging |
1006 |
++ * to exited tasks because they never migrated to a different |
1007 |
++ * cgroup from the one being destroyed now. No one else |
1008 |
++ * can access them so it's safe to act without any lock. |
1009 |
++ */ |
1010 |
++ bfq_flush_idle_tree(st); |
1011 |
++ |
1012 |
++ /* |
1013 |
++ * It may happen that some queues are still active |
1014 |
++ * (busy) upon group destruction (if the corresponding |
1015 |
++ * processes have been forced to terminate). We move |
1016 |
++ * all the leaf entities corresponding to these queues |
1017 |
++ * to the root_group. |
1018 |
++ * Also, it may happen that the group has an entity |
1019 |
++ * in service, which is disconnected from the active |
1020 |
++ * tree: it must be moved, too. |
1021 |
++ * There is no need to put the sync queues, as the |
1022 |
++ * scheduler has taken no reference. |
1023 |
++ */ |
1024 |
++ bfq_reparent_active_entities(bfqd, bfqg, st); |
1025 |
++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); |
1026 |
++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); |
1027 |
++ } |
1028 |
++ BUG_ON(bfqg->sched_data.next_in_service); |
1029 |
++ BUG_ON(bfqg->sched_data.in_service_entity); |
1030 |
++ |
1031 |
++ __bfq_deactivate_entity(entity, 0); |
1032 |
++ bfq_put_async_queues(bfqd, bfqg); |
1033 |
++ BUG_ON(entity->tree); |
1034 |
++ |
1035 |
++ bfqg_stats_xfer_dead(bfqg); |
1036 |
++} |
1037 |
++ |
1038 |
++static void bfq_end_wr_async(struct bfq_data *bfqd) |
1039 |
++{ |
1040 |
++ struct blkcg_gq *blkg; |
1041 |
++ |
1042 |
++ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { |
1043 |
++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); |
1044 |
++ |
1045 |
++ bfq_end_wr_async_queues(bfqd, bfqg); |
1046 |
++ } |
1047 |
++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); |
1048 |
++} |
1049 |
++ |
1050 |
++static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, |
1051 |
++ struct cftype *cftype) |
1052 |
++{ |
1053 |
++ struct blkcg *blkcg = css_to_blkcg(css); |
1054 |
++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); |
1055 |
++ int ret = -EINVAL; |
1056 |
++ |
1057 |
++ spin_lock_irq(&blkcg->lock); |
1058 |
++ ret = bfqgd->weight; |
1059 |
++ spin_unlock_irq(&blkcg->lock); |
1060 |
++ |
1061 |
++ return ret; |
1062 |
++} |
1063 |
++ |
1064 |
++static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) |
1065 |
++{ |
1066 |
++ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); |
1067 |
++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); |
1068 |
++ |
1069 |
++ spin_lock_irq(&blkcg->lock); |
1070 |
++ seq_printf(sf, "%u\n", bfqgd->weight); |
1071 |
++ spin_unlock_irq(&blkcg->lock); |
1072 |
++ |
1073 |
++ return 0; |
1074 |
++} |
1075 |
++ |
1076 |
++static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, |
1077 |
++ struct cftype *cftype, |
1078 |
++ u64 val) |
1079 |
++{ |
1080 |
++ struct blkcg *blkcg = css_to_blkcg(css); |
1081 |
++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); |
1082 |
++ struct blkcg_gq *blkg; |
1083 |
++ int ret = -EINVAL; |
1084 |
++ |
1085 |
++ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) |
1086 |
++ return ret; |
1087 |
++ |
1088 |
++ ret = 0; |
1089 |
++ spin_lock_irq(&blkcg->lock); |
1090 |
++ bfqgd->weight = (unsigned short)val; |
1091 |
++ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { |
1092 |
++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); |
1093 |
++ if (!bfqg) |
1094 |
++ continue; |
1095 |
++ /* |
1096 |
++ * Setting the prio_changed flag of the entity |
1097 |
++ * to 1 with new_weight == weight would re-set |
1098 |
++ * the value of the weight to its ioprio mapping. |
1099 |
++ * Set the flag only if necessary. |
1100 |
++ */ |
1101 |
++ if ((unsigned short)val != bfqg->entity.new_weight) { |
1102 |
++ bfqg->entity.new_weight = (unsigned short)val; |
1103 |
++ /* |
1104 |
++ * Make sure that the above new value has been |
1105 |
++ * stored in bfqg->entity.new_weight before |
1106 |
++ * setting the prio_changed flag. In fact, |
1107 |
++ * this flag may be read asynchronously (in |
1108 |
++ * critical sections protected by a different |
1109 |
++ * lock than that held here), and finding this |
1110 |
++ * flag set may cause the execution of the code |
1111 |
++ * for updating parameters whose value may |
1112 |
++ * depend also on bfqg->entity.new_weight (in |
1113 |
++ * __bfq_entity_update_weight_prio). |
1114 |
++ * This barrier makes sure that the new value |
1115 |
++ * of bfqg->entity.new_weight is correctly |
1116 |
++ * seen in that code. |
1117 |
++ */ |
1118 |
++ smp_wmb(); |
1119 |
++ bfqg->entity.prio_changed = 1; |
1120 |
++ } |
1121 |
++ } |
1122 |
++ spin_unlock_irq(&blkcg->lock); |
1123 |
++ |
1124 |
++ return ret; |
1125 |
++} |
1126 |
++ |
1127 |
++static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, |
1128 |
++ char *buf, size_t nbytes, |
1129 |
++ loff_t off) |
1130 |
++{ |
1131 |
++ /* First unsigned long found in the file is used; success returns nbytes */ |
1132 |
++ return bfqio_cgroup_weight_write(of_css(of), NULL, |
1133 |
++ simple_strtoull(strim(buf), NULL, 0)) ?: nbytes; |
1134 |
++} |
1135 |
++ |
1136 |
++static int bfqg_print_stat(struct seq_file *sf, void *v) |
1137 |
++{ |
1138 |
++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, |
1139 |
++ &blkcg_policy_bfq, seq_cft(sf)->private, false); |
1140 |
++ return 0; |
1141 |
++} |
1142 |
++ |
1143 |
++static int bfqg_print_rwstat(struct seq_file *sf, void *v) |
1144 |
++{ |
1145 |
++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, |
1146 |
++ &blkcg_policy_bfq, seq_cft(sf)->private, true); |
1147 |
++ return 0; |
1148 |
++} |
1149 |
++ |
1150 |
++static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, |
1151 |
++ struct blkg_policy_data *pd, int off) |
1152 |
++{ |
1153 |
++ u64 sum = bfqg_stat_pd_recursive_sum(pd, off); |
1154 |
++ |
1155 |
++ return __blkg_prfill_u64(sf, pd, sum); |
1156 |
++} |
1157 |
++ |
1158 |
++static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, |
1159 |
++ struct blkg_policy_data *pd, int off) |
1160 |
++{ |
1161 |
++ struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); |
1162 |
++ |
1163 |
++ return __blkg_prfill_rwstat(sf, pd, &sum); |
1164 |
++} |
1165 |
++ |
1166 |
++static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) |
1167 |
++{ |
1168 |
++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), |
1169 |
++ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, |
1170 |
++ seq_cft(sf)->private, false); |
1171 |
++ return 0; |
1172 |
++} |
1173 |
++ |
1174 |
++static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) |
1175 |
++{ |
1176 |
++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), |
1177 |
++ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, |
1178 |
++ seq_cft(sf)->private, true); |
1179 |
++ return 0; |
1180 |
++} |
1181 |
++ |
1182 |
++static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, |
1183 |
++ struct blkg_policy_data *pd, int off) |
1184 |
++{ |
1185 |
++ struct bfq_group *bfqg = pd_to_bfqg(pd); |
1186 |
++ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); |
1187 |
++ u64 v = 0; |
1188 |
++ |
1189 |
++ if (samples) { |
1190 |
++ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); |
1191 |
++ v = div64_u64(v, samples); |
1192 |
++ } |
1193 |
++ __blkg_prfill_u64(sf, pd, v); |
1194 |
++ return 0; |
1195 |
++} |
1196 |
++ |
1197 |
++/* print avg_queue_size */ |
1198 |
++static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) |
1199 |
++{ |
1200 |
++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), |
1201 |
++ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, |
1202 |
++ 0, false); |
1203 |
++ return 0; |
1204 |
++} |
1205 |
++ |
1206 |
++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) |
1207 |
++{ |
1208 |
++ int ret; |
1209 |
++ |
1210 |
++ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); |
1211 |
++ if (ret) |
1212 |
++ return NULL; |
1213 |
++ |
1214 |
++ return blkg_to_bfqg(bfqd->queue->root_blkg); |
1215 |
++} |
1216 |
++ |
1217 |
++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) |
1218 |
++{ |
1219 |
++ struct bfq_group_data *bgd; |
1220 |
++ |
1221 |
++ bgd = kzalloc(sizeof(*bgd), gfp); |
1222 |
++ if (!bgd) |
1223 |
++ return NULL; |
1224 |
++ return &bgd->pd; |
1225 |
++} |
1226 |
++ |
1227 |
++static void bfq_cpd_free(struct blkcg_policy_data *cpd) |
1228 |
++{ |
1229 |
++ kfree(cpd_to_bfqgd(cpd)); |
1230 |
++} |
1231 |
++ |
1232 |
++static struct cftype bfqio_files_dfl[] = { |
1233 |
++ { |
1234 |
++ .name = "weight", |
1235 |
++ .flags = CFTYPE_NOT_ON_ROOT, |
1236 |
++ .seq_show = bfqio_cgroup_weight_read_dfl, |
1237 |
++ .write = bfqio_cgroup_weight_write_dfl, |
1238 |
++ }, |
1239 |
++ {} /* terminate */ |
1240 |
++}; |
1241 |
++ |
1242 |
++static struct cftype bfqio_files[] = { |
1243 |
++ { |
1244 |
++ .name = "bfq.weight", |
1245 |
++ .read_u64 = bfqio_cgroup_weight_read, |
1246 |
++ .write_u64 = bfqio_cgroup_weight_write, |
1247 |
++ }, |
1248 |
++ /* statistics, cover only the tasks in the bfqg */ |
1249 |
++ { |
1250 |
++ .name = "bfq.time", |
1251 |
++ .private = offsetof(struct bfq_group, stats.time), |
1252 |
++ .seq_show = bfqg_print_stat, |
1253 |
++ }, |
1254 |
++ { |
1255 |
++ .name = "bfq.sectors", |
1256 |
++ .private = offsetof(struct bfq_group, stats.sectors), |
1257 |
++ .seq_show = bfqg_print_stat, |
1258 |
++ }, |
1259 |
++ { |
1260 |
++ .name = "bfq.io_service_bytes", |
1261 |
++ .private = offsetof(struct bfq_group, stats.service_bytes), |
1262 |
++ .seq_show = bfqg_print_rwstat, |
1263 |
++ }, |
1264 |
++ { |
1265 |
++ .name = "bfq.io_serviced", |
1266 |
++ .private = offsetof(struct bfq_group, stats.serviced), |
1267 |
++ .seq_show = bfqg_print_rwstat, |
1268 |
++ }, |
1269 |
++ { |
1270 |
++ .name = "bfq.io_service_time", |
1271 |
++ .private = offsetof(struct bfq_group, stats.service_time), |
1272 |
++ .seq_show = bfqg_print_rwstat, |
1273 |
++ }, |
1274 |
++ { |
1275 |
++ .name = "bfq.io_wait_time", |
1276 |
++ .private = offsetof(struct bfq_group, stats.wait_time), |
1277 |
++ .seq_show = bfqg_print_rwstat, |
1278 |
++ }, |
1279 |
++ { |
1280 |
++ .name = "bfq.io_merged", |
1281 |
++ .private = offsetof(struct bfq_group, stats.merged), |
1282 |
++ .seq_show = bfqg_print_rwstat, |
1283 |
++ }, |
1284 |
++ { |
1285 |
++ .name = "bfq.io_queued", |
1286 |
++ .private = offsetof(struct bfq_group, stats.queued), |
1287 |
++ .seq_show = bfqg_print_rwstat, |
1288 |
++ }, |
1289 |
++ |
1290 |
++ /* the same statistics which cover the bfqg and its descendants */ |
1291 |
++ { |
1292 |
++ .name = "bfq.time_recursive", |
1293 |
++ .private = offsetof(struct bfq_group, stats.time), |
1294 |
++ .seq_show = bfqg_print_stat_recursive, |
1295 |
++ }, |
1296 |
++ { |
1297 |
++ .name = "bfq.sectors_recursive", |
1298 |
++ .private = offsetof(struct bfq_group, stats.sectors), |
1299 |
++ .seq_show = bfqg_print_stat_recursive, |
1300 |
++ }, |
1301 |
++ { |
1302 |
++ .name = "bfq.io_service_bytes_recursive", |
1303 |
++ .private = offsetof(struct bfq_group, stats.service_bytes), |
1304 |
++ .seq_show = bfqg_print_rwstat_recursive, |
1305 |
++ }, |
1306 |
++ { |
1307 |
++ .name = "bfq.io_serviced_recursive", |
1308 |
++ .private = offsetof(struct bfq_group, stats.serviced), |
1309 |
++ .seq_show = bfqg_print_rwstat_recursive, |
1310 |
++ }, |
1311 |
++ { |
1312 |
++ .name = "bfq.io_service_time_recursive", |
1313 |
++ .private = offsetof(struct bfq_group, stats.service_time), |
1314 |
++ .seq_show = bfqg_print_rwstat_recursive, |
1315 |
++ }, |
1316 |
++ { |
1317 |
++ .name = "bfq.io_wait_time_recursive", |
1318 |
++ .private = offsetof(struct bfq_group, stats.wait_time), |
1319 |
++ .seq_show = bfqg_print_rwstat_recursive, |
1320 |
++ }, |
1321 |
++ { |
1322 |
++ .name = "bfq.io_merged_recursive", |
1323 |
++ .private = offsetof(struct bfq_group, stats.merged), |
1324 |
++ .seq_show = bfqg_print_rwstat_recursive, |
1325 |
++ }, |
1326 |
++ { |
1327 |
++ .name = "bfq.io_queued_recursive", |
1328 |
++ .private = offsetof(struct bfq_group, stats.queued), |
1329 |
++ .seq_show = bfqg_print_rwstat_recursive, |
1330 |
++ }, |
1331 |
++ { |
1332 |
++ .name = "bfq.avg_queue_size", |
1333 |
++ .seq_show = bfqg_print_avg_queue_size, |
1334 |
++ }, |
1335 |
++ { |
1336 |
++ .name = "bfq.group_wait_time", |
1337 |
++ .private = offsetof(struct bfq_group, stats.group_wait_time), |
1338 |
++ .seq_show = bfqg_print_stat, |
1339 |
++ }, |
1340 |
++ { |
1341 |
++ .name = "bfq.idle_time", |
1342 |
++ .private = offsetof(struct bfq_group, stats.idle_time), |
1343 |
++ .seq_show = bfqg_print_stat, |
1344 |
++ }, |
1345 |
++ { |
1346 |
++ .name = "bfq.empty_time", |
1347 |
++ .private = offsetof(struct bfq_group, stats.empty_time), |
1348 |
++ .seq_show = bfqg_print_stat, |
1349 |
++ }, |
1350 |
++ { |
1351 |
++ .name = "bfq.dequeue", |
1352 |
++ .private = offsetof(struct bfq_group, stats.dequeue), |
1353 |
++ .seq_show = bfqg_print_stat, |
1354 |
++ }, |
1355 |
++ { |
1356 |
++ .name = "bfq.unaccounted_time", |
1357 |
++ .private = offsetof(struct bfq_group, stats.unaccounted_time), |
1358 |
++ .seq_show = bfqg_print_stat, |
1359 |
++ }, |
1360 |
++ { } /* terminate */ |
1361 |
++}; |
1362 |
++ |
1363 |
++static struct blkcg_policy blkcg_policy_bfq = { |
1364 |
++ .dfl_cftypes = bfqio_files_dfl, |
1365 |
++ .legacy_cftypes = bfqio_files, |
1366 |
++ |
1367 |
++ .pd_alloc_fn = bfq_pd_alloc, |
1368 |
++ .pd_init_fn = bfq_pd_init, |
1369 |
++ .pd_offline_fn = bfq_pd_offline, |
1370 |
++ .pd_free_fn = bfq_pd_free, |
1371 |
++ .pd_reset_stats_fn = bfq_pd_reset_stats, |
1372 |
++ |
1373 |
++ .cpd_alloc_fn = bfq_cpd_alloc, |
1374 |
++ .cpd_init_fn = bfq_cpd_init, |
1375 |
++ .cpd_bind_fn = bfq_cpd_init, |
1376 |
++ .cpd_free_fn = bfq_cpd_free, |
1377 |
++ |
1378 |
++}; |
1379 |
++ |
1380 |
++#else |
1381 |
++ |
1382 |
++static void bfq_init_entity(struct bfq_entity *entity, |
1383 |
++ struct bfq_group *bfqg) |
1384 |
++{ |
1385 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
1386 |
++ entity->weight = entity->new_weight; |
1387 |
++ entity->orig_weight = entity->new_weight; |
1388 |
++ if (bfqq) { |
1389 |
++ bfqq->ioprio = bfqq->new_ioprio; |
1390 |
++ bfqq->ioprio_class = bfqq->new_ioprio_class; |
1391 |
++ } |
1392 |
++ entity->sched_data = &bfqg->sched_data; |
1393 |
++} |
1394 |
++ |
1395 |
++static struct bfq_group * |
1396 |
++bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) |
1397 |
++{ |
1398 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
1399 |
++ return bfqd->root_group; |
1400 |
++} |
1401 |
++ |
1402 |
++static void bfq_bfqq_move(struct bfq_data *bfqd, |
1403 |
++ struct bfq_queue *bfqq, |
1404 |
++ struct bfq_entity *entity, |
1405 |
++ struct bfq_group *bfqg) |
1406 |
++{ |
1407 |
++} |
1408 |
++ |
1409 |
++static void bfq_end_wr_async(struct bfq_data *bfqd) |
1410 |
++{ |
1411 |
++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); |
1412 |
++} |
1413 |
++ |
1414 |
++static void bfq_disconnect_groups(struct bfq_data *bfqd) |
1415 |
++{ |
1416 |
++ bfq_put_async_queues(bfqd, bfqd->root_group); |
1417 |
++} |
1418 |
++ |
1419 |
++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
1420 |
++ struct blkcg *blkcg) |
1421 |
++{ |
1422 |
++ return bfqd->root_group; |
1423 |
++} |
1424 |
++ |
1425 |
++static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) |
1426 |
++{ |
1427 |
++ struct bfq_group *bfqg; |
1428 |
++ int i; |
1429 |
++ |
1430 |
++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); |
1431 |
++ if (!bfqg) |
1432 |
++ return NULL; |
1433 |
++ |
1434 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
1435 |
++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
1436 |
++ |
1437 |
++ return bfqg; |
1438 |
++} |
1439 |
++#endif |
1440 |
+diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c |
1441 |
+new file mode 100644 |
1442 |
+index 0000000..fb7bb8f |
1443 |
+--- /dev/null |
1444 |
++++ b/block/bfq-ioc.c |
1445 |
+@@ -0,0 +1,36 @@ |
1446 |
++/* |
1447 |
++ * BFQ: I/O context handling. |
1448 |
++ * |
1449 |
++ * Based on ideas and code from CFQ: |
1450 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
1451 |
++ * |
1452 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
1453 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
1454 |
++ * |
1455 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
1456 |
++ */ |
1457 |
++ |
1458 |
++/** |
1459 |
++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. |
1460 |
++ * @icq: the iocontext queue. |
1461 |
++ */ |
1462 |
++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) |
1463 |
++{ |
1464 |
++ /* bic->icq is the first member, %NULL will convert to %NULL */ |
1465 |
++ return container_of(icq, struct bfq_io_cq, icq); |
1466 |
++} |
1467 |
++ |
1468 |
++/** |
1469 |
++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. |
1470 |
++ * @bfqd: the lookup key. |
1471 |
++ * @ioc: the io_context of the process doing I/O. |
1472 |
++ * |
1473 |
++ * Queue lock must be held. |
1474 |
++ */ |
1475 |
++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, |
1476 |
++ struct io_context *ioc) |
1477 |
++{ |
1478 |
++ if (ioc) |
1479 |
++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); |
1480 |
++ return NULL; |
1481 |
++} |
1482 |
+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
1483 |
+new file mode 100644 |
1484 |
+index 0000000..f9787a6 |
1485 |
+--- /dev/null |
1486 |
++++ b/block/bfq-iosched.c |
1487 |
+@@ -0,0 +1,3754 @@ |
1488 |
++/* |
1489 |
++ * Budget Fair Queueing (BFQ) disk scheduler. |
1490 |
++ * |
1491 |
++ * Based on ideas and code from CFQ: |
1492 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
1493 |
++ * |
1494 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
1495 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
1496 |
++ * |
1497 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
1498 |
++ * |
1499 |
++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ |
1500 |
++ * file. |
1501 |
++ * |
1502 |
++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on |
1503 |
++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, |
1504 |
++ * measured in number of sectors, to processes instead of time slices. The |
1505 |
++ * device is not granted to the in-service process for a given time slice, |
1506 |
++ * but until it has exhausted its assigned budget. This change from the time |
1507 |
++ * to the service domain allows BFQ to distribute the device throughput |
1508 |
++ * among processes as desired, without any distortion due to ZBR, workload |
1509 |
++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, |
1510 |
++ * called B-WF2Q+, to schedule processes according to their budgets. More |
1511 |
++ * precisely, BFQ schedules queues associated to processes. Thanks to the |
1512 |
++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to |
1513 |
++ * I/O-bound processes issuing sequential requests (to boost the |
1514 |
++ * throughput), and yet guarantee a low latency to interactive and soft |
1515 |
++ * real-time applications. |
1516 |
++ * |
1517 |
++ * BFQ is described in [1], where also a reference to the initial, more |
1518 |
++ * theoretical paper on BFQ can be found. The interested reader can find |
1519 |
++ * in the latter paper full details on the main algorithm, as well as |
1520 |
++ * formulas of the guarantees and formal proofs of all the properties. |
1521 |
++ * With respect to the version of BFQ presented in these papers, this |
1522 |
++ * implementation adds a few more heuristics, such as the one that |
1523 |
++ * guarantees a low latency to soft real-time applications, and a |
1524 |
++ * hierarchical extension based on H-WF2Q+. |
1525 |
++ * |
1526 |
++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with |
1527 |
++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) |
1528 |
++ * complexity derives from the one introduced with EEVDF in [3]. |
1529 |
++ * |
1530 |
++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness |
1531 |
++ * with the BFQ Disk I/O Scheduler'', |
1532 |
++ * Proceedings of the 5th Annual International Systems and Storage |
1533 |
++ * Conference (SYSTOR '12), June 2012. |
1534 |
++ * |
1535 |
++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf |
1536 |
++ * |
1537 |
++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing |
1538 |
++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, |
1539 |
++ * Oct 1997. |
1540 |
++ * |
1541 |
++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz |
1542 |
++ * |
1543 |
++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline |
1544 |
++ * First: A Flexible and Accurate Mechanism for Proportional Share |
1545 |
++ * Resource Allocation,'' technical report. |
1546 |
++ * |
1547 |
++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf |
1548 |
++ */ |
1549 |
++#include <linux/module.h> |
1550 |
++#include <linux/slab.h> |
1551 |
++#include <linux/blkdev.h> |
1552 |
++#include <linux/cgroup.h> |
1553 |
++#include <linux/elevator.h> |
1554 |
++#include <linux/jiffies.h> |
1555 |
++#include <linux/rbtree.h> |
1556 |
++#include <linux/ioprio.h> |
1557 |
++#include "bfq.h" |
1558 |
++#include "blk.h" |
1559 |
++ |
1560 |
++/* Expiration time of sync (0) and async (1) requests, in jiffies. */ |
1561 |
++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; |
1562 |
++ |
1563 |
++/* Maximum backwards seek, in KiB. */ |
1564 |
++static const int bfq_back_max = 16 * 1024; |
1565 |
++ |
1566 |
++/* Penalty of a backwards seek, in number of sectors. */ |
1567 |
++static const int bfq_back_penalty = 2; |
1568 |
++ |
1569 |
++/* Idling period duration, in jiffies. */ |
1570 |
++static int bfq_slice_idle = HZ / 125; |
1571 |
++ |
1572 |
++/* Minimum number of assigned budgets for which stats are safe to compute. */ |
1573 |
++static const int bfq_stats_min_budgets = 194; |
1574 |
++ |
1575 |
++/* Default maximum budget values, in sectors and number of requests. */ |
1576 |
++static const int bfq_default_max_budget = 16 * 1024; |
1577 |
++static const int bfq_max_budget_async_rq = 4; |
1578 |
++ |
1579 |
++/* |
1580 |
++ * Async to sync throughput distribution is controlled as follows: |
1581 |
++ * when an async request is served, the entity is charged the number |
1582 |
++ * of sectors of the request, multiplied by the factor below |
1583 |
++ */ |
1584 |
++static const int bfq_async_charge_factor = 10; |
1585 |
++ |
1586 |
++/* Default timeout values, in jiffies, approximating CFQ defaults. */ |
1587 |
++static const int bfq_timeout_sync = HZ / 8; |
1588 |
++static int bfq_timeout_async = HZ / 25; |
1589 |
++ |
1590 |
++struct kmem_cache *bfq_pool; |
1591 |
++ |
1592 |
++/* Below this threshold (in ms), we consider thinktime immediate. */ |
1593 |
++#define BFQ_MIN_TT 2 |
1594 |
++ |
1595 |
++/* hw_tag detection: parallel requests threshold and min samples needed. */ |
1596 |
++#define BFQ_HW_QUEUE_THRESHOLD 4 |
1597 |
++#define BFQ_HW_QUEUE_SAMPLES 32 |
1598 |
++ |
1599 |
++#define BFQQ_SEEK_THR (sector_t)(8 * 1024) |
1600 |
++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) |
1601 |
++ |
1602 |
++/* Min samples used for peak rate estimation (for autotuning). */ |
1603 |
++#define BFQ_PEAK_RATE_SAMPLES 32 |
1604 |
++ |
1605 |
++/* Shift used for peak rate fixed precision calculations. */ |
1606 |
++#define BFQ_RATE_SHIFT 16 |
1607 |
++ |
1608 |
++/* |
1609 |
++ * By default, BFQ computes the duration of the weight raising for |
1610 |
++ * interactive applications automatically, using the following formula: |
1611 |
++ * duration = (R / r) * T, where r is the peak rate of the device, and |
1612 |
++ * R and T are two reference parameters. |
1613 |
++ * In particular, R is the peak rate of the reference device (see below), |
1614 |
++ * and T is a reference time: given the systems that are likely to be |
1615 |
++ * installed on the reference device according to its speed class, T is |
1616 |
++ * about the maximum time needed, under BFQ and while reading two files in |
1617 |
++ * parallel, to load typical large applications on these systems. |
1618 |
++ * In practice, the slower/faster the device at hand is, the more/less it |
1619 |
++ * takes to load applications with respect to the reference device. |
1620 |
++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive |
1621 |
++ * applications. |
1622 |
++ * |
1623 |
++ * BFQ uses four different reference pairs (R, T), depending on: |
1624 |
++ * . whether the device is rotational or non-rotational; |
1625 |
++ * . whether the device is slow, such as old or portable HDDs, as well as |
1626 |
++ * SD cards, or fast, such as newer HDDs and SSDs. |
1627 |
++ * |
1628 |
++ * The device's speed class is dynamically (re)detected in |
1629 |
++ * bfq_update_peak_rate() every time the estimated peak rate is updated. |
1630 |
++ * |
1631 |
++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] |
1632 |
++ * are the reference values for a slow/fast rotational device, whereas |
1633 |
++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for |
1634 |
++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the |
1635 |
++ * thresholds used to switch between speed classes. |
1636 |
++ * Both the reference peak rates and the thresholds are measured in |
1637 |
++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. |
1638 |
++ */ |
1639 |
++static int R_slow[2] = {1536, 10752}; |
1640 |
++static int R_fast[2] = {17415, 34791}; |
1641 |
++/* |
1642 |
++ * To improve readability, a conversion function is used to initialize the |
1643 |
++ * following arrays, which entails that they can be initialized only in a |
1644 |
++ * function. |
1645 |
++ */ |
1646 |
++static int T_slow[2]; |
1647 |
++static int T_fast[2]; |
1648 |
++static int device_speed_thresh[2]; |
1649 |
++ |
1650 |
++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ |
1651 |
++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) |
1652 |
++ |
1653 |
++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) |
1654 |
++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) |
1655 |
++ |
1656 |
++static void bfq_schedule_dispatch(struct bfq_data *bfqd); |
1657 |
++ |
1658 |
++#include "bfq-ioc.c" |
1659 |
++#include "bfq-sched.c" |
1660 |
++#include "bfq-cgroup.c" |
1661 |
++ |
1662 |
++#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) |
1663 |
++#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) |
1664 |
++ |
1665 |
++#define bfq_sample_valid(samples) ((samples) > 80) |
1666 |
++ |
1667 |
++/* |
1668 |
++ * We regard a request as SYNC, if either it's a read or has the SYNC bit |
1669 |
++ * set (in which case it could also be a direct WRITE). |
1670 |
++ */ |
1671 |
++static int bfq_bio_sync(struct bio *bio) |
1672 |
++{ |
1673 |
++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) |
1674 |
++ return 1; |
1675 |
++ |
1676 |
++ return 0; |
1677 |
++} |
1678 |
++ |
1679 |
++/* |
1680 |
++ * Scheduler run of queue, if there are requests pending and no one in the |
1681 |
++ * driver that will restart queueing. |
1682 |
++ */ |
1683 |
++static void bfq_schedule_dispatch(struct bfq_data *bfqd) |
1684 |
++{ |
1685 |
++ if (bfqd->queued != 0) { |
1686 |
++ bfq_log(bfqd, "schedule dispatch"); |
1687 |
++ kblockd_schedule_work(&bfqd->unplug_work); |
1688 |
++ } |
1689 |
++} |
1690 |
++ |
1691 |
++/* |
1692 |
++ * Lifted from AS - choose which of rq1 and rq2 is best served now. |
1693 |
++ * We choose the request that is closer to the head right now. Distance |
1694 |
++ * behind the head is penalized and only allowed to a certain extent. |
1695 |
++ */ |
1696 |
++static struct request *bfq_choose_req(struct bfq_data *bfqd, |
1697 |
++ struct request *rq1, |
1698 |
++ struct request *rq2, |
1699 |
++ sector_t last) |
1700 |
++{ |
1701 |
++ sector_t s1, s2, d1 = 0, d2 = 0; |
1702 |
++ unsigned long back_max; |
1703 |
++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ |
1704 |
++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ |
1705 |
++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ |
1706 |
++ |
1707 |
++ if (!rq1 || rq1 == rq2) |
1708 |
++ return rq2; |
1709 |
++ if (!rq2) |
1710 |
++ return rq1; |
1711 |
++ |
1712 |
++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) |
1713 |
++ return rq1; |
1714 |
++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) |
1715 |
++ return rq2; |
1716 |
++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) |
1717 |
++ return rq1; |
1718 |
++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) |
1719 |
++ return rq2; |
1720 |
++ |
1721 |
++ s1 = blk_rq_pos(rq1); |
1722 |
++ s2 = blk_rq_pos(rq2); |
1723 |
++ |
1724 |
++ /* |
1725 |
++ * By definition, 1KiB is 2 sectors. |
1726 |
++ */ |
1727 |
++ back_max = bfqd->bfq_back_max * 2; |
1728 |
++ |
1729 |
++ /* |
1730 |
++ * Strict one way elevator _except_ in the case where we allow |
1731 |
++ * short backward seeks which are biased as twice the cost of a |
1732 |
++ * similar forward seek. |
1733 |
++ */ |
1734 |
++ if (s1 >= last) |
1735 |
++ d1 = s1 - last; |
1736 |
++ else if (s1 + back_max >= last) |
1737 |
++ d1 = (last - s1) * bfqd->bfq_back_penalty; |
1738 |
++ else |
1739 |
++ wrap |= BFQ_RQ1_WRAP; |
1740 |
++ |
1741 |
++ if (s2 >= last) |
1742 |
++ d2 = s2 - last; |
1743 |
++ else if (s2 + back_max >= last) |
1744 |
++ d2 = (last - s2) * bfqd->bfq_back_penalty; |
1745 |
++ else |
1746 |
++ wrap |= BFQ_RQ2_WRAP; |
1747 |
++ |
1748 |
++ /* Found required data */ |
1749 |
++ |
1750 |
++ /* |
1751 |
++ * By doing switch() on the bit mask "wrap" we avoid having to |
1752 |
++ * check two variables for all permutations: --> faster! |
1753 |
++ */ |
1754 |
++ switch (wrap) { |
1755 |
++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ |
1756 |
++ if (d1 < d2) |
1757 |
++ return rq1; |
1758 |
++ else if (d2 < d1) |
1759 |
++ return rq2; |
1760 |
++ else { |
1761 |
++ if (s1 >= s2) |
1762 |
++ return rq1; |
1763 |
++ else |
1764 |
++ return rq2; |
1765 |
++ } |
1766 |
++ |
1767 |
++ case BFQ_RQ2_WRAP: |
1768 |
++ return rq1; |
1769 |
++ case BFQ_RQ1_WRAP: |
1770 |
++ return rq2; |
1771 |
++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ |
1772 |
++ default: |
1773 |
++ /* |
1774 |
++ * Since both rqs are wrapped, |
1775 |
++ * start with the one that's further behind head |
1776 |
++ * (--> only *one* back seek required), |
1777 |
++ * since back seek takes more time than forward. |
1778 |
++ */ |
1779 |
++ if (s1 <= s2) |
1780 |
++ return rq1; |
1781 |
++ else |
1782 |
++ return rq2; |
1783 |
++ } |
1784 |
++} |
1785 |
++ |
1786 |
++/* |
1787 |
++ * Tell whether there are active queues or groups with differentiated weights. |
1788 |
++ */ |
1789 |
++static bool bfq_differentiated_weights(struct bfq_data *bfqd) |
1790 |
++{ |
1791 |
++ /* |
1792 |
++ * For weights to differ, at least one of the trees must contain |
1793 |
++ * at least two nodes. |
1794 |
++ */ |
1795 |
++ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && |
1796 |
++ (bfqd->queue_weights_tree.rb_node->rb_left || |
1797 |
++ bfqd->queue_weights_tree.rb_node->rb_right) |
1798 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
1799 |
++ ) || |
1800 |
++ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && |
1801 |
++ (bfqd->group_weights_tree.rb_node->rb_left || |
1802 |
++ bfqd->group_weights_tree.rb_node->rb_right) |
1803 |
++#endif |
1804 |
++ ); |
1805 |
++} |
1806 |
++ |
1807 |
++/* |
1808 |
++ * The following function returns true if every queue must receive the |
1809 |
++ * same share of the throughput (this condition is used when deciding |
1810 |
++ * whether idling may be disabled, see the comments in the function |
1811 |
++ * bfq_bfqq_may_idle()). |
1812 |
++ * |
1813 |
++ * Such a scenario occurs when: |
1814 |
++ * 1) all active queues have the same weight, |
1815 |
++ * 2) all active groups at the same level in the groups tree have the same |
1816 |
++ * weight, |
1817 |
++ * 3) all active groups at the same level in the groups tree have the same |
1818 |
++ * number of children. |
1819 |
++ * |
1820 |
++ * Unfortunately, keeping the necessary state for evaluating exactly the |
1821 |
++ * above symmetry conditions would be quite complex and time-consuming. |
1822 |
++ * Therefore this function evaluates, instead, the following stronger |
1823 |
++ * sub-conditions, for which it is much easier to maintain the needed |
1824 |
++ * state: |
1825 |
++ * 1) all active queues have the same weight, |
1826 |
++ * 2) all active groups have the same weight, |
1827 |
++ * 3) all active groups have at most one active child each. |
1828 |
++ * In particular, the last two conditions are always true if hierarchical |
1829 |
++ * support and the cgroups interface are not enabled, thus no state needs |
1830 |
++ * to be maintained in this case. |
1831 |
++ */ |
1832 |
++static bool bfq_symmetric_scenario(struct bfq_data *bfqd) |
1833 |
++{ |
1834 |
++ return |
1835 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
1836 |
++ !bfqd->active_numerous_groups && |
1837 |
++#endif |
1838 |
++ !bfq_differentiated_weights(bfqd); |
1839 |
++} |
1840 |
++ |
1841 |
++/* |
1842 |
++ * If the weight-counter tree passed as input contains no counter for |
1843 |
++ * the weight of the input entity, then add that counter; otherwise just |
1844 |
++ * increment the existing counter. |
1845 |
++ * |
1846 |
++ * Note that weight-counter trees contain few nodes in mostly symmetric |
1847 |
++ * scenarios. For example, if all queues have the same weight, then the |
1848 |
++ * weight-counter tree for the queues may contain at most one node. |
1849 |
++ * This holds even if low_latency is on, because weight-raised queues |
1850 |
++ * are not inserted in the tree. |
1851 |
++ * In most scenarios, the rate at which nodes are created/destroyed |
1852 |
++ * should be low too. |
1853 |
++ */ |
1854 |
++static void bfq_weights_tree_add(struct bfq_data *bfqd, |
1855 |
++ struct bfq_entity *entity, |
1856 |
++ struct rb_root *root) |
1857 |
++{ |
1858 |
++ struct rb_node **new = &(root->rb_node), *parent = NULL; |
1859 |
++ |
1860 |
++ /* |
1861 |
++ * Do not insert if the entity is already associated with a |
1862 |
++ * counter, which happens if: |
1863 |
++ * 1) the entity is associated with a queue, |
1864 |
++ * 2) a request arrival has caused the queue to become both |
1865 |
++ * non-weight-raised, and hence change its weight, and |
1866 |
++ * backlogged; in this respect, each of the two events |
1867 |
++ * causes an invocation of this function, |
1868 |
++ * 3) this is the invocation of this function caused by the |
1869 |
++ * second event. This second invocation is actually useless, |
1870 |
++ * and we handle this fact by exiting immediately. More |
1871 |
++ * efficient or clearer solutions might possibly be adopted. |
1872 |
++ */ |
1873 |
++ if (entity->weight_counter) |
1874 |
++ return; |
1875 |
++ |
1876 |
++ while (*new) { |
1877 |
++ struct bfq_weight_counter *__counter = container_of(*new, |
1878 |
++ struct bfq_weight_counter, |
1879 |
++ weights_node); |
1880 |
++ parent = *new; |
1881 |
++ |
1882 |
++ if (entity->weight == __counter->weight) { |
1883 |
++ entity->weight_counter = __counter; |
1884 |
++ goto inc_counter; |
1885 |
++ } |
1886 |
++ if (entity->weight < __counter->weight) |
1887 |
++ new = &((*new)->rb_left); |
1888 |
++ else |
1889 |
++ new = &((*new)->rb_right); |
1890 |
++ } |
1891 |
++ |
1892 |
++ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), |
1893 |
++ GFP_ATOMIC); |
1894 |
++ entity->weight_counter->weight = entity->weight; |
1895 |
++ rb_link_node(&entity->weight_counter->weights_node, parent, new); |
1896 |
++ rb_insert_color(&entity->weight_counter->weights_node, root); |
1897 |
++ |
1898 |
++inc_counter: |
1899 |
++ entity->weight_counter->num_active++; |
1900 |
++} |
1901 |
++ |
1902 |
++/* |
1903 |
++ * Decrement the weight counter associated with the entity, and, if the |
1904 |
++ * counter reaches 0, remove the counter from the tree. |
1905 |
++ * See the comments to the function bfq_weights_tree_add() for considerations |
1906 |
++ * about overhead. |
1907 |
++ */ |
1908 |
++static void bfq_weights_tree_remove(struct bfq_data *bfqd, |
1909 |
++ struct bfq_entity *entity, |
1910 |
++ struct rb_root *root) |
1911 |
++{ |
1912 |
++ if (!entity->weight_counter) |
1913 |
++ return; |
1914 |
++ |
1915 |
++ BUG_ON(RB_EMPTY_ROOT(root)); |
1916 |
++ BUG_ON(entity->weight_counter->weight != entity->weight); |
1917 |
++ |
1918 |
++ BUG_ON(!entity->weight_counter->num_active); |
1919 |
++ entity->weight_counter->num_active--; |
1920 |
++ if (entity->weight_counter->num_active > 0) |
1921 |
++ goto reset_entity_pointer; |
1922 |
++ |
1923 |
++ rb_erase(&entity->weight_counter->weights_node, root); |
1924 |
++ kfree(entity->weight_counter); |
1925 |
++ |
1926 |
++reset_entity_pointer: |
1927 |
++ entity->weight_counter = NULL; |
1928 |
++} |
1929 |
++ |
1930 |
++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, |
1931 |
++ struct bfq_queue *bfqq, |
1932 |
++ struct request *last) |
1933 |
++{ |
1934 |
++ struct rb_node *rbnext = rb_next(&last->rb_node); |
1935 |
++ struct rb_node *rbprev = rb_prev(&last->rb_node); |
1936 |
++ struct request *next = NULL, *prev = NULL; |
1937 |
++ |
1938 |
++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); |
1939 |
++ |
1940 |
++ if (rbprev) |
1941 |
++ prev = rb_entry_rq(rbprev); |
1942 |
++ |
1943 |
++ if (rbnext) |
1944 |
++ next = rb_entry_rq(rbnext); |
1945 |
++ else { |
1946 |
++ rbnext = rb_first(&bfqq->sort_list); |
1947 |
++ if (rbnext && rbnext != &last->rb_node) |
1948 |
++ next = rb_entry_rq(rbnext); |
1949 |
++ } |
1950 |
++ |
1951 |
++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); |
1952 |
++} |
1953 |
++ |
1954 |
++/* see the definition of bfq_async_charge_factor for details */ |
1955 |
++static unsigned long bfq_serv_to_charge(struct request *rq, |
1956 |
++ struct bfq_queue *bfqq) |
1957 |
++{ |
1958 |
++ return blk_rq_sectors(rq) * |
1959 |
++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * |
1960 |
++ bfq_async_charge_factor)); |
1961 |
++} |
1962 |
++ |
1963 |
++/** |
1964 |
++ * bfq_updated_next_req - update the queue after a new next_rq selection. |
1965 |
++ * @bfqd: the device data the queue belongs to. |
1966 |
++ * @bfqq: the queue to update. |
1967 |
++ * |
1968 |
++ * If the first request of a queue changes we make sure that the queue |
1969 |
++ * has enough budget to serve at least its first request (if the |
1970 |
++ * request has grown). We do this because if the queue has not enough |
1971 |
++ * budget for its first request, it has to go through two dispatch |
1972 |
++ * rounds to actually get it dispatched. |
1973 |
++ */ |
1974 |
++static void bfq_updated_next_req(struct bfq_data *bfqd, |
1975 |
++ struct bfq_queue *bfqq) |
1976 |
++{ |
1977 |
++ struct bfq_entity *entity = &bfqq->entity; |
1978 |
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
1979 |
++ struct request *next_rq = bfqq->next_rq; |
1980 |
++ unsigned long new_budget; |
1981 |
++ |
1982 |
++ if (!next_rq) |
1983 |
++ return; |
1984 |
++ |
1985 |
++ if (bfqq == bfqd->in_service_queue) |
1986 |
++ /* |
1987 |
++ * In order not to break guarantees, budgets cannot be |
1988 |
++ * changed after an entity has been selected. |
1989 |
++ */ |
1990 |
++ return; |
1991 |
++ |
1992 |
++ BUG_ON(entity->tree != &st->active); |
1993 |
++ BUG_ON(entity == entity->sched_data->in_service_entity); |
1994 |
++ |
1995 |
++ new_budget = max_t(unsigned long, bfqq->max_budget, |
1996 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
1997 |
++ if (entity->budget != new_budget) { |
1998 |
++ entity->budget = new_budget; |
1999 |
++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", |
2000 |
++ new_budget); |
2001 |
++ bfq_activate_bfqq(bfqd, bfqq); |
2002 |
++ } |
2003 |
++} |
2004 |
++ |
2005 |
++static unsigned int bfq_wr_duration(struct bfq_data *bfqd) |
2006 |
++{ |
2007 |
++ u64 dur; |
2008 |
++ |
2009 |
++ if (bfqd->bfq_wr_max_time > 0) |
2010 |
++ return bfqd->bfq_wr_max_time; |
2011 |
++ |
2012 |
++ dur = bfqd->RT_prod; |
2013 |
++ do_div(dur, bfqd->peak_rate); |
2014 |
++ |
2015 |
++ return dur; |
2016 |
++} |
2017 |
++ |
2018 |
++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ |
2019 |
++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
2020 |
++{ |
2021 |
++ struct bfq_queue *item; |
2022 |
++ struct hlist_node *n; |
2023 |
++ |
2024 |
++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) |
2025 |
++ hlist_del_init(&item->burst_list_node); |
2026 |
++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); |
2027 |
++ bfqd->burst_size = 1; |
2028 |
++} |
2029 |
++ |
2030 |
++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ |
2031 |
++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
2032 |
++{ |
2033 |
++ /* Increment burst size to take into account also bfqq */ |
2034 |
++ bfqd->burst_size++; |
2035 |
++ |
2036 |
++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { |
2037 |
++ struct bfq_queue *pos, *bfqq_item; |
2038 |
++ struct hlist_node *n; |
2039 |
++ |
2040 |
++ /* |
2041 |
++ * Enough queues have been activated shortly after each |
2042 |
++ * other to consider this burst as large. |
2043 |
++ */ |
2044 |
++ bfqd->large_burst = true; |
2045 |
++ |
2046 |
++ /* |
2047 |
++ * We can now mark all queues in the burst list as |
2048 |
++ * belonging to a large burst. |
2049 |
++ */ |
2050 |
++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, |
2051 |
++ burst_list_node) |
2052 |
++ bfq_mark_bfqq_in_large_burst(bfqq_item); |
2053 |
++ bfq_mark_bfqq_in_large_burst(bfqq); |
2054 |
++ |
2055 |
++ /* |
2056 |
++ * From now on, and until the current burst finishes, any |
2057 |
++ * new queue being activated shortly after the last queue |
2058 |
++ * was inserted in the burst can be immediately marked as |
2059 |
++ * belonging to a large burst. So the burst list is not |
2060 |
++ * needed any more. Remove it. |
2061 |
++ */ |
2062 |
++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, |
2063 |
++ burst_list_node) |
2064 |
++ hlist_del_init(&pos->burst_list_node); |
2065 |
++ } else /* burst not yet large: add bfqq to the burst list */ |
2066 |
++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); |
2067 |
++} |
2068 |
++ |
2069 |
++/* |
2070 |
++ * If many queues happen to become active shortly after each other, then, |
2071 |
++ * to help the processes associated to these queues get their job done as |
2072 |
++ * soon as possible, it is usually better to not grant either weight-raising |
2073 |
++ * or device idling to these queues. In this comment we describe, firstly, |
2074 |
++ * the reasons why this fact holds, and, secondly, the next function, which |
2075 |
++ * implements the main steps needed to properly mark these queues so that |
2076 |
++ * they can then be treated in a different way. |
2077 |
++ * |
2078 |
++ * As for the terminology, we say that a queue becomes active, i.e., |
2079 |
++ * switches from idle to backlogged, either when it is created (as a |
2080 |
++ * consequence of the arrival of an I/O request), or, if already existing, |
2081 |
++ * when a new request for the queue arrives while the queue is idle. |
2082 |
++ * Bursts of activations, i.e., activations of different queues occurring |
2083 |
++ * shortly after each other, are typically caused by services or applications |
2084 |
++ * that spawn or reactivate many parallel threads/processes. Examples are |
2085 |
++ * systemd during boot or git grep. |
2086 |
++ * |
2087 |
++ * These services or applications benefit mostly from a high throughput: |
2088 |
++ * the quicker the requests of the activated queues are cumulatively served, |
2089 |
++ * the sooner the target job of these queues gets completed. As a consequence, |
2090 |
++ * weight-raising any of these queues, which also implies idling the device |
2091 |
++ * for it, is almost always counterproductive: in most cases it just lowers |
2092 |
++ * throughput. |
2093 |
++ * |
2094 |
++ * On the other hand, a burst of activations may be also caused by the start |
2095 |
++ * of an application that does not consist of a lot of parallel I/O-bound |
2096 |
++ * threads. In fact, with a complex application, the burst may be just a |
2097 |
++ * consequence of the fact that several processes need to be executed to |
2098 |
++ * start-up the application. To start an application as quickly as possible, |
2099 |
++ * the best thing to do is to privilege the I/O related to the application |
2100 |
++ * with respect to all other I/O. Therefore, the best strategy to start as |
2101 |
++ * quickly as possible an application that causes a burst of activations is |
2102 |
++ * to weight-raise all the queues activated during the burst. This is the |
2103 |
++ * exact opposite of the best strategy for the other type of bursts. |
2104 |
++ * |
2105 |
++ * In the end, to take the best action for each of the two cases, the two |
2106 |
++ * types of bursts need to be distinguished. Fortunately, this seems |
2107 |
++ * relatively easy to do, by looking at the sizes of the bursts. In |
2108 |
++ * particular, we found a threshold such that bursts with a larger size |
2109 |
++ * than that threshold are apparently caused only by services or commands |
2110 |
++ * such as systemd or git grep. For brevity, hereafter we simply call these |
2110 |
++ * bursts 'large'. BFQ *does not* weight-raise queues whose activations occur |
2112 |
++ * in a large burst. In addition, for each of these queues BFQ performs or |
2113 |
++ * does not perform idling depending on which choice boosts the throughput |
2114 |
++ * most. The exact choice depends on the device and request pattern at |
2115 |
++ * hand. |
2116 |
++ * |
2117 |
++ * Turning back to the next function, it implements all the steps needed |
2118 |
++ * to detect the occurrence of a large burst and to properly mark all the |
2119 |
++ * queues belonging to it (so that they can then be treated in a different |
2120 |
++ * way). This goal is achieved by maintaining a special "burst list" that |
2121 |
++ * holds, temporarily, the queues that belong to the burst in progress. The |
2122 |
++ * list is then used to mark these queues as belonging to a large burst if |
2123 |
++ * the burst does become large. The main steps are the following. |
2124 |
++ * |
2125 |
++ * . when the very first queue is activated, the queue is inserted into the |
2126 |
++ * list (as it could be the first queue in a possible burst) |
2127 |
++ * |
2128 |
++ * . if the current burst has not yet become large, and a queue Q that does |
2129 |
++ * not yet belong to the burst is activated shortly after the last time |
2130 |
++ * at which a new queue entered the burst list, then the function appends |
2131 |
++ * Q to the burst list |
2132 |
++ * |
2133 |
++ * . if, as a consequence of the previous step, the burst size reaches |
2134 |
++ * the large-burst threshold, then |
2135 |
++ * |
2136 |
++ * . all the queues in the burst list are marked as belonging to a |
2137 |
++ * large burst |
2138 |
++ * |
2139 |
++ * . the burst list is deleted; in fact, the burst list already served |
2140 |
++ * its purpose (keeping temporarily track of the queues in a burst, |
2141 |
++ * so as to be able to mark them as belonging to a large burst in the |
2142 |
++ * previous sub-step), and now is not needed any more |
2143 |
++ * |
2144 |
++ * . the device enters a large-burst mode |
2145 |
++ * |
2146 |
++ * . if a queue Q that does not belong to the burst is activated while |
2147 |
++ * the device is in large-burst mode and shortly after the last time |
2148 |
++ * at which a queue either entered the burst list or was marked as |
2149 |
++ * belonging to the current large burst, then Q is immediately marked |
2150 |
++ * as belonging to a large burst. |
2151 |
++ * |
2152 |
++ * . if a queue Q that does not belong to the burst is activated a while |
2153 |
++ * later than, i.e., not shortly after, the last time at which a queue |
2154 |
++ * either entered the burst list or was marked as belonging to the |
2155 |
++ * current large burst, then the current burst is deemed as finished and: |
2156 |
++ * |
2157 |
++ * . the large-burst mode is reset if set |
2158 |
++ * |
2159 |
++ * . the burst list is emptied |
2160 |
++ * |
2161 |
++ * . Q is inserted in the burst list, as Q may be the first queue |
2162 |
++ * in a possible new burst (then the burst list contains just Q |
2163 |
++ * after this step). |
2164 |
++ */ |
2165 |
++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
2166 |
++ bool idle_for_long_time) |
2167 |
++{ |
2168 |
++ /* |
2169 |
++ * If bfqq happened to be activated in a burst, but has been idle |
2170 |
++ * for at least as long as an interactive queue, then we assume |
2171 |
++ * that, in the overall I/O initiated in the burst, the I/O |
2172 |
++ * associated to bfqq is finished. So bfqq does not need to be |
2173 |
++ * treated as a queue belonging to a burst anymore. Accordingly, |
2174 |
++ * we reset bfqq's in_large_burst flag if set, and remove bfqq |
2175 |
++ * from the burst list if it's there. However, we do not decrement |
2176 |
++ * burst_size, because the fact that bfqq does not need to belong |
2177 |
++ * to the burst list any more does not invalidate the fact that |
2178 |
++ * bfqq may have been activated during the current burst. |
2179 |
++ */ |
2180 |
++ if (idle_for_long_time) { |
2181 |
++ hlist_del_init(&bfqq->burst_list_node); |
2182 |
++ bfq_clear_bfqq_in_large_burst(bfqq); |
2183 |
++ } |
2184 |
++ |
2185 |
++ /* |
2186 |
++ * If bfqq is already in the burst list or is part of a large |
2187 |
++ * burst, then there is nothing else to do. |
2188 |
++ */ |
2189 |
++ if (!hlist_unhashed(&bfqq->burst_list_node) || |
2190 |
++ bfq_bfqq_in_large_burst(bfqq)) |
2191 |
++ return; |
2192 |
++ |
2193 |
++ /* |
2194 |
++ * If bfqq's activation happens late enough, then the current |
2195 |
++ * burst is finished, and related data structures must be reset. |
2196 |
++ * |
2197 |
++ * In this respect, consider the special case where bfqq is the very |
2198 |
++ * first queue being activated. In this case, last_ins_in_burst is |
2199 |
++ * not yet significant when we get here. But it is easy to verify |
2200 |
++ * that, whether or not the following condition is true, bfqq will |
2201 |
++ * end up being inserted into the burst list. In particular the |
2202 |
++ * list will happen to contain only bfqq. And this is exactly what |
2203 |
++ * has to happen, as bfqq may be the first queue in a possible |
2204 |
++ * burst. |
2205 |
++ */ |
2206 |
++ if (time_is_before_jiffies(bfqd->last_ins_in_burst + |
2207 |
++ bfqd->bfq_burst_interval)) { |
2208 |
++ bfqd->large_burst = false; |
2209 |
++ bfq_reset_burst_list(bfqd, bfqq); |
2210 |
++ return; |
2211 |
++ } |
2212 |
++ |
2213 |
++ /* |
2214 |
++ * If we get here, then bfqq is being activated shortly after the |
2215 |
++ * last queue. So, if the current burst is also large, we can mark |
2216 |
++ * bfqq as belonging to this large burst immediately. |
2217 |
++ */ |
2218 |
++ if (bfqd->large_burst) { |
2219 |
++ bfq_mark_bfqq_in_large_burst(bfqq); |
2220 |
++ return; |
2221 |
++ } |
2222 |
++ |
2223 |
++ /* |
2224 |
++ * If we get here, then a large-burst state has not yet been |
2225 |
++ * reached, but bfqq is being activated shortly after the last |
2226 |
++ * queue. Then we add bfqq to the burst. |
2227 |
++ */ |
2228 |
++ bfq_add_to_burst(bfqd, bfqq); |
2229 |
++} |
2230 |
++ |
2231 |
++static void bfq_add_request(struct request *rq) |
2232 |
++{ |
2233 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
2234 |
++ struct bfq_entity *entity = &bfqq->entity; |
2235 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
2236 |
++ struct request *next_rq, *prev; |
2237 |
++ unsigned long old_wr_coeff = bfqq->wr_coeff; |
2238 |
++ bool interactive = false; |
2239 |
++ |
2240 |
++ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); |
2241 |
++ bfqq->queued[rq_is_sync(rq)]++; |
2242 |
++ bfqd->queued++; |
2243 |
++ |
2244 |
++ elv_rb_add(&bfqq->sort_list, rq); |
2245 |
++ |
2246 |
++ /* |
2247 |
++ * Check if this request is a better next-serve candidate. |
2248 |
++ */ |
2249 |
++ prev = bfqq->next_rq; |
2250 |
++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); |
2251 |
++ BUG_ON(!next_rq); |
2252 |
++ bfqq->next_rq = next_rq; |
2253 |
++ |
2254 |
++ if (!bfq_bfqq_busy(bfqq)) { |
2255 |
++ bool soft_rt, in_burst, |
2256 |
++ idle_for_long_time = time_is_before_jiffies( |
2257 |
++ bfqq->budget_timeout + |
2258 |
++ bfqd->bfq_wr_min_idle_time); |
2259 |
++ |
2260 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
2261 |
++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, |
2262 |
++ rq->cmd_flags); |
2263 |
++#endif |
2264 |
++ if (bfq_bfqq_sync(bfqq)) { |
2265 |
++ bool already_in_burst = |
2266 |
++ !hlist_unhashed(&bfqq->burst_list_node) || |
2267 |
++ bfq_bfqq_in_large_burst(bfqq); |
2268 |
++ bfq_handle_burst(bfqd, bfqq, idle_for_long_time); |
2269 |
++ /* |
2270 |
++ * If bfqq was not already in the current burst, |
2271 |
++ * then, at this point, bfqq either has been |
2272 |
++ * added to the current burst or has caused the |
2273 |
++ * current burst to terminate. In particular, in |
2274 |
++ * the second case, bfqq has become the first |
2275 |
++ * queue in a possible new burst. |
2276 |
++ * In both cases last_ins_in_burst needs to be |
2277 |
++ * moved forward. |
2278 |
++ */ |
2279 |
++ if (!already_in_burst) |
2280 |
++ bfqd->last_ins_in_burst = jiffies; |
2281 |
++ } |
2282 |
++ |
2283 |
++ in_burst = bfq_bfqq_in_large_burst(bfqq); |
2284 |
++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && |
2285 |
++ !in_burst && |
2286 |
++ time_is_before_jiffies(bfqq->soft_rt_next_start); |
2287 |
++ interactive = !in_burst && idle_for_long_time; |
2288 |
++ entity->budget = max_t(unsigned long, bfqq->max_budget, |
2289 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
2290 |
++ |
2291 |
++ if (!bfq_bfqq_IO_bound(bfqq)) { |
2292 |
++ if (time_before(jiffies, |
2293 |
++ RQ_BIC(rq)->ttime.last_end_request + |
2294 |
++ bfqd->bfq_slice_idle)) { |
2295 |
++ bfqq->requests_within_timer++; |
2296 |
++ if (bfqq->requests_within_timer >= |
2297 |
++ bfqd->bfq_requests_within_timer) |
2298 |
++ bfq_mark_bfqq_IO_bound(bfqq); |
2299 |
++ } else |
2300 |
++ bfqq->requests_within_timer = 0; |
2301 |
++ } |
2302 |
++ |
2303 |
++ if (!bfqd->low_latency) |
2304 |
++ goto add_bfqq_busy; |
2305 |
++ |
2306 |
++ /* |
2307 |
++ * If the queue: |
2308 |
++ * - is not being boosted, |
2309 |
++ * - has been idle for enough time, |
2310 |
++ * - is not a sync queue or is linked to a bfq_io_cq (it is |
2311 |
++ * shared "for its nature" or it is not shared and its |
2312 |
++ * requests have not been redirected to a shared queue) |
2313 |
++ * start a weight-raising period. |
2314 |
++ */ |
2315 |
++ if (old_wr_coeff == 1 && (interactive || soft_rt) && |
2316 |
++ (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { |
2317 |
++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; |
2318 |
++ if (interactive) |
2319 |
++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
2320 |
++ else |
2321 |
++ bfqq->wr_cur_max_time = |
2322 |
++ bfqd->bfq_wr_rt_max_time; |
2323 |
++ bfq_log_bfqq(bfqd, bfqq, |
2324 |
++ "wrais starting at %lu, rais_max_time %u", |
2325 |
++ jiffies, |
2326 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time)); |
2327 |
++ } else if (old_wr_coeff > 1) { |
2328 |
++ if (interactive) |
2329 |
++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
2330 |
++ else if (in_burst || |
2331 |
++ (bfqq->wr_cur_max_time == |
2332 |
++ bfqd->bfq_wr_rt_max_time && |
2333 |
++ !soft_rt)) { |
2334 |
++ bfqq->wr_coeff = 1; |
2335 |
++ bfq_log_bfqq(bfqd, bfqq, |
2336 |
++ "wrais ending at %lu, rais_max_time %u", |
2337 |
++ jiffies, |
2338 |
++ jiffies_to_msecs(bfqq-> |
2339 |
++ wr_cur_max_time)); |
2340 |
++ } else if (time_before( |
2341 |
++ bfqq->last_wr_start_finish + |
2342 |
++ bfqq->wr_cur_max_time, |
2343 |
++ jiffies + |
2344 |
++ bfqd->bfq_wr_rt_max_time) && |
2345 |
++ soft_rt) { |
2346 |
++ /* |
2347 |
++ * |
2348 |
++ * The remaining weight-raising time is lower |
2349 |
++ * than bfqd->bfq_wr_rt_max_time, which means |
2350 |
++ * that the application is enjoying weight |
2351 |
++ * raising either because it was deemed soft-rt |
2351 |
++ * in the recent past, or because it was deemed |
2352 |
++ * interactive long ago. |
2354 |
++ * In both cases, resetting now the current |
2355 |
++ * remaining weight-raising time for the |
2356 |
++ * application to the weight-raising duration |
2357 |
++ * for soft rt applications would not cause any |
2358 |
++ * latency increase for the application (as the |
2359 |
++ * new duration would be higher than the |
2360 |
++ * remaining time). |
2361 |
++ * |
2362 |
++ * In addition, the application is now meeting |
2363 |
++ * the requirements for being deemed soft rt. |
2364 |
++ * In the end we can correctly and safely |
2365 |
++ * (re)charge the weight-raising duration for |
2366 |
++ * the application with the weight-raising |
2367 |
++ * duration for soft rt applications. |
2368 |
++ * |
2369 |
++ * In particular, doing this recharge now, i.e., |
2370 |
++ * before the weight-raising period for the |
2371 |
++ * application finishes, reduces the probability |
2372 |
++ * of the following negative scenario: |
2373 |
++ * 1) the weight of a soft rt application is |
2374 |
++ * raised at startup (as for any newly |
2375 |
++ * created application), |
2376 |
++ * 2) since the application is not interactive, |
2377 |
++ * at a certain time weight-raising is |
2378 |
++ * stopped for the application, |
2379 |
++ * 3) at that time the application happens to |
2380 |
++ * still have pending requests, and hence |
2381 |
++ * is destined to not have a chance to be |
2382 |
++ * deemed soft rt before these requests are |
2383 |
++ * completed (see the comments to the |
2384 |
++ * function bfq_bfqq_softrt_next_start() |
2385 |
++ * for details on soft rt detection), |
2386 |
++ * 4) these pending requests experience a high |
2387 |
++ * latency because the application is not |
2388 |
++ * weight-raised while they are pending. |
2389 |
++ */ |
2390 |
++ bfqq->last_wr_start_finish = jiffies; |
2391 |
++ bfqq->wr_cur_max_time = |
2392 |
++ bfqd->bfq_wr_rt_max_time; |
2393 |
++ } |
2394 |
++ } |
2395 |
++ if (old_wr_coeff != bfqq->wr_coeff) |
2396 |
++ entity->prio_changed = 1; |
2397 |
++add_bfqq_busy: |
2398 |
++ bfqq->last_idle_bklogged = jiffies; |
2399 |
++ bfqq->service_from_backlogged = 0; |
2400 |
++ bfq_clear_bfqq_softrt_update(bfqq); |
2401 |
++ bfq_add_bfqq_busy(bfqd, bfqq); |
2402 |
++ } else { |
2403 |
++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && |
2404 |
++ time_is_before_jiffies( |
2405 |
++ bfqq->last_wr_start_finish + |
2406 |
++ bfqd->bfq_wr_min_inter_arr_async)) { |
2407 |
++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; |
2408 |
++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
2409 |
++ |
2410 |
++ bfqd->wr_busy_queues++; |
2411 |
++ entity->prio_changed = 1; |
2412 |
++ bfq_log_bfqq(bfqd, bfqq, |
2413 |
++ "non-idle wrais starting at %lu, rais_max_time %u", |
2414 |
++ jiffies, |
2415 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time)); |
2416 |
++ } |
2417 |
++ if (prev != bfqq->next_rq) |
2418 |
++ bfq_updated_next_req(bfqd, bfqq); |
2419 |
++ } |
2420 |
++ |
2421 |
++ if (bfqd->low_latency && |
2422 |
++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) |
2423 |
++ bfqq->last_wr_start_finish = jiffies; |
2424 |
++} |
2425 |
++ |
2426 |
++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, |
2427 |
++ struct bio *bio) |
2428 |
++{ |
2429 |
++ struct task_struct *tsk = current; |
2430 |
++ struct bfq_io_cq *bic; |
2431 |
++ struct bfq_queue *bfqq; |
2432 |
++ |
2433 |
++ bic = bfq_bic_lookup(bfqd, tsk->io_context); |
2434 |
++ if (!bic) |
2435 |
++ return NULL; |
2436 |
++ |
2437 |
++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
2438 |
++ if (bfqq) |
2439 |
++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); |
2440 |
++ |
2441 |
++ return NULL; |
2442 |
++} |
2443 |
++ |
2444 |
++static void bfq_activate_request(struct request_queue *q, struct request *rq) |
2445 |
++{ |
2446 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
2447 |
++ |
2448 |
++ bfqd->rq_in_driver++; |
2449 |
++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); |
2450 |
++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", |
2451 |
++ (long long unsigned)bfqd->last_position); |
2452 |
++} |
2453 |
++ |
2454 |
++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) |
2455 |
++{ |
2456 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
2457 |
++ |
2458 |
++ BUG_ON(bfqd->rq_in_driver == 0); |
2459 |
++ bfqd->rq_in_driver--; |
2460 |
++} |
2461 |
++ |
2462 |
++static void bfq_remove_request(struct request *rq) |
2463 |
++{ |
2464 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
2465 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
2466 |
++ const int sync = rq_is_sync(rq); |
2467 |
++ |
2468 |
++ if (bfqq->next_rq == rq) { |
2469 |
++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); |
2470 |
++ bfq_updated_next_req(bfqd, bfqq); |
2471 |
++ } |
2472 |
++ |
2473 |
++ if (rq->queuelist.prev != &rq->queuelist) |
2474 |
++ list_del_init(&rq->queuelist); |
2475 |
++ BUG_ON(bfqq->queued[sync] == 0); |
2476 |
++ bfqq->queued[sync]--; |
2477 |
++ bfqd->queued--; |
2478 |
++ elv_rb_del(&bfqq->sort_list, rq); |
2479 |
++ |
2480 |
++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
2481 |
++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) |
2482 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
2483 |
++ /* |
2484 |
++ * Remove queue from request-position tree as it is empty. |
2485 |
++ */ |
2486 |
++ if (bfqq->pos_root) { |
2487 |
++ rb_erase(&bfqq->pos_node, bfqq->pos_root); |
2488 |
++ bfqq->pos_root = NULL; |
2489 |
++ } |
2490 |
++ } |
2491 |
++ |
2492 |
++ if (rq->cmd_flags & REQ_META) { |
2493 |
++ BUG_ON(bfqq->meta_pending == 0); |
2494 |
++ bfqq->meta_pending--; |
2495 |
++ } |
2496 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
2497 |
++ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); |
2498 |
++#endif |
2499 |
++} |
2500 |
++ |
2501 |
++static int bfq_merge(struct request_queue *q, struct request **req, |
2502 |
++ struct bio *bio) |
2503 |
++{ |
2504 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
2505 |
++ struct request *__rq; |
2506 |
++ |
2507 |
++ __rq = bfq_find_rq_fmerge(bfqd, bio); |
2508 |
++ if (__rq && elv_rq_merge_ok(__rq, bio)) { |
2509 |
++ *req = __rq; |
2510 |
++ return ELEVATOR_FRONT_MERGE; |
2511 |
++ } |
2512 |
++ |
2513 |
++ return ELEVATOR_NO_MERGE; |
2514 |
++} |
2515 |
++ |
2516 |
++static void bfq_merged_request(struct request_queue *q, struct request *req, |
2517 |
++ int type) |
2518 |
++{ |
2519 |
++ if (type == ELEVATOR_FRONT_MERGE && |
2520 |
++ rb_prev(&req->rb_node) && |
2521 |
++ blk_rq_pos(req) < |
2522 |
++ blk_rq_pos(container_of(rb_prev(&req->rb_node), |
2523 |
++ struct request, rb_node))) { |
2524 |
++ struct bfq_queue *bfqq = RQ_BFQQ(req); |
2525 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
2526 |
++ struct request *prev, *next_rq; |
2527 |
++ |
2528 |
++ /* Reposition request in its sort_list */ |
2529 |
++ elv_rb_del(&bfqq->sort_list, req); |
2530 |
++ elv_rb_add(&bfqq->sort_list, req); |
2531 |
++ /* Choose next request to be served for bfqq */ |
2532 |
++ prev = bfqq->next_rq; |
2533 |
++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, |
2534 |
++ bfqd->last_position); |
2535 |
++ BUG_ON(!next_rq); |
2536 |
++ bfqq->next_rq = next_rq; |
2537 |
++ } |
2538 |
++} |
2539 |
++ |
2540 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
2541 |
++static void bfq_bio_merged(struct request_queue *q, struct request *req, |
2542 |
++ struct bio *bio) |
2543 |
++{ |
2544 |
++ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); |
2545 |
++} |
2546 |
++#endif |
2547 |
++ |
2548 |
++static void bfq_merged_requests(struct request_queue *q, struct request *rq, |
2549 |
++ struct request *next) |
2550 |
++{ |
2551 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); |
2552 |
++ |
2553 |
++ /* |
2554 |
++ * If next and rq belong to the same bfq_queue and next is older |
2555 |
++ * than rq, then reposition rq in the fifo (by substituting next |
2556 |
++ * with rq). Otherwise, if next and rq belong to different |
2557 |
++ * bfq_queues, never reposition rq: in fact, we would have to |
2558 |
++ * reposition it with respect to next's position in its own fifo, |
2559 |
++ * which would most certainly be too expensive with respect to |
2560 |
++ * the benefits. |
2561 |
++ */ |
2562 |
++ if (bfqq == next_bfqq && |
2563 |
++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && |
2564 |
++ time_before(next->fifo_time, rq->fifo_time)) { |
2565 |
++ list_del_init(&rq->queuelist); |
2566 |
++ list_replace_init(&next->queuelist, &rq->queuelist); |
2567 |
++ rq->fifo_time = next->fifo_time; |
2568 |
++ } |
2569 |
++ |
2570 |
++ if (bfqq->next_rq == next) |
2571 |
++ bfqq->next_rq = rq; |
2572 |
++ |
2573 |
++ bfq_remove_request(next); |
2574 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
2575 |
++ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); |
2576 |
++#endif |
2577 |
++} |
2578 |
++ |
2579 |
++/* Must be called with bfqq != NULL */ |
2580 |
++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) |
2581 |
++{ |
2582 |
++ BUG_ON(!bfqq); |
2583 |
++ if (bfq_bfqq_busy(bfqq)) |
2584 |
++ bfqq->bfqd->wr_busy_queues--; |
2585 |
++ bfqq->wr_coeff = 1; |
2586 |
++ bfqq->wr_cur_max_time = 0; |
2587 |
++ /* Trigger a weight change on the next activation of the queue */ |
2588 |
++ bfqq->entity.prio_changed = 1; |
2589 |
++} |
2590 |
++ |
2591 |
++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, |
2592 |
++ struct bfq_group *bfqg) |
2593 |
++{ |
2594 |
++ int i, j; |
2595 |
++ |
2596 |
++ for (i = 0; i < 2; i++) |
2597 |
++ for (j = 0; j < IOPRIO_BE_NR; j++) |
2598 |
++ if (bfqg->async_bfqq[i][j]) |
2599 |
++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); |
2600 |
++ if (bfqg->async_idle_bfqq) |
2601 |
++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); |
2602 |
++} |
2603 |
++ |
2604 |
++static void bfq_end_wr(struct bfq_data *bfqd) |
2605 |
++{ |
2606 |
++ struct bfq_queue *bfqq; |
2607 |
++ |
2608 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
2609 |
++ |
2610 |
++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) |
2611 |
++ bfq_bfqq_end_wr(bfqq); |
2612 |
++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) |
2613 |
++ bfq_bfqq_end_wr(bfqq); |
2614 |
++ bfq_end_wr_async(bfqd); |
2615 |
++ |
2616 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
2617 |
++} |
2618 |
++ |
2619 |
++static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
2620 |
++ struct bio *bio) |
2621 |
++{ |
2622 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
2623 |
++ struct bfq_io_cq *bic; |
2624 |
++ |
2625 |
++ /* |
2626 |
++ * Disallow merge of a sync bio into an async request. |
2627 |
++ */ |
2628 |
++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
2629 |
++ return 0; |
2630 |
++ |
2631 |
++ /* |
2632 |
++ * Lookup the bfqq that this bio will be queued with. Allow |
2633 |
++ * merge only if rq is queued there. |
2634 |
++ * Queue lock is held here. |
2635 |
++ */ |
2636 |
++ bic = bfq_bic_lookup(bfqd, current->io_context); |
2637 |
++ if (!bic) |
2638 |
++ return 0; |
2639 |
++ |
2640 |
++ return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); |
2641 |
++} |
2642 |
++ |
2643 |
++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
2644 |
++ struct bfq_queue *bfqq) |
2645 |
++{ |
2646 |
++ if (bfqq) { |
2647 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
2648 |
++ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); |
2649 |
++#endif |
2650 |
++ bfq_mark_bfqq_must_alloc(bfqq); |
2651 |
++ bfq_mark_bfqq_budget_new(bfqq); |
2652 |
++ bfq_clear_bfqq_fifo_expire(bfqq); |
2653 |
++ |
2654 |
++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
2655 |
++ |
2656 |
++ bfq_log_bfqq(bfqd, bfqq, |
2657 |
++ "set_in_service_queue, cur-budget = %d", |
2658 |
++ bfqq->entity.budget); |
2659 |
++ } |
2660 |
++ |
2661 |
++ bfqd->in_service_queue = bfqq; |
2662 |
++} |
2663 |
++ |
2664 |
++/* |
2665 |
++ * Get and set a new queue for service. |
2666 |
++ */ |
2667 |
++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) |
2668 |
++{ |
2669 |
++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); |
2670 |
++ |
2671 |
++ __bfq_set_in_service_queue(bfqd, bfqq); |
2672 |
++ return bfqq; |
2673 |
++} |
2674 |
++ |
2675 |
++/* |
2676 |
++ * If enough samples have been computed, return the current max budget |
2677 |
++ * stored in bfqd, which is dynamically updated according to the |
2678 |
++ * estimated disk peak rate; otherwise return the default max budget |
2679 |
++ */ |
2680 |
++static int bfq_max_budget(struct bfq_data *bfqd) |
2681 |
++{ |
2682 |
++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) |
2683 |
++ return bfq_default_max_budget; |
2684 |
++ else |
2685 |
++ return bfqd->bfq_max_budget; |
2686 |
++} |
2687 |
++ |
2688 |
++/* |
2689 |
++ * Return min budget, which is a fraction of the current or default |
2690 |
++ * max budget (trying with 1/32) |
2691 |
++ */ |
2692 |
++static int bfq_min_budget(struct bfq_data *bfqd) |
2693 |
++{ |
2694 |
++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) |
2695 |
++ return bfq_default_max_budget / 32; |
2696 |
++ else |
2697 |
++ return bfqd->bfq_max_budget / 32; |
2698 |
++} |
2699 |
++ |
2700 |
++static void bfq_arm_slice_timer(struct bfq_data *bfqd) |
2701 |
++{ |
2702 |
++ struct bfq_queue *bfqq = bfqd->in_service_queue; |
2703 |
++ struct bfq_io_cq *bic; |
2704 |
++ unsigned long sl; |
2705 |
++ |
2706 |
++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
2707 |
++ |
2708 |
++ /* Processes have exited, don't wait. */ |
2709 |
++ bic = bfqd->in_service_bic; |
2710 |
++ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) |
2711 |
++ return; |
2712 |
++ |
2713 |
++ bfq_mark_bfqq_wait_request(bfqq); |
2714 |
++ |
2715 |
++ /* |
2716 |
++ * We don't want to idle for seeks, but we do want to allow |
2717 |
++ * fair distribution of slice time for a process doing back-to-back |
2718 |
++ * seeks. So allow a little bit of time for it to submit a new rq. |
2719 |
++ * |
2720 |
++ * To prevent processes with (partly) seeky workloads from |
2721 |
++ * being too ill-treated, grant them a small fraction of the |
2722 |
++ * assigned budget before reducing the waiting time to |
2723 |
++ * BFQ_MIN_TT. This happened to help reduce latency. |
2724 |
++ */ |
2725 |
++ sl = bfqd->bfq_slice_idle; |
2726 |
++ /* |
2727 |
++ * Unless the queue is being weight-raised or the scenario is |
2728 |
++ * asymmetric, grant only minimum idle time if the queue either |
2729 |
++ * has been seeky for long enough or has already proved to be |
2730 |
++ * constantly seeky. |
2731 |
++ */ |
2732 |
++ if (bfq_sample_valid(bfqq->seek_samples) && |
2733 |
++ ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > |
2734 |
++ bfq_max_budget(bfqq->bfqd) / 8) || |
2735 |
++ bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && |
2736 |
++ bfq_symmetric_scenario(bfqd)) |
2737 |
++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); |
2738 |
++ else if (bfqq->wr_coeff > 1) |
2739 |
++ sl = sl * 3; |
2740 |
++ bfqd->last_idling_start = ktime_get(); |
2741 |
++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); |
2742 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
2743 |
++ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); |
2744 |
++#endif |
2745 |
++ bfq_log(bfqd, "arm idle: %u/%u ms", |
2746 |
++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); |
2747 |
++} |
2748 |
++ |
2749 |
++/* |
2750 |
++ * Set the maximum time for the in-service queue to consume its |
2751 |
++ * budget. This prevents seeky processes from lowering the disk |
2752 |
++ * throughput (always guaranteed with a time slice scheme as in CFQ). |
2753 |
++ */ |
2754 |
++static void bfq_set_budget_timeout(struct bfq_data *bfqd) |
2755 |
++{ |
2756 |
++ struct bfq_queue *bfqq = bfqd->in_service_queue; |
2757 |
++ unsigned int timeout_coeff; |
2758 |
++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) |
2759 |
++ timeout_coeff = 1; |
2760 |
++ else |
2761 |
++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; |
2762 |
++ |
2763 |
++ bfqd->last_budget_start = ktime_get(); |
2764 |
++ |
2765 |
++ bfq_clear_bfqq_budget_new(bfqq); |
2766 |
++ bfqq->budget_timeout = jiffies + |
2767 |
++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; |
2768 |
++ |
2769 |
++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", |
2770 |
++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * |
2771 |
++ timeout_coeff)); |
2772 |
++} |
2773 |
++ |
2774 |
++/* |
2775 |
++ * Move request from internal lists to the request queue dispatch list. |
2776 |
++ */ |
2777 |
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) |
2778 |
++{ |
2779 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
2780 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
2781 |
++ |
2782 |
++ /* |
2783 |
++ * For consistency, the next instruction should have been executed |
2784 |
++ * after removing the request from the queue and dispatching it. |
2785 |
++ * We execute instead this instruction before bfq_remove_request() |
2786 |
++ * (and hence introduce a temporary inconsistency), for efficiency. |
2787 |
++ * In fact, in a forced_dispatch, this prevents two counters related |
2788 |
++ * to bfqq->dispatched from being uselessly decremented if bfqq |
2788 |
++ * is not in service, and then incremented again after |
2790 |
++ * incrementing bfqq->dispatched. |
2791 |
++ */ |
2792 |
++ bfqq->dispatched++; |
2793 |
++ bfq_remove_request(rq); |
2794 |
++ elv_dispatch_sort(q, rq); |
2795 |
++ |
2796 |
++ if (bfq_bfqq_sync(bfqq)) |
2797 |
++ bfqd->sync_flight++; |
2798 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
2799 |
++ bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), |
2800 |
++ rq->cmd_flags); |
2801 |
++#endif |
2802 |
++} |
2803 |
++ |
2804 |
++/* |
2805 |
++ * Return expired entry, or NULL to just start from scratch in rbtree. |
2806 |
++ */ |
2807 |
++static struct request *bfq_check_fifo(struct bfq_queue *bfqq) |
2808 |
++{ |
2809 |
++ struct request *rq = NULL; |
2810 |
++ |
2811 |
++ if (bfq_bfqq_fifo_expire(bfqq)) |
2812 |
++ return NULL; |
2813 |
++ |
2814 |
++ bfq_mark_bfqq_fifo_expire(bfqq); |
2815 |
++ |
2816 |
++ if (list_empty(&bfqq->fifo)) |
2817 |
++ return NULL; |
2818 |
++ |
2819 |
++ rq = rq_entry_fifo(bfqq->fifo.next); |
2820 |
++ |
2821 |
++ if (time_before(jiffies, rq->fifo_time)) |
2822 |
++ return NULL; |
2823 |
++ |
2824 |
++ return rq; |
2825 |
++} |
2826 |
++ |
2827 |
++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
2828 |
++{ |
2829 |
++ struct bfq_entity *entity = &bfqq->entity; |
2830 |
++ return entity->budget - entity->service; |
2831 |
++} |
2832 |
++ |
2833 |
++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
2834 |
++{ |
2835 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
2836 |
++ |
2837 |
++ __bfq_bfqd_reset_in_service(bfqd); |
2838 |
++ |
2839 |
++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
2840 |
++ /* |
2841 |
++ * Overloading budget_timeout field to store the time |
2842 |
++ * at which the queue remains with no backlog; used by |
2843 |
++ * the weight-raising mechanism. |
2844 |
++ */ |
2845 |
++ bfqq->budget_timeout = jiffies; |
2846 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
2847 |
++ } else |
2848 |
++ bfq_activate_bfqq(bfqd, bfqq); |
2849 |
++} |
2850 |
++ |
2851 |
++/** |
2852 |
++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. |
2853 |
++ * @bfqd: device data. |
2854 |
++ * @bfqq: queue to update. |
2855 |
++ * @reason: reason for expiration. |
2856 |
++ * |
2857 |
++ * Handle the feedback on @bfqq budget at queue expiration. |
2858 |
++ * See the body for detailed comments. |
2859 |
++ */ |
2860 |
++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
2861 |
++ struct bfq_queue *bfqq, |
2862 |
++ enum bfqq_expiration reason) |
2863 |
++{ |
2864 |
++ struct request *next_rq; |
2865 |
++ int budget, min_budget; |
2866 |
++ |
2867 |
++ budget = bfqq->max_budget; |
2868 |
++ min_budget = bfq_min_budget(bfqd); |
2869 |
++ |
2870 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
2871 |
++ |
2872 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", |
2873 |
++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); |
2874 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", |
2875 |
++ budget, bfq_min_budget(bfqd)); |
2876 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", |
2877 |
++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); |
2878 |
++ |
2879 |
++ if (bfq_bfqq_sync(bfqq)) { |
2880 |
++ switch (reason) { |
2881 |
++ /* |
2882 |
++ * Caveat: in all the following cases we trade latency |
2883 |
++ * for throughput. |
2884 |
++ */ |
2885 |
++ case BFQ_BFQQ_TOO_IDLE: |
2886 |
++ /* |
2887 |
++ * This is the only case where we may reduce |
2888 |
++ * the budget: if there is no request of the |
2889 |
++ * process still waiting for completion, then |
2890 |
++ * we assume (tentatively) that the timer has |
2891 |
++ * expired because the batch of requests of |
2892 |
++ * the process could have been served with a |
2893 |
++ * smaller budget. Hence, betting that |
2894 |
++ * the process will behave in the same way when it |
2895 |
++ * becomes backlogged again, we reduce its |
2896 |
++ * next budget. As long as we guess right, |
2897 |
++ * this budget cut reduces the latency |
2898 |
++ * experienced by the process. |
2899 |
++ * |
2900 |
++ * However, if there are still outstanding |
2901 |
++ * requests, then the process may not yet have |
2902 |
++ * issued its next request just because it is |
2903 |
++ * still waiting for the completion of some of |
2904 |
++ * the still outstanding ones. So in this |
2905 |
++ * subcase we do not reduce its budget, on the |
2906 |
++ * contrary we increase it to possibly boost |
2907 |
++ * the throughput, as discussed in the |
2908 |
++ * comments to the BUDGET_TIMEOUT case. |
2909 |
++ */ |
2910 |
++ if (bfqq->dispatched > 0) /* still outstanding reqs */ |
2911 |
++ budget = min(budget * 2, bfqd->bfq_max_budget); |
2912 |
++ else { |
2913 |
++ if (budget > 5 * min_budget) |
2914 |
++ budget -= 4 * min_budget; |
2915 |
++ else |
2916 |
++ budget = min_budget; |
2917 |
++ } |
2918 |
++ break; |
2919 |
++ case BFQ_BFQQ_BUDGET_TIMEOUT: |
2920 |
++ /* |
2921 |
++ * We double the budget here because: 1) it |
2922 |
++ * gives the chance to boost the throughput if |
2923 |
++ * this is not a seeky process (which may have |
2924 |
++ * bumped into this timeout because of, e.g., |
2925 |
++ * ZBR), 2) together with charge_full_budget |
2926 |
++ * it helps give seeky processes higher |
2927 |
++ * timestamps, and hence be served less |
2928 |
++ * frequently. |
2929 |
++ */ |
2930 |
++ budget = min(budget * 2, bfqd->bfq_max_budget); |
2931 |
++ break; |
2932 |
++ case BFQ_BFQQ_BUDGET_EXHAUSTED: |
2933 |
++ /* |
2934 |
++ * The process still has backlog, and did not |
2935 |
++ * let either the budget timeout or the disk |
2936 |
++ * idling timeout expire. Hence it is not |
2937 |
++ * seeky, has a short thinktime and may be |
2938 |
++ * happy with a higher budget too. So |
2939 |
++ * definitely increase the budget of this good |
2940 |
++ * candidate to boost the disk throughput. |
2941 |
++ */ |
2942 |
++ budget = min(budget * 4, bfqd->bfq_max_budget); |
2943 |
++ break; |
2944 |
++ case BFQ_BFQQ_NO_MORE_REQUESTS: |
2945 |
++ /* |
2946 |
++ * Leave the budget unchanged. |
2947 |
++ */ |
2948 |
++ default: |
2949 |
++ return; |
2950 |
++ } |
2951 |
++ } else |
2952 |
++ /* |
2953 |
++ * Async queues always get the maximum possible budget |
2954 |
++ * (their ability to dispatch is limited by |
2955 |
++ * @bfqd->bfq_max_budget_async_rq). |
2956 |
++ */ |
2957 |
++ budget = bfqd->bfq_max_budget; |
2958 |
++ |
2959 |
++ bfqq->max_budget = budget; |
2960 |
++ |
2961 |
++ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && |
2962 |
++ !bfqd->bfq_user_max_budget) |
2963 |
++ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); |
2964 |
++ |
2965 |
++ /* |
2966 |
++ * Make sure that we have enough budget for the next request. |
2967 |
++ * Since the finish time of the bfqq must be kept in sync with |
2968 |
++ * the budget, be sure to call __bfq_bfqq_expire() after the |
2969 |
++ * update. |
2970 |
++ */ |
2971 |
++ next_rq = bfqq->next_rq; |
2972 |
++ if (next_rq) |
2973 |
++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, |
2974 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
2975 |
++ else |
2976 |
++ bfqq->entity.budget = bfqq->max_budget; |
2977 |
++ |
2978 |
++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", |
2979 |
++ next_rq ? blk_rq_sectors(next_rq) : 0, |
2980 |
++ bfqq->entity.budget); |
2981 |
++} |
2982 |
++ |
2983 |
++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) |
2984 |
++{ |
2985 |
++ unsigned long max_budget; |
2986 |
++ |
2987 |
++ /* |
2988 |
++ * The max_budget calculated when autotuning is equal to the |
2989 |
++ * amount of sectors transferred in timeout_sync at the |
2990 |
++ * estimated peak rate. |
2991 |
++ */ |
2992 |
++ max_budget = (unsigned long)(peak_rate * 1000 * |
2993 |
++ timeout >> BFQ_RATE_SHIFT); |
2994 |
++ |
2995 |
++ return max_budget; |
2996 |
++} |
2997 |
++ |
2998 |
++/* |
2999 |
++ * In addition to updating the peak rate, checks whether the process |
3000 |
++ * is "slow", and returns 1 if so. This slow flag is used, in addition |
3001 |
++ * to the budget timeout, to reduce the amount of service provided to |
3002 |
++ * seeky processes, and hence reduce their chances to lower the |
3003 |
++ * throughput. See the code for more details. |
3004 |
++ */ |
3005 |
++static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
3006 |
++ bool compensate, enum bfqq_expiration reason) |
3007 |
++{ |
3008 |
++ u64 bw, usecs, expected, timeout; |
3009 |
++ ktime_t delta; |
3010 |
++ int update = 0; |
3011 |
++ |
3012 |
++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) |
3013 |
++ return false; |
3014 |
++ |
3015 |
++ if (compensate) |
3016 |
++ delta = bfqd->last_idling_start; |
3017 |
++ else |
3018 |
++ delta = ktime_get(); |
3019 |
++ delta = ktime_sub(delta, bfqd->last_budget_start); |
3020 |
++ usecs = ktime_to_us(delta); |
3021 |
++ |
3022 |
++ /* Don't trust short/unrealistic values. */ |
3023 |
++ if (usecs < 100 || usecs >= LONG_MAX) |
3024 |
++ return false; |
3025 |
++ |
3026 |
++ /* |
3027 |
++ * Calculate the bandwidth for the last slice. We use a 64 bit |
3028 |
++ * value to store the peak rate, in sectors per usec in fixed |
3029 |
++ * point math. We do so to have enough precision in the estimate |
3030 |
++ * and to avoid overflows. |
3031 |
++ */ |
3032 |
++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; |
3033 |
++ do_div(bw, (unsigned long)usecs); |
3034 |
++ |
3035 |
++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
3036 |
++ |
3037 |
++ /* |
3038 |
++ * Use only long (> 20ms) intervals to filter out spikes for |
3039 |
++ * the peak rate estimation. |
3040 |
++ */ |
3041 |
++ if (usecs > 20000) { |
3042 |
++ if (bw > bfqd->peak_rate || |
3043 |
++ (!BFQQ_SEEKY(bfqq) && |
3044 |
++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { |
3045 |
++ bfq_log(bfqd, "measured bw =%llu", bw); |
3046 |
++ /* |
3047 |
++ * To smooth oscillations use a low-pass filter with |
3048 |
++ * alpha=7/8, i.e., |
3049 |
++ * new_rate = (7/8) * old_rate + (1/8) * bw |
3050 |
++ */ |
3051 |
++ do_div(bw, 8); |
3052 |
++ if (bw == 0) |
3053 |
++ return 0; |
3054 |
++ bfqd->peak_rate *= 7; |
3055 |
++ do_div(bfqd->peak_rate, 8); |
3056 |
++ bfqd->peak_rate += bw; |
3057 |
++ update = 1; |
3058 |
++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); |
3059 |
++ } |
3060 |
++ |
3061 |
++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; |
3062 |
++ |
3063 |
++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) |
3064 |
++ bfqd->peak_rate_samples++; |
3065 |
++ |
3066 |
++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && |
3067 |
++ update) { |
3068 |
++ int dev_type = blk_queue_nonrot(bfqd->queue); |
3069 |
++ if (bfqd->bfq_user_max_budget == 0) { |
3070 |
++ bfqd->bfq_max_budget = |
3071 |
++ bfq_calc_max_budget(bfqd->peak_rate, |
3072 |
++ timeout); |
3073 |
++ bfq_log(bfqd, "new max_budget=%d", |
3074 |
++ bfqd->bfq_max_budget); |
3075 |
++ } |
3076 |
++ if (bfqd->device_speed == BFQ_BFQD_FAST && |
3077 |
++ bfqd->peak_rate < device_speed_thresh[dev_type]) { |
3078 |
++ bfqd->device_speed = BFQ_BFQD_SLOW; |
3079 |
++ bfqd->RT_prod = R_slow[dev_type] * |
3080 |
++ T_slow[dev_type]; |
3081 |
++ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && |
3082 |
++ bfqd->peak_rate > device_speed_thresh[dev_type]) { |
3083 |
++ bfqd->device_speed = BFQ_BFQD_FAST; |
3084 |
++ bfqd->RT_prod = R_fast[dev_type] * |
3085 |
++ T_fast[dev_type]; |
3086 |
++ } |
3087 |
++ } |
3088 |
++ } |
3089 |
++ |
3090 |
++ /* |
3091 |
++ * If the process has been served for too short a time |
3092 |
++ * interval to let its possible sequential accesses prevail on |
3093 |
++ * the initial seek time needed to move the disk head on the |
3094 |
++ * first sector it requested, then give the process a chance |
3095 |
++ * and for the moment return false. |
3096 |
++ */ |
3097 |
++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) |
3098 |
++ return false; |
3099 |
++ |
3100 |
++ /* |
3101 |
++ * A process is considered ``slow'' (i.e., seeky, so that we |
3102 |
++ * cannot treat it fairly in the service domain, as it would |
3103 |
++ * slow down the other processes too much) if, when a slice |
3104 |
++ * ends for whatever reason, it has received service at a |
3105 |
++ * rate that would not be high enough to complete the budget |
3106 |
++ * before the budget timeout expiration. |
3107 |
++ */ |
3108 |
++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; |
3109 |
++ |
3110 |
++ /* |
3111 |
++ * Caveat: processes doing IO in the slower disk zones will |
3112 |
++ * tend to be slow(er) even if not seeky. And the estimated |
3113 |
++ * peak rate will actually be an average over the disk |
3114 |
++ * surface. Hence, to not be too harsh with unlucky processes, |
3115 |
++ * we keep a budget/3 margin of safety before declaring a |
3116 |
++ * process slow. |
3117 |
++ */ |
3118 |
++ return expected > (4 * bfqq->entity.budget) / 3; |
3119 |
++} |
3120 |
++ |
3121 |
++/* |
3122 |
++ * To be deemed as soft real-time, an application must meet two |
3123 |
++ * requirements. First, the application must not require an average |
3124 |
++ * bandwidth higher than the approximate bandwidth required to playback or |
3125 |
++ * record a compressed high-definition video. |
3126 |
++ * The next function is invoked on the completion of the last request of a |
3127 |
++ * batch, to compute the next-start time instant, soft_rt_next_start, such |
3128 |
++ * that, if the next request of the application does not arrive before |
3129 |
++ * soft_rt_next_start, then the above requirement on the bandwidth is met. |
3130 |
++ * |
3131 |
++ * The second requirement is that the request pattern of the application is |
3132 |
++ * isochronous, i.e., that, after issuing a request or a batch of requests, |
3133 |
++ * the application stops issuing new requests until all its pending requests |
3134 |
++ * have been completed. After that, the application may issue a new batch, |
3135 |
++ * and so on. |
3136 |
++ * For this reason the next function is invoked to compute |
3137 |
++ * soft_rt_next_start only for applications that meet this requirement, |
3138 |
++ * whereas soft_rt_next_start is set to infinity for applications that do |
3139 |
++ * not. |
3140 |
++ * |
3141 |
++ * Unfortunately, even a greedy application may happen to behave in an |
3142 |
++ * isochronous way if the CPU load is high. In fact, the application may |
3143 |
++ * stop issuing requests while the CPUs are busy serving other processes, |
3144 |
++ * then restart, then stop again for a while, and so on. In addition, if |
3145 |
++ * the disk achieves a low enough throughput with the request pattern |
3146 |
++ * issued by the application (e.g., because the request pattern is random |
3147 |
++ * and/or the device is slow), then the application may meet the above |
3148 |
++ * bandwidth requirement too. To prevent such a greedy application from being |
3149 |
++ * deemed as soft real-time, a further rule is used in the computation of |
3150 |
++ * soft_rt_next_start: soft_rt_next_start must be higher than the current |
3151 |
++ * time plus the maximum time for which the arrival of a request is waited |
3152 |
++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. |
3153 |
++ * This filters out greedy applications, as the latter issue instead their |
3154 |
++ * next request as soon as possible after the last one has been completed |
3155 |
++ * (in contrast, when a batch of requests is completed, a soft real-time |
3156 |
++ * application spends some time processing data). |
3157 |
++ * |
3158 |
++ * Unfortunately, the last filter may easily generate false positives if |
3159 |
++ * only bfqd->bfq_slice_idle is used as a reference time interval and one |
3160 |
++ * or both the following cases occur: |
3161 |
++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher |
3162 |
++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with |
3163 |
++ * HZ=100. |
3164 |
++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing |
3165 |
++ * for a while, then suddenly 'jump' by several units to recover the lost |
3166 |
++ * increments. This seems to happen, e.g., inside virtual machines. |
3167 |
++ * To address this issue, we do not use as a reference time interval just |
3168 |
++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In |
3169 |
++ * particular we add the minimum number of jiffies for which the filter |
3170 |
++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual |
3171 |
++ * machines. |
3172 |
++ */ |
3173 |
++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, |
3174 |
++ struct bfq_queue *bfqq) |
3175 |
++{ |
3176 |
++ return max(bfqq->last_idle_bklogged + |
3177 |
++ HZ * bfqq->service_from_backlogged / |
3178 |
++ bfqd->bfq_wr_max_softrt_rate, |
3179 |
++ jiffies + bfqq->bfqd->bfq_slice_idle + 4); |
3180 |
++} |
3181 |
++ |
3182 |
++/* |
3183 |
++ * Return the largest-possible time instant such that, for as long as possible, |
3184 |
++ * the current time will be lower than this time instant according to the macro |
3185 |
++ * time_is_before_jiffies(). |
3186 |
++ */ |
3187 |
++static unsigned long bfq_infinity_from_now(unsigned long now) |
3188 |
++{ |
3189 |
++ return now + ULONG_MAX / 2; |
3190 |
++} |
3191 |
++ |
3192 |
++/** |
3193 |
++ * bfq_bfqq_expire - expire a queue. |
3194 |
++ * @bfqd: device owning the queue. |
3195 |
++ * @bfqq: the queue to expire. |
3196 |
++ * @compensate: if true, compensate for the time spent idling. |
3197 |
++ * @reason: the reason causing the expiration. |
3198 |
++ * |
3199 |
++ * |
3200 |
++ * If the process associated to the queue is slow (i.e., seeky), or in |
3201 |
++ * case of budget timeout, or, finally, if it is async, we |
3202 |
++ * artificially charge it an entire budget (independently of the |
3203 |
++ * actual service it received). As a consequence, the queue will get |
3204 |
++ * higher timestamps than the correct ones upon reactivation, and |
3205 |
++ * hence it will be rescheduled as if it had received more service |
3206 |
++ * than what it actually received. In the end, this class of processes |
3207 |
++ * will receive less service in proportion to how slowly they consume |
3208 |
++ * their budgets (and hence how seriously they tend to lower the |
3209 |
++ * throughput). |
3210 |
++ * |
3211 |
++ * In contrast, when a queue expires because it has been idling for |
3212 |
++ * too long or because it exhausted its budget, we do not touch the |
3213 |
++ * amount of service it has received. Hence when the queue will be |
3214 |
++ * reactivated and its timestamps updated, the latter will be in sync |
3215 |
++ * with the actual service received by the queue until expiration. |
3216 |
++ * |
3217 |
++ * Charging a full budget to the first type of queues and the exact |
3218 |
++ * service to the others has the effect of using the WF2Q+ policy to |
3219 |
++ * schedule the former on a timeslice basis, without violating the |
3220 |
++ * service domain guarantees of the latter. |
3221 |
++ */ |
3222 |
++static void bfq_bfqq_expire(struct bfq_data *bfqd, |
3223 |
++ struct bfq_queue *bfqq, |
3224 |
++ bool compensate, |
3225 |
++ enum bfqq_expiration reason) |
3226 |
++{ |
3227 |
++ bool slow; |
3228 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
3229 |
++ |
3230 |
++ /* |
3231 |
++ * Update disk peak rate for autotuning and check whether the |
3232 |
++ * process is slow (see bfq_update_peak_rate). |
3233 |
++ */ |
3234 |
++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); |
3235 |
++ |
3236 |
++ /* |
3237 |
++ * As explained above, 'punish' slow (i.e., seeky), timed-out |
3238 |
++ * and async queues, to favor sequential sync workloads. |
3239 |
++ * |
3240 |
++ * Processes doing I/O in the slower disk zones will tend to be |
3241 |
++ * slow(er) even if not seeky. Hence, since the estimated peak |
3242 |
++ * rate is actually an average over the disk surface, these |
3243 |
++ * processes may timeout just for bad luck. To avoid punishing |
3244 |
++ * them we do not charge a full budget to a process that |
3245 |
++ * succeeded in consuming at least 2/3 of its budget. |
3246 |
++ */ |
3247 |
++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && |
3248 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) |
3249 |
++ bfq_bfqq_charge_full_budget(bfqq); |
3250 |
++ |
3251 |
++ bfqq->service_from_backlogged += bfqq->entity.service; |
3252 |
++ |
3253 |
++ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && |
3254 |
++ !bfq_bfqq_constantly_seeky(bfqq)) { |
3255 |
++ bfq_mark_bfqq_constantly_seeky(bfqq); |
3256 |
++ if (!blk_queue_nonrot(bfqd->queue)) |
3257 |
++ bfqd->const_seeky_busy_in_flight_queues++; |
3258 |
++ } |
3259 |
++ |
3260 |
++ if (reason == BFQ_BFQQ_TOO_IDLE && |
3261 |
++ bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) |
3262 |
++ bfq_clear_bfqq_IO_bound(bfqq); |
3263 |
++ |
3264 |
++ if (bfqd->low_latency && bfqq->wr_coeff == 1) |
3265 |
++ bfqq->last_wr_start_finish = jiffies; |
3266 |
++ |
3267 |
++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && |
3268 |
++ RB_EMPTY_ROOT(&bfqq->sort_list)) { |
3269 |
++ /* |
3270 |
++ * If we get here, and there are no outstanding requests, |
3271 |
++ * then the request pattern is isochronous (see the comments |
3272 |
++ * to the function bfq_bfqq_softrt_next_start()). Hence we |
3273 |
++ * can compute soft_rt_next_start. If, instead, the queue |
3274 |
++ * still has outstanding requests, then we have to wait |
3275 |
++ * for the completion of all the outstanding requests to |
3276 |
++ * discover whether the request pattern is actually |
3277 |
++ * isochronous. |
3278 |
++ */ |
3279 |
++ if (bfqq->dispatched == 0) |
3280 |
++ bfqq->soft_rt_next_start = |
3281 |
++ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
3282 |
++ else { |
3283 |
++ /* |
3284 |
++ * The application is still waiting for the |
3285 |
++ * completion of one or more requests: |
3286 |
++ * prevent it from possibly being incorrectly |
3287 |
++ * deemed as soft real-time by setting its |
3288 |
++ * soft_rt_next_start to infinity. In fact, |
3289 |
++ * without this assignment, the application |
3290 |
++ * would be incorrectly deemed as soft |
3291 |
++ * real-time if: |
3292 |
++ * 1) it issued a new request before the |
3293 |
++ * completion of all its in-flight |
3294 |
++ * requests, and |
3295 |
++ * 2) at that time, its soft_rt_next_start |
3296 |
++ * happened to be in the past. |
3297 |
++ */ |
3298 |
++ bfqq->soft_rt_next_start = |
3299 |
++ bfq_infinity_from_now(jiffies); |
3300 |
++ /* |
3301 |
++ * Schedule an update of soft_rt_next_start to when |
3302 |
++ * the task may be discovered to be isochronous. |
3303 |
++ */ |
3304 |
++ bfq_mark_bfqq_softrt_update(bfqq); |
3305 |
++ } |
3306 |
++ } |
3307 |
++ |
3308 |
++ bfq_log_bfqq(bfqd, bfqq, |
3309 |
++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, |
3310 |
++ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); |
3311 |
++ |
3312 |
++ /* |
3313 |
++ * Increase, decrease or leave budget unchanged according to |
3314 |
++ * reason. |
3315 |
++ */ |
3316 |
++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); |
3317 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
3318 |
++} |
3319 |
++ |
3320 |
++/* |
3321 |
++ * Budget timeout is not implemented through a dedicated timer, but |
3322 |
++ * just checked on request arrivals and completions, as well as on |
3323 |
++ * idle timer expirations. |
3324 |
++ */ |
3325 |
++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) |
3326 |
++{ |
3327 |
++ if (bfq_bfqq_budget_new(bfqq) || |
3328 |
++ time_before(jiffies, bfqq->budget_timeout)) |
3329 |
++ return false; |
3330 |
++ return true; |
3331 |
++} |
3332 |
++ |
3333 |
++/* |
3334 |
++ * If we expire a queue that is waiting for the arrival of a new |
3335 |
++ * request, we may prevent the fictitious timestamp back-shifting that |
3336 |
++ * allows the guarantees of the queue to be preserved (see [1] for |
3337 |
++ * this tricky aspect). Hence we return true only if this condition |
3338 |
++ * does not hold, or if the queue is slow enough to deserve only to be |
3339 |
++ * kicked off for preserving a high throughput. |
3340 |
++*/ |
3341 |
++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) |
3342 |
++{ |
3343 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
3344 |
++ "may_budget_timeout: wait_request %d left %d timeout %d", |
3345 |
++ bfq_bfqq_wait_request(bfqq), |
3346 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, |
3347 |
++ bfq_bfqq_budget_timeout(bfqq)); |
3348 |
++ |
3349 |
++ return (!bfq_bfqq_wait_request(bfqq) || |
3350 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) |
3351 |
++ && |
3352 |
++ bfq_bfqq_budget_timeout(bfqq); |
3353 |
++} |
3354 |
++ |
3355 |
++/* |
3356 |
++ * For a queue that becomes empty, device idling is allowed only if |
3357 |
++ * this function returns true for that queue. As a consequence, since |
3358 |
++ * device idling plays a critical role for both throughput boosting |
3359 |
++ * and service guarantees, the return value of this function plays a |
3360 |
++ * critical role as well. |
3361 |
++ * |
3362 |
++ * In a nutshell, this function returns true only if idling is |
3363 |
++ * beneficial for throughput or, even if detrimental for throughput, |
3364 |
++ * idling is however necessary to preserve service guarantees (low |
3365 |
++ * latency, desired throughput distribution, ...). In particular, on |
3366 |
++ * NCQ-capable devices, this function tries to return false, so as to |
3367 |
++ * help keep the drives' internal queues full, whenever this helps the |
3368 |
++ * device boost the throughput without causing any service-guarantee |
3369 |
++ * issue. |
3370 |
++ * |
3371 |
++ * In more detail, the return value of this function is obtained by, |
3372 |
++ * first, computing a number of boolean variables that take into |
3373 |
++ * account throughput and service-guarantee issues, and, then, |
3374 |
++ * combining these variables in a logical expression. Most of the |
3375 |
++ * issues taken into account are not trivial. We discuss these issues |
3376 |
++ * while introducing the variables. |
3377 |
++ */ |
3378 |
++static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
3379 |
++{ |
3380 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
3381 |
++ bool idling_boosts_thr, idling_boosts_thr_without_issues, |
3382 |
++ all_queues_seeky, on_hdd_and_not_all_queues_seeky, |
3383 |
++ idling_needed_for_service_guarantees, |
3384 |
++ asymmetric_scenario; |
3385 |
++ |
3386 |
++ /* |
3387 |
++ * The next variable takes into account the cases where idling |
3388 |
++ * boosts the throughput. |
3389 |
++ * |
3390 |
++ * The value of the variable is computed considering, first, that |
3391 |
++ * idling is virtually always beneficial for the throughput if: |
3392 |
++ * (a) the device is not NCQ-capable, or |
3393 |
++ * (b) regardless of the presence of NCQ, the device is rotational |
3394 |
++ * and the request pattern for bfqq is I/O-bound and sequential. |
3395 |
++ * |
3396 |
++ * Secondly, and in contrast to the above item (b), idling an |
3397 |
++ * NCQ-capable flash-based device would not boost the |
3398 |
++ * throughput even with sequential I/O; rather it would lower |
3399 |
++ * the throughput in proportion to how fast the device |
3400 |
++ * is. Accordingly, the next variable is true if any of the |
3401 |
++ * above conditions (a) and (b) is true, and, in particular, |
3402 |
++ * happens to be false if bfqd is an NCQ-capable flash-based |
3403 |
++ * device. |
3404 |
++ */ |
3405 |
++ idling_boosts_thr = !bfqd->hw_tag || |
3406 |
++ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && |
3407 |
++ bfq_bfqq_idle_window(bfqq)) ; |
3408 |
++ |
3409 |
++ /* |
3410 |
++ * The value of the next variable, |
3411 |
++ * idling_boosts_thr_without_issues, is equal to that of |
3412 |
++ * idling_boosts_thr, unless a special case holds. In this |
3413 |
++ * special case, described below, idling may cause problems to |
3414 |
++ * weight-raised queues. |
3415 |
++ * |
3416 |
++ * When the request pool is saturated (e.g., in the presence |
3417 |
++ * of write hogs), if the processes associated with |
3418 |
++ * non-weight-raised queues ask for requests at a lower rate, |
3419 |
++ * then processes associated with weight-raised queues have a |
3420 |
++ * higher probability to get a request from the pool |
3421 |
++ * immediately (or at least soon) when they need one. Thus |
3422 |
++ * they have a higher probability to actually get a fraction |
3423 |
++ * of the device throughput proportional to their high |
3424 |
++ * weight. This is especially true with NCQ-capable drives, |
3425 |
++ * which enqueue several requests in advance, and further |
3426 |
++ * reorder internally-queued requests. |
3427 |
++ * |
3428 |
++ * For this reason, we force to false the value of |
3429 |
++ * idling_boosts_thr_without_issues if there are weight-raised |
3430 |
++ * busy queues. In this case, and if bfqq is not weight-raised, |
3431 |
++ * this guarantees that the device is not idled for bfqq (if, |
3432 |
++ * instead, bfqq is weight-raised, then idling will be |
3433 |
++ * guaranteed by another variable, see below). Combined with |
3434 |
++ * the timestamping rules of BFQ (see [1] for details), this |
3435 |
++ * behavior causes bfqq, and hence any sync non-weight-raised |
3436 |
++ * queue, to get a lower number of requests served, and thus |
3437 |
++ * to ask for a lower number of requests from the request |
3438 |
++ * pool, before the busy weight-raised queues get served |
3439 |
++ * again. This often mitigates starvation problems in the |
3440 |
++ * presence of heavy write workloads and NCQ, thereby |
3441 |
++ * guaranteeing a higher application and system responsiveness |
3442 |
++ * in these hostile scenarios. |
3443 |
++ */ |
3444 |
++ idling_boosts_thr_without_issues = idling_boosts_thr && |
3445 |
++ bfqd->wr_busy_queues == 0; |
3446 |
++ |
3447 |
++ /* |
3448 |
++ * There are then two cases where idling must be performed not |
3449 |
++ * for throughput concerns, but to preserve service |
3450 |
++ * guarantees. In the description of these cases, we say, for |
3451 |
++ * short, that a queue is sequential/random if the process |
3452 |
++ * associated to the queue issues sequential/random requests |
3453 |
++ * (in the second case the queue may be tagged as seeky or |
3454 |
++ * even constantly_seeky). |
3455 |
++ * |
3456 |
++ * To introduce the first case, we note that, since |
3457 |
++ * bfq_bfqq_idle_window(bfqq) is false if the device is |
3458 |
++ * NCQ-capable and bfqq is random (see |
3459 |
++ * bfq_update_idle_window()), then, from the above two |
3460 |
++ * assignments it follows that |
3461 |
++ * idling_boosts_thr_without_issues is false if the device is |
3462 |
++ * NCQ-capable and bfqq is random. Therefore, for this case, |
3463 |
++ * device idling would never be allowed if we used just |
3464 |
++ * idling_boosts_thr_without_issues to decide whether to allow |
3465 |
++ * it. And, beneficially, this would imply that throughput |
3466 |
++ * would always be boosted also with random I/O on NCQ-capable |
3467 |
++ * HDDs. |
3468 |
++ * |
3469 |
++ * But we must be careful on this point, to avoid an unfair |
3470 |
++ * treatment for bfqq. In fact, because of the same above |
3471 |
++ * assignments, idling_boosts_thr_without_issues is, on the |
3472 |
++ * other hand, true if 1) the device is an HDD and bfqq is |
3473 |
++ * sequential, and 2) there are no busy weight-raised |
3474 |
++ * queues. As a consequence, if we used just |
3475 |
++ * idling_boosts_thr_without_issues to decide whether to idle |
3476 |
++ * the device, then with an HDD we might easily bump into a |
3477 |
++ * scenario where queues that are sequential and I/O-bound |
3478 |
++ * would enjoy idling, whereas random queues would not. The |
3479 |
++ * latter might then get a low share of the device throughput, |
3480 |
++ * simply because the former would get many requests served |
3481 |
++ * after being set as in service, while the latter would not. |
3482 |
++ * |
3483 |
++ * To address this issue, we start by setting to true a |
3484 |
++ * sentinel variable, on_hdd_and_not_all_queues_seeky, if the |
3485 |
++ * device is rotational and not all queues with pending or |
3486 |
++ * in-flight requests are constantly seeky (i.e., there are |
3487 |
++ * active sequential queues, and bfqq might then be mistreated |
3488 |
++ * if it does not enjoy idling because it is random). |
3489 |
++ */ |
3490 |
++ all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && |
3491 |
++ bfqd->busy_in_flight_queues == |
3492 |
++ bfqd->const_seeky_busy_in_flight_queues; |
3493 |
++ |
3494 |
++ on_hdd_and_not_all_queues_seeky = |
3495 |
++ !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; |
3496 |
++ |
3497 |
++ /* |
3498 |
++ * To introduce the second case where idling needs to be |
3499 |
++ * performed to preserve service guarantees, we can note that |
3500 |
++ * allowing the drive to enqueue more than one request at a |
3501 |
++ * time, and hence delegating de facto final scheduling |
3502 |
++ * decisions to the drive's internal scheduler, causes loss of |
3503 |
++ * control on the actual request service order. In particular, |
3504 |
++ * the critical situation is when requests from different |
3505 |
++ * processes happen to be present, at the same time, in the |
3506 |
++ * internal queue(s) of the drive. In such a situation, the |
3507 |
++ * drive, by deciding the service order of the |
3508 |
++ * internally-queued requests, does determine also the actual |
3509 |
++ * throughput distribution among these processes. But the |
3510 |
++ * drive typically has no notion or concern about per-process |
3511 |
++ * throughput distribution, and makes its decisions only on a |
3512 |
++ * per-request basis. Therefore, the service distribution |
3513 |
++ * enforced by the drive's internal scheduler is likely to |
3514 |
++ * coincide with the desired device-throughput distribution |
3515 |
++ * only in a completely symmetric scenario where: |
3516 |
++ * (i) each of these processes must get the same throughput as |
3517 |
++ * the others; |
3518 |
++ * (ii) all these processes have the same I/O pattern |
3519 |
++ * (either sequential or random). |
3520 |
++ * In fact, in such a scenario, the drive will tend to treat |
3521 |
++ * the requests of each of these processes in about the same |
3522 |
++ * way as the requests of the others, and thus to provide |
3523 |
++ * each of these processes with about the same throughput |
3524 |
++ * (which is exactly the desired throughput distribution). In |
3525 |
++ * contrast, in any asymmetric scenario, device idling is |
3526 |
++ * certainly needed to guarantee that bfqq receives its |
3527 |
++ * assigned fraction of the device throughput (see [1] for |
3528 |
++ * details). |
3529 |
++ * |
3530 |
++ * We address this issue by controlling, actually, only the |
3531 |
++ * symmetry sub-condition (i), i.e., provided that |
3532 |
++ * sub-condition (i) holds, idling is not performed, |
3533 |
++ * regardless of whether sub-condition (ii) holds. In other |
3534 |
++ * words, only if sub-condition (i) does not hold, then idling is |
3535 |
++ * allowed, and the device tends to be prevented from queueing |
3536 |
++ * many requests, possibly of several processes. The reason |
3537 |
++ * for not controlling also sub-condition (ii) is that, first, |
3538 |
++ * in the case of an HDD, the asymmetry in terms of types of |
3539 |
++ * I/O patterns is already taken into account in the above |
3540 |
++ * sentinel variable |
3541 |
++ * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a |
3542 |
++ * flash-based device, we prefer however to privilege |
3543 |
++ * throughput (and idling lowers throughput for this type of |
3544 |
++ * devices), for the following reasons: |
3545 |
++ * 1) differently from HDDs, the service time of random |
3546 |
++ * requests is not orders of magnitude higher than the service |
3547 |
++ * time of sequential requests; thus, even if processes doing |
3548 |
++ * sequential I/O get a preferential treatment with respect to |
3549 |
++ * others doing random I/O, the consequences are not as |
3550 |
++ * dramatic as with HDDs; |
3551 |
++ * 2) if a process doing random I/O does need strong |
3552 |
++ * throughput guarantees, it is hopefully already being |
3553 |
++ * weight-raised, or the user is likely to have assigned it a |
3554 |
++ * higher weight than the other processes (and thus |
3555 |
++ * sub-condition (i) is likely to be false, which triggers |
3556 |
++ * idling). |
3557 |
++ * |
3558 |
++ * According to the above considerations, the next variable is |
3559 |
++ * true (only) if sub-condition (i) does not hold. To compute the |
3560 |
++ * value of this variable, we not only use the return value of |
3561 |
++ * the function bfq_symmetric_scenario(), but also check |
3562 |
++ * whether bfqq is being weight-raised, because |
3563 |
++ * bfq_symmetric_scenario() does not take into account also |
3564 |
++ * weight-raised queues (see comments to |
3565 |
++ * bfq_weights_tree_add()). |
3566 |
++ * |
3567 |
++ * As a side note, it is worth considering that the above |
3568 |
++ * device-idling countermeasures may however fail in the |
3569 |
++ * following unlucky scenario: if idling is (correctly) |
3570 |
++ * disabled in a time period during which all symmetry |
3571 |
++ * sub-conditions hold, and hence the device is allowed to |
3572 |
++ * enqueue many requests, but at some later point in time some |
3573 |
++ * sub-condition stops holding, then it may become impossible |
3574 |
++ * to let requests be served in the desired order until all |
3575 |
++ * the requests already queued in the device have been served. |
3576 |
++ */ |
3577 |
++ asymmetric_scenario = bfqq->wr_coeff > 1 || |
3578 |
++ !bfq_symmetric_scenario(bfqd); |
3579 |
++ |
3580 |
++ /* |
3581 |
++ * Finally, there is a case where maximizing throughput is the |
3582 |
++ * best choice even if it may cause unfairness toward |
3583 |
++ * bfqq. Such a case is when bfqq became active in a burst of |
3584 |
++ * queue activations. Queues that became active during a large |
3585 |
++ * burst benefit only from throughput, as discussed in the |
3586 |
++ * comments to bfq_handle_burst. Thus, if bfqq became active |
3587 |
++ * in a burst and not idling the device maximizes throughput, |
3588 |
++ * then the device must not be idled, because not idling the |
3589 |
++ * device provides bfqq and all other queues in the burst with |
3590 |
++ * maximum benefit. Combining this and the two cases above, we |
3591 |
++ * can now establish when idling is actually needed to |
3592 |
++ * preserve service guarantees. |
3593 |
++ */ |
3594 |
++ idling_needed_for_service_guarantees = |
3595 |
++ (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && |
3596 |
++ !bfq_bfqq_in_large_burst(bfqq); |
3597 |
++ |
3598 |
++ /* |
3599 |
++ * We have now all the components we need to compute the return |
3600 |
++ * value of the function, which is true only if both the following |
3601 |
++ * conditions hold: |
3602 |
++ * 1) bfqq is sync, because idling makes sense only for sync queues; |
3603 |
++ * 2) idling either boosts the throughput (without issues), or |
3604 |
++ * is necessary to preserve service guarantees. |
3605 |
++ */ |
3606 |
++ return bfq_bfqq_sync(bfqq) && |
3607 |
++ (idling_boosts_thr_without_issues || |
3608 |
++ idling_needed_for_service_guarantees); |
3609 |
++} |
3610 |
++ |
3611 |
++/* |
3612 |
++ * If the in-service queue is empty but the function bfq_bfqq_may_idle |
3613 |
++ * returns true, then: |
3614 |
++ * 1) the queue must remain in service and cannot be expired, and |
3615 |
++ * 2) the device must be idled to wait for the possible arrival of a new |
3616 |
++ * request for the queue. |
3617 |
++ * See the comments to the function bfq_bfqq_may_idle for the reasons |
3618 |
++ * why performing device idling is the best choice to boost the throughput |
3619 |
++ * and preserve service guarantees when bfq_bfqq_may_idle itself |
3620 |
++ * returns true. |
3621 |
++ */ |
3622 |
++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) |
3623 |
++{ |
3624 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
3625 |
++ |
3626 |
++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && |
3627 |
++ bfq_bfqq_may_idle(bfqq); |
3628 |
++} |
3629 |
++ |
3630 |
++/* |
3631 |
++ * Select a queue for service. If we have a current queue in service, |
3632 |
++ * check whether to continue servicing it, or retrieve and set a new one. |
3633 |
++ */ |
3634 |
++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
3635 |
++{ |
3636 |
++ struct bfq_queue *bfqq; |
3637 |
++ struct request *next_rq; |
3638 |
++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
3639 |
++ |
3640 |
++ bfqq = bfqd->in_service_queue; |
3641 |
++ if (!bfqq) |
3642 |
++ goto new_queue; |
3643 |
++ |
3644 |
++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); |
3645 |
++ |
3646 |
++ if (bfq_may_expire_for_budg_timeout(bfqq) && |
3647 |
++ !timer_pending(&bfqd->idle_slice_timer) && |
3648 |
++ !bfq_bfqq_must_idle(bfqq)) |
3649 |
++ goto expire; |
3650 |
++ |
3651 |
++ next_rq = bfqq->next_rq; |
3652 |
++ /* |
3653 |
++ * If bfqq has requests queued and it has enough budget left to |
3654 |
++ * serve them, keep the queue, otherwise expire it. |
3655 |
++ */ |
3656 |
++ if (next_rq) { |
3657 |
++ if (bfq_serv_to_charge(next_rq, bfqq) > |
3658 |
++ bfq_bfqq_budget_left(bfqq)) { |
3659 |
++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; |
3660 |
++ goto expire; |
3661 |
++ } else { |
3662 |
++ /* |
3663 |
++ * The idle timer may be pending because we may |
3664 |
++ * not disable disk idling even when a new request |
3665 |
++ * arrives. |
3666 |
++ */ |
3667 |
++ if (timer_pending(&bfqd->idle_slice_timer)) { |
3668 |
++ /* |
3669 |
++ * If we get here: 1) at least a new request |
3670 |
++ * has arrived but we have not disabled the |
3671 |
++ * timer because the request was too small, |
3672 |
++ * 2) then the block layer has unplugged |
3673 |
++ * the device, causing the dispatch to be |
3674 |
++ * invoked. |
3675 |
++ * |
3676 |
++ * Since the device is unplugged, now the |
3677 |
++ * requests are probably large enough to |
3678 |
++ * provide a reasonable throughput. |
3679 |
++ * So we disable idling. |
3680 |
++ */ |
3681 |
++ bfq_clear_bfqq_wait_request(bfqq); |
3682 |
++ del_timer(&bfqd->idle_slice_timer); |
3683 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
3684 |
++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); |
3685 |
++#endif |
3686 |
++ } |
3687 |
++ goto keep_queue; |
3688 |
++ } |
3689 |
++ } |
3690 |
++ |
3691 |
++ /* |
3692 |
++ * No requests pending. However, if the in-service queue is idling |
3693 |
++ * for a new request, or has requests waiting for a completion and |
3694 |
++ * may idle after their completion, then keep it anyway. |
3695 |
++ */ |
3696 |
++ if (timer_pending(&bfqd->idle_slice_timer) || |
3697 |
++ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { |
3698 |
++ bfqq = NULL; |
3699 |
++ goto keep_queue; |
3700 |
++ } |
3701 |
++ |
3702 |
++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; |
3703 |
++expire: |
3704 |
++ bfq_bfqq_expire(bfqd, bfqq, false, reason); |
3705 |
++new_queue: |
3706 |
++ bfqq = bfq_set_in_service_queue(bfqd); |
3707 |
++ bfq_log(bfqd, "select_queue: new queue %d returned", |
3708 |
++ bfqq ? bfqq->pid : 0); |
3709 |
++keep_queue: |
3710 |
++ return bfqq; |
3711 |
++} |
3712 |
++ |
3713 |
++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
3714 |
++{ |
3715 |
++ struct bfq_entity *entity = &bfqq->entity; |
3716 |
++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ |
3717 |
++ bfq_log_bfqq(bfqd, bfqq, |
3718 |
++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", |
3719 |
++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), |
3720 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time), |
3721 |
++ bfqq->wr_coeff, |
3722 |
++ bfqq->entity.weight, bfqq->entity.orig_weight); |
3723 |
++ |
3724 |
++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != |
3725 |
++ entity->orig_weight * bfqq->wr_coeff); |
3726 |
++ if (entity->prio_changed) |
3727 |
++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); |
3728 |
++ |
3729 |
++ /* |
3730 |
++ * If the queue was activated in a burst, or |
3731 |
++ * too much time has elapsed from the beginning |
3732 |
++ * of this weight-raising period, then end weight |
3733 |
++ * raising. |
3734 |
++ */ |
3735 |
++ if (bfq_bfqq_in_large_burst(bfqq) || |
3736 |
++ time_is_before_jiffies(bfqq->last_wr_start_finish + |
3737 |
++ bfqq->wr_cur_max_time)) { |
3738 |
++ bfqq->last_wr_start_finish = jiffies; |
3739 |
++ bfq_log_bfqq(bfqd, bfqq, |
3740 |
++ "wrais ending at %lu, rais_max_time %u", |
3741 |
++ bfqq->last_wr_start_finish, |
3742 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time)); |
3743 |
++ bfq_bfqq_end_wr(bfqq); |
3744 |
++ } |
3745 |
++ } |
3746 |
++ /* Update weight both if it must be raised and if it must be lowered */ |
3747 |
++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) |
3748 |
++ __bfq_entity_update_weight_prio( |
3749 |
++ bfq_entity_service_tree(entity), |
3750 |
++ entity); |
3751 |
++} |
3752 |
++ |
3753 |
++/* |
3754 |
++ * Dispatch one request from bfqq, moving it to the request queue |
3755 |
++ * dispatch list. |
3756 |
++ */ |
3757 |
++static int bfq_dispatch_request(struct bfq_data *bfqd, |
3758 |
++ struct bfq_queue *bfqq) |
3759 |
++{ |
3760 |
++ int dispatched = 0; |
3761 |
++ struct request *rq; |
3762 |
++ unsigned long service_to_charge; |
3763 |
++ |
3764 |
++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); |
3765 |
++ |
3766 |
++ /* Follow expired path, else get first next available. */ |
3767 |
++ rq = bfq_check_fifo(bfqq); |
3768 |
++ if (!rq) |
3769 |
++ rq = bfqq->next_rq; |
3770 |
++ service_to_charge = bfq_serv_to_charge(rq, bfqq); |
3771 |
++ |
3772 |
++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { |
3773 |
++ /* |
3774 |
++ * This may happen if the next rq is chosen in fifo order |
3775 |
++ * instead of sector order. The budget is properly |
3776 |
++ * dimensioned to be always sufficient to serve the next |
3777 |
++ * request only if it is chosen in sector order. The reason |
3778 |
++ * is that it would be quite inefficient and little useful |
3779 |
++ * to always make sure that the budget is large enough to |
3780 |
++ * serve even the possible next rq in fifo order. |
3781 |
++ * In fact, requests are seldom served in fifo order. |
3782 |
++ * |
3783 |
++ * Expire the queue for budget exhaustion, and make sure |
3784 |
++ * that the next act_budget is enough to serve the next |
3785 |
++ * request, even if it comes from the fifo expired path. |
3786 |
++ */ |
3787 |
++ bfqq->next_rq = rq; |
3788 |
++ /* |
3789 |
++ * Since this dispatch is failed, make sure that |
3790 |
++ * a new one will be performed |
3791 |
++ */ |
3792 |
++ if (!bfqd->rq_in_driver) |
3793 |
++ bfq_schedule_dispatch(bfqd); |
3794 |
++ goto expire; |
3795 |
++ } |
3796 |
++ |
3797 |
++ /* Finally, insert request into driver dispatch list. */ |
3798 |
++ bfq_bfqq_served(bfqq, service_to_charge); |
3799 |
++ bfq_dispatch_insert(bfqd->queue, rq); |
3800 |
++ |
3801 |
++ bfq_update_wr_data(bfqd, bfqq); |
3802 |
++ |
3803 |
++ bfq_log_bfqq(bfqd, bfqq, |
3804 |
++ "dispatched %u sec req (%llu), budg left %d", |
3805 |
++ blk_rq_sectors(rq), |
3806 |
++ (long long unsigned)blk_rq_pos(rq), |
3807 |
++ bfq_bfqq_budget_left(bfqq)); |
3808 |
++ |
3809 |
++ dispatched++; |
3810 |
++ |
3811 |
++ if (!bfqd->in_service_bic) { |
3812 |
++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); |
3813 |
++ bfqd->in_service_bic = RQ_BIC(rq); |
3814 |
++ } |
3815 |
++ |
3816 |
++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && |
3817 |
++ dispatched >= bfqd->bfq_max_budget_async_rq) || |
3818 |
++ bfq_class_idle(bfqq))) |
3819 |
++ goto expire; |
3820 |
++ |
3821 |
++ return dispatched; |
3822 |
++ |
3823 |
++expire: |
3824 |
++ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); |
3825 |
++ return dispatched; |
3826 |
++} |
3827 |
++ |
3828 |
++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) |
3829 |
++{ |
3830 |
++ int dispatched = 0; |
3831 |
++ |
3832 |
++ while (bfqq->next_rq) { |
3833 |
++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); |
3834 |
++ dispatched++; |
3835 |
++ } |
3836 |
++ |
3837 |
++ BUG_ON(!list_empty(&bfqq->fifo)); |
3838 |
++ return dispatched; |
3839 |
++} |
3840 |
++ |
3841 |
++/* |
3842 |
++ * Drain our current requests. |
3843 |
++ * Used for barriers and when switching io schedulers on-the-fly. |
3844 |
++ */ |
3845 |
++static int bfq_forced_dispatch(struct bfq_data *bfqd) |
3846 |
++{ |
3847 |
++ struct bfq_queue *bfqq, *n; |
3848 |
++ struct bfq_service_tree *st; |
3849 |
++ int dispatched = 0; |
3850 |
++ |
3851 |
++ bfqq = bfqd->in_service_queue; |
3852 |
++ if (bfqq) |
3853 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
3854 |
++ |
3855 |
++ /* |
3856 |
++ * Loop through classes, and be careful to leave the scheduler |
3857 |
++ * in a consistent state, as feedback mechanisms and vtime |
3858 |
++ * updates cannot be disabled during the process. |
3859 |
++ */ |
3860 |
++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { |
3861 |
++ st = bfq_entity_service_tree(&bfqq->entity); |
3862 |
++ |
3863 |
++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); |
3864 |
++ bfqq->max_budget = bfq_max_budget(bfqd); |
3865 |
++ |
3866 |
++ bfq_forget_idle(st); |
3867 |
++ } |
3868 |
++ |
3869 |
++ BUG_ON(bfqd->busy_queues != 0); |
3870 |
++ |
3871 |
++ return dispatched; |
3872 |
++} |
3873 |
++ |
3874 |
++static int bfq_dispatch_requests(struct request_queue *q, int force) |
3875 |
++{ |
3876 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
3877 |
++ struct bfq_queue *bfqq; |
3878 |
++ int max_dispatch; |
3879 |
++ |
3880 |
++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); |
3881 |
++ if (bfqd->busy_queues == 0) |
3882 |
++ return 0; |
3883 |
++ |
3884 |
++ if (unlikely(force)) |
3885 |
++ return bfq_forced_dispatch(bfqd); |
3886 |
++ |
3887 |
++ bfqq = bfq_select_queue(bfqd); |
3888 |
++ if (!bfqq) |
3889 |
++ return 0; |
3890 |
++ |
3891 |
++ if (bfq_class_idle(bfqq)) |
3892 |
++ max_dispatch = 1; |
3893 |
++ |
3894 |
++ if (!bfq_bfqq_sync(bfqq)) |
3895 |
++ max_dispatch = bfqd->bfq_max_budget_async_rq; |
3896 |
++ |
3897 |
++ if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { |
3898 |
++ if (bfqd->busy_queues > 1) |
3899 |
++ return 0; |
3900 |
++ if (bfqq->dispatched >= 4 * max_dispatch) |
3901 |
++ return 0; |
3902 |
++ } |
3903 |
++ |
3904 |
++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) |
3905 |
++ return 0; |
3906 |
++ |
3907 |
++ bfq_clear_bfqq_wait_request(bfqq); |
3908 |
++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
3909 |
++ |
3910 |
++ if (!bfq_dispatch_request(bfqd, bfqq)) |
3911 |
++ return 0; |
3912 |
++ |
3913 |
++ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", |
3914 |
++ bfq_bfqq_sync(bfqq) ? "sync" : "async"); |
3915 |
++ |
3916 |
++ return 1; |
3917 |
++} |
3918 |
++ |
3919 |
++/* |
3920 |
++ * Task holds one reference to the queue, dropped when task exits. Each rq |
3921 |
++ * in-flight on this queue also holds a reference, dropped when rq is freed. |
3922 |
++ * |
3923 |
++ * Queue lock must be held here. |
3924 |
++ */ |
3925 |
++static void bfq_put_queue(struct bfq_queue *bfqq) |
3926 |
++{ |
3927 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
3928 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
3929 |
++ struct bfq_group *bfqg = bfqq_group(bfqq); |
3930 |
++#endif |
3931 |
++ |
3932 |
++ BUG_ON(atomic_read(&bfqq->ref) <= 0); |
3933 |
++ |
3934 |
++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, |
3935 |
++ atomic_read(&bfqq->ref)); |
3936 |
++ if (!atomic_dec_and_test(&bfqq->ref)) |
3937 |
++ return; |
3938 |
++ |
3939 |
++ BUG_ON(rb_first(&bfqq->sort_list)); |
3940 |
++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); |
3941 |
++ BUG_ON(bfqq->entity.tree); |
3942 |
++ BUG_ON(bfq_bfqq_busy(bfqq)); |
3943 |
++ BUG_ON(bfqd->in_service_queue == bfqq); |
3944 |
++ |
3945 |
++ if (bfq_bfqq_sync(bfqq)) |
3946 |
++ /* |
3947 |
++ * The fact that this queue is being destroyed does not |
3948 |
++ * invalidate the fact that this queue may have been |
3949 |
++ * activated during the current burst. As a consequence, |
3950 |
++ * although the queue does not exist anymore, and hence |
3951 |
++ * needs to be removed from the burst list if there, |
3952 |
++ * the burst size has not to be decremented. |
3953 |
++ */ |
3954 |
++ hlist_del_init(&bfqq->burst_list_node); |
3955 |
++ |
3956 |
++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); |
3957 |
++ |
3958 |
++ kmem_cache_free(bfq_pool, bfqq); |
3959 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
3960 |
++ bfqg_put(bfqg); |
3961 |
++#endif |
3962 |
++} |
3963 |
++ |
3964 |
++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
3965 |
++{ |
3966 |
++ if (bfqq == bfqd->in_service_queue) { |
3967 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
3968 |
++ bfq_schedule_dispatch(bfqd); |
3969 |
++ } |
3970 |
++ |
3971 |
++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, |
3972 |
++ atomic_read(&bfqq->ref)); |
3973 |
++ |
3974 |
++ bfq_put_queue(bfqq); |
3975 |
++} |
3976 |
++ |
3977 |
++static void bfq_init_icq(struct io_cq *icq) |
3978 |
++{ |
3979 |
++ struct bfq_io_cq *bic = icq_to_bic(icq); |
3980 |
++ |
3981 |
++ bic->ttime.last_end_request = jiffies; |
3982 |
++} |
3983 |
++ |
3984 |
++static void bfq_exit_icq(struct io_cq *icq) |
3985 |
++{ |
3986 |
++ struct bfq_io_cq *bic = icq_to_bic(icq); |
3987 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
3988 |
++ |
3989 |
++ if (bic->bfqq[BLK_RW_ASYNC]) { |
3990 |
++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); |
3991 |
++ bic->bfqq[BLK_RW_ASYNC] = NULL; |
3992 |
++ } |
3993 |
++ |
3994 |
++ if (bic->bfqq[BLK_RW_SYNC]) { |
3995 |
++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); |
3996 |
++ bic->bfqq[BLK_RW_SYNC] = NULL; |
3997 |
++ } |
3998 |
++} |
3999 |
++ |
4000 |
++/* |
4001 |
++ * Update the entity prio values; note that the new values will not |
4002 |
++ * be used until the next (re)activation. |
4003 |
++ */ |
4004 |
++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
4005 |
++{ |
4006 |
++ struct task_struct *tsk = current; |
4007 |
++ int ioprio_class; |
4008 |
++ |
4009 |
++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
4010 |
++ switch (ioprio_class) { |
4011 |
++ default: |
4012 |
++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev, |
4013 |
++ "bfq: bad prio class %d\n", ioprio_class); |
4014 |
++ case IOPRIO_CLASS_NONE: |
4015 |
++ /* |
4016 |
++ * No prio set, inherit CPU scheduling settings. |
4017 |
++ */ |
4018 |
++ bfqq->new_ioprio = task_nice_ioprio(tsk); |
4019 |
++ bfqq->new_ioprio_class = task_nice_ioclass(tsk); |
4020 |
++ break; |
4021 |
++ case IOPRIO_CLASS_RT: |
4022 |
++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
4023 |
++ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; |
4024 |
++ break; |
4025 |
++ case IOPRIO_CLASS_BE: |
4026 |
++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
4027 |
++ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; |
4028 |
++ break; |
4029 |
++ case IOPRIO_CLASS_IDLE: |
4030 |
++ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; |
4031 |
++ bfqq->new_ioprio = 7; |
4032 |
++ bfq_clear_bfqq_idle_window(bfqq); |
4033 |
++ break; |
4034 |
++ } |
4035 |
++ |
4036 |
++ if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { |
4037 |
++ printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", |
4038 |
++ bfqq->new_ioprio); |
4039 |
++ BUG(); |
4040 |
++ } |
4041 |
++ |
4042 |
++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); |
4043 |
++ bfqq->entity.prio_changed = 1; |
4044 |
++} |
4045 |
++ |
4046 |
++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) |
4047 |
++{ |
4048 |
++ struct bfq_data *bfqd; |
4049 |
++ struct bfq_queue *bfqq, *new_bfqq; |
4050 |
++ unsigned long uninitialized_var(flags); |
4051 |
++ int ioprio = bic->icq.ioc->ioprio; |
4052 |
++ |
4053 |
++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), |
4054 |
++ &flags); |
4055 |
++ /* |
4056 |
++ * This condition may trigger on a newly created bic, be sure to |
4057 |
++ * drop the lock before returning. |
4058 |
++ */ |
4059 |
++ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) |
4060 |
++ goto out; |
4061 |
++ |
4062 |
++ bic->ioprio = ioprio; |
4063 |
++ |
4064 |
++ bfqq = bic->bfqq[BLK_RW_ASYNC]; |
4065 |
++ if (bfqq) { |
4066 |
++ new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, |
4067 |
++ GFP_ATOMIC); |
4068 |
++ if (new_bfqq) { |
4069 |
++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; |
4070 |
++ bfq_log_bfqq(bfqd, bfqq, |
4071 |
++ "check_ioprio_change: bfqq %p %d", |
4072 |
++ bfqq, atomic_read(&bfqq->ref)); |
4073 |
++ bfq_put_queue(bfqq); |
4074 |
++ } |
4075 |
++ } |
4076 |
++ |
4077 |
++ bfqq = bic->bfqq[BLK_RW_SYNC]; |
4078 |
++ if (bfqq) |
4079 |
++ bfq_set_next_ioprio_data(bfqq, bic); |
4080 |
++ |
4081 |
++out: |
4082 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
4083 |
++} |
4084 |
++ |
4085 |
++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
4086 |
++ struct bfq_io_cq *bic, pid_t pid, int is_sync) |
4087 |
++{ |
4088 |
++ RB_CLEAR_NODE(&bfqq->entity.rb_node); |
4089 |
++ INIT_LIST_HEAD(&bfqq->fifo); |
4090 |
++ INIT_HLIST_NODE(&bfqq->burst_list_node); |
4091 |
++ |
4092 |
++ atomic_set(&bfqq->ref, 0); |
4093 |
++ bfqq->bfqd = bfqd; |
4094 |
++ |
4095 |
++ if (bic) |
4096 |
++ bfq_set_next_ioprio_data(bfqq, bic); |
4097 |
++ |
4098 |
++ if (is_sync) { |
4099 |
++ if (!bfq_class_idle(bfqq)) |
4100 |
++ bfq_mark_bfqq_idle_window(bfqq); |
4101 |
++ bfq_mark_bfqq_sync(bfqq); |
4102 |
++ } else |
4103 |
++ bfq_clear_bfqq_sync(bfqq); |
4104 |
++ bfq_mark_bfqq_IO_bound(bfqq); |
4105 |
++ |
4106 |
++ /* Tentative initial value to trade off between thr and lat */ |
4107 |
++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; |
4108 |
++ bfqq->pid = pid; |
4109 |
++ |
4110 |
++ bfqq->wr_coeff = 1; |
4111 |
++ bfqq->last_wr_start_finish = 0; |
4112 |
++ /* |
4113 |
++ * Set to the value for which bfqq will not be deemed as |
4114 |
++ * soft rt when it becomes backlogged. |
4115 |
++ */ |
4116 |
++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); |
4117 |
++} |
4118 |
++ |
4119 |
++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, |
4120 |
++ struct bio *bio, int is_sync, |
4121 |
++ struct bfq_io_cq *bic, |
4122 |
++ gfp_t gfp_mask) |
4123 |
++{ |
4124 |
++ struct bfq_group *bfqg; |
4125 |
++ struct bfq_queue *bfqq, *new_bfqq = NULL; |
4126 |
++ struct blkcg *blkcg; |
4127 |
++ |
4128 |
++retry: |
4129 |
++ rcu_read_lock(); |
4130 |
++ |
4131 |
++ blkcg = bio_blkcg(bio); |
4132 |
++ bfqg = bfq_find_alloc_group(bfqd, blkcg); |
4133 |
++ /* bic always exists here */ |
4134 |
++ bfqq = bic_to_bfqq(bic, is_sync); |
4135 |
++ |
4136 |
++ /* |
4137 |
++ * Always try a new alloc if we fall back to the OOM bfqq |
4138 |
++ * originally, since it should just be a temporary situation. |
4139 |
++ */ |
4140 |
++ if (!bfqq || bfqq == &bfqd->oom_bfqq) { |
4141 |
++ bfqq = NULL; |
4142 |
++ if (new_bfqq) { |
4143 |
++ bfqq = new_bfqq; |
4144 |
++ new_bfqq = NULL; |
4145 |
++ } else if (gfpflags_allow_blocking(gfp_mask)) { |
4146 |
++ rcu_read_unlock(); |
4147 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
4148 |
++ new_bfqq = kmem_cache_alloc_node(bfq_pool, |
4149 |
++ gfp_mask | __GFP_ZERO, |
4150 |
++ bfqd->queue->node); |
4151 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
4152 |
++ if (new_bfqq) |
4153 |
++ goto retry; |
4154 |
++ } else { |
4155 |
++ bfqq = kmem_cache_alloc_node(bfq_pool, |
4156 |
++ gfp_mask | __GFP_ZERO, |
4157 |
++ bfqd->queue->node); |
4158 |
++ } |
4159 |
++ |
4160 |
++ if (bfqq) { |
4161 |
++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, |
4162 |
++ is_sync); |
4163 |
++ bfq_init_entity(&bfqq->entity, bfqg); |
4164 |
++ bfq_log_bfqq(bfqd, bfqq, "allocated"); |
4165 |
++ } else { |
4166 |
++ bfqq = &bfqd->oom_bfqq; |
4167 |
++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); |
4168 |
++ } |
4169 |
++ } |
4170 |
++ |
4171 |
++ if (new_bfqq) |
4172 |
++ kmem_cache_free(bfq_pool, new_bfqq); |
4173 |
++ |
4174 |
++ rcu_read_unlock(); |
4175 |
++ |
4176 |
++ return bfqq; |
4177 |
++} |
4178 |
++ |
4179 |
++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, |
4180 |
++ struct bfq_group *bfqg, |
4181 |
++ int ioprio_class, int ioprio) |
4182 |
++{ |
4183 |
++ switch (ioprio_class) { |
4184 |
++ case IOPRIO_CLASS_RT: |
4185 |
++ return &bfqg->async_bfqq[0][ioprio]; |
4186 |
++ case IOPRIO_CLASS_NONE: |
4187 |
++ ioprio = IOPRIO_NORM; |
4188 |
++ /* fall through */ |
4189 |
++ case IOPRIO_CLASS_BE: |
4190 |
++ return &bfqg->async_bfqq[1][ioprio]; |
4191 |
++ case IOPRIO_CLASS_IDLE: |
4192 |
++ return &bfqg->async_idle_bfqq; |
4193 |
++ default: |
4194 |
++ BUG(); |
4195 |
++ } |
4196 |
++} |
4197 |
++ |
4198 |
++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
4199 |
++ struct bio *bio, int is_sync, |
4200 |
++ struct bfq_io_cq *bic, gfp_t gfp_mask) |
4201 |
++{ |
4202 |
++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
4203 |
++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
4204 |
++ struct bfq_queue **async_bfqq = NULL; |
4205 |
++ struct bfq_queue *bfqq = NULL; |
4206 |
++ |
4207 |
++ if (!is_sync) { |
4208 |
++ struct blkcg *blkcg; |
4209 |
++ struct bfq_group *bfqg; |
4210 |
++ |
4211 |
++ rcu_read_lock(); |
4212 |
++ blkcg = bio_blkcg(bio); |
4213 |
++ rcu_read_unlock(); |
4214 |
++ bfqg = bfq_find_alloc_group(bfqd, blkcg); |
4215 |
++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, |
4216 |
++ ioprio); |
4217 |
++ bfqq = *async_bfqq; |
4218 |
++ } |
4219 |
++ |
4220 |
++ if (!bfqq) |
4221 |
++ bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); |
4222 |
++ |
4223 |
++ /* |
4224 |
++ * Pin the queue now that it's allocated, scheduler exit will |
4225 |
++ * prune it. |
4226 |
++ */ |
4227 |
++ if (!is_sync && !(*async_bfqq)) { |
4228 |
++ atomic_inc(&bfqq->ref); |
4229 |
++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", |
4230 |
++ bfqq, atomic_read(&bfqq->ref)); |
4231 |
++ *async_bfqq = bfqq; |
4232 |
++ } |
4233 |
++ |
4234 |
++ atomic_inc(&bfqq->ref); |
4235 |
++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, |
4236 |
++ atomic_read(&bfqq->ref)); |
4237 |
++ return bfqq; |
4238 |
++} |
4239 |
++ |
4240 |
++static void bfq_update_io_thinktime(struct bfq_data *bfqd, |
4241 |
++ struct bfq_io_cq *bic) |
4242 |
++{ |
4243 |
++ unsigned long elapsed = jiffies - bic->ttime.last_end_request; |
4244 |
++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); |
4245 |
++ |
4246 |
++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; |
4247 |
++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; |
4248 |
++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / |
4249 |
++ bic->ttime.ttime_samples; |
4250 |
++} |
4251 |
++ |
4252 |
++static void bfq_update_io_seektime(struct bfq_data *bfqd, |
4253 |
++ struct bfq_queue *bfqq, |
4254 |
++ struct request *rq) |
4255 |
++{ |
4256 |
++ sector_t sdist; |
4257 |
++ u64 total; |
4258 |
++ |
4259 |
++ if (bfqq->last_request_pos < blk_rq_pos(rq)) |
4260 |
++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; |
4261 |
++ else |
4262 |
++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); |
4263 |
++ |
4264 |
++ /* |
4265 |
++ * Don't allow the seek distance to get too large from the |
4266 |
++ * odd fragment, pagein, etc. |
4267 |
++ */ |
4268 |
++ if (bfqq->seek_samples == 0) /* first request, not really a seek */ |
4269 |
++ sdist = 0; |
4270 |
++ else if (bfqq->seek_samples <= 60) /* second & third seek */ |
4271 |
++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); |
4272 |
++ else |
4273 |
++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); |
4274 |
++ |
4275 |
++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; |
4276 |
++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; |
4277 |
++ total = bfqq->seek_total + (bfqq->seek_samples/2); |
4278 |
++ do_div(total, bfqq->seek_samples); |
4279 |
++ bfqq->seek_mean = (sector_t)total; |
4280 |
++ |
4281 |
++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, |
4282 |
++ (u64)bfqq->seek_mean); |
4283 |
++} |
4284 |
++ |
4285 |
++/* |
4286 |
++ * Disable idle window if the process thinks too long or seeks so much that |
4287 |
++ * it doesn't matter. |
4288 |
++ */ |
4289 |
++static void bfq_update_idle_window(struct bfq_data *bfqd, |
4290 |
++ struct bfq_queue *bfqq, |
4291 |
++ struct bfq_io_cq *bic) |
4292 |
++{ |
4293 |
++ int enable_idle; |
4294 |
++ |
4295 |
++ /* Don't idle for async or idle io prio class. */ |
4296 |
++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) |
4297 |
++ return; |
4298 |
++ |
4299 |
++ enable_idle = bfq_bfqq_idle_window(bfqq); |
4300 |
++ |
4301 |
++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || |
4302 |
++ bfqd->bfq_slice_idle == 0 || |
4303 |
++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && |
4304 |
++ bfqq->wr_coeff == 1)) |
4305 |
++ enable_idle = 0; |
4306 |
++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { |
4307 |
++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && |
4308 |
++ bfqq->wr_coeff == 1) |
4309 |
++ enable_idle = 0; |
4310 |
++ else |
4311 |
++ enable_idle = 1; |
4312 |
++ } |
4313 |
++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", |
4314 |
++ enable_idle); |
4315 |
++ |
4316 |
++ if (enable_idle) |
4317 |
++ bfq_mark_bfqq_idle_window(bfqq); |
4318 |
++ else |
4319 |
++ bfq_clear_bfqq_idle_window(bfqq); |
4320 |
++} |
4321 |
++ |
4322 |
++/* |
4323 |
++ * Called when a new fs request (rq) is added to bfqq. Check if there's |
4324 |
++ * something we should do about it. |
4325 |
++ */ |
4326 |
++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
4327 |
++ struct request *rq) |
4328 |
++{ |
4329 |
++ struct bfq_io_cq *bic = RQ_BIC(rq); |
4330 |
++ |
4331 |
++ if (rq->cmd_flags & REQ_META) |
4332 |
++ bfqq->meta_pending++; |
4333 |
++ |
4334 |
++ bfq_update_io_thinktime(bfqd, bic); |
4335 |
++ bfq_update_io_seektime(bfqd, bfqq, rq); |
4336 |
++ if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { |
4337 |
++ bfq_clear_bfqq_constantly_seeky(bfqq); |
4338 |
++ if (!blk_queue_nonrot(bfqd->queue)) { |
4339 |
++ BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); |
4340 |
++ bfqd->const_seeky_busy_in_flight_queues--; |
4341 |
++ } |
4342 |
++ } |
4343 |
++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
4344 |
++ !BFQQ_SEEKY(bfqq)) |
4345 |
++ bfq_update_idle_window(bfqd, bfqq, bic); |
4346 |
++ |
4347 |
++ bfq_log_bfqq(bfqd, bfqq, |
4348 |
++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", |
4349 |
++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), |
4350 |
++ (long long unsigned)bfqq->seek_mean); |
4351 |
++ |
4352 |
++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); |
4353 |
++ |
4354 |
++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { |
4355 |
++ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && |
4356 |
++ blk_rq_sectors(rq) < 32; |
4357 |
++ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); |
4358 |
++ |
4359 |
++ /* |
4360 |
++ * There is just this request queued: if the request |
4361 |
++ * is small and the queue is not to be expired, then |
4362 |
++ * just exit. |
4363 |
++ * |
4364 |
++ * In this way, if the disk is being idled to wait for |
4365 |
++ * a new request from the in-service queue, we avoid |
4366 |
++ * unplugging the device and committing the disk to serve |
4367 |
++ * just a small request. On the contrary, we wait for |
4368 |
++ * the block layer to decide when to unplug the device: |
4369 |
++ * hopefully, new requests will be merged to this one |
4370 |
++ * quickly, then the device will be unplugged and |
4371 |
++ * larger requests will be dispatched. |
4372 |
++ */ |
4373 |
++ if (small_req && !budget_timeout) |
4374 |
++ return; |
4375 |
++ |
4376 |
++ /* |
4377 |
++ * A large enough request arrived, or the queue is to |
4378 |
++ * be expired: in both cases disk idling is to be |
4379 |
++ * stopped, so clear wait_request flag and reset |
4380 |
++ * timer. |
4381 |
++ */ |
4382 |
++ bfq_clear_bfqq_wait_request(bfqq); |
4383 |
++ del_timer(&bfqd->idle_slice_timer); |
4384 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
4385 |
++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); |
4386 |
++#endif |
4387 |
++ |
4388 |
++ /* |
4389 |
++ * The queue is not empty, because a new request just |
4390 |
++ * arrived. Hence we can safely expire the queue, in |
4391 |
++ * case of budget timeout, without risking that the |
4392 |
++ * timestamps of the queue are not updated correctly. |
4393 |
++ * See [1] for more details. |
4394 |
++ */ |
4395 |
++ if (budget_timeout) |
4396 |
++ bfq_bfqq_expire(bfqd, bfqq, false, |
4397 |
++ BFQ_BFQQ_BUDGET_TIMEOUT); |
4398 |
++ |
4399 |
++ /* |
4400 |
++ * Let the request rip immediately, or let a new queue be |
4401 |
++ * selected if bfqq has just been expired. |
4402 |
++ */ |
4403 |
++ __blk_run_queue(bfqd->queue); |
4404 |
++ } |
4405 |
++} |
4406 |
++ |
4407 |
++static void bfq_insert_request(struct request_queue *q, struct request *rq) |
4408 |
++{ |
4409 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
4410 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
4411 |
++ |
4412 |
++ assert_spin_locked(bfqd->queue->queue_lock); |
4413 |
++ |
4414 |
++ bfq_add_request(rq); |
4415 |
++ |
4416 |
++ rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; |
4417 |
++ list_add_tail(&rq->queuelist, &bfqq->fifo); |
4418 |
++ |
4419 |
++ bfq_rq_enqueued(bfqd, bfqq, rq); |
4420 |
++} |
4421 |
++ |
4422 |
++static void bfq_update_hw_tag(struct bfq_data *bfqd) |
4423 |
++{ |
4424 |
++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, |
4425 |
++ bfqd->rq_in_driver); |
4426 |
++ |
4427 |
++ if (bfqd->hw_tag == 1) |
4428 |
++ return; |
4429 |
++ |
4430 |
++ /* |
4431 |
++ * This sample is valid if the number of outstanding requests |
4432 |
++ * is large enough to allow a queueing behavior. Note that the |
4433 |
++ * sum is not exact, as it's not taking into account deactivated |
4434 |
++ * requests. |
4435 |
++ */ |
4436 |
++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) |
4437 |
++ return; |
4438 |
++ |
4439 |
++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) |
4440 |
++ return; |
4441 |
++ |
4442 |
++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; |
4443 |
++ bfqd->max_rq_in_driver = 0; |
4444 |
++ bfqd->hw_tag_samples = 0; |
4445 |
++} |
4446 |
++ |
4447 |
++static void bfq_completed_request(struct request_queue *q, struct request *rq) |
4448 |
++{ |
4449 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
4450 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
4451 |
++ bool sync = bfq_bfqq_sync(bfqq); |
4452 |
++ |
4453 |
++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", |
4454 |
++ blk_rq_sectors(rq), sync); |
4455 |
++ |
4456 |
++ bfq_update_hw_tag(bfqd); |
4457 |
++ |
4458 |
++ BUG_ON(!bfqd->rq_in_driver); |
4459 |
++ BUG_ON(!bfqq->dispatched); |
4460 |
++ bfqd->rq_in_driver--; |
4461 |
++ bfqq->dispatched--; |
4462 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
4463 |
++ bfqg_stats_update_completion(bfqq_group(bfqq), |
4464 |
++ rq_start_time_ns(rq), |
4465 |
++ rq_io_start_time_ns(rq), rq->cmd_flags); |
4466 |
++#endif |
4467 |
++ |
4468 |
++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { |
4469 |
++ bfq_weights_tree_remove(bfqd, &bfqq->entity, |
4470 |
++ &bfqd->queue_weights_tree); |
4471 |
++ if (!blk_queue_nonrot(bfqd->queue)) { |
4472 |
++ BUG_ON(!bfqd->busy_in_flight_queues); |
4473 |
++ bfqd->busy_in_flight_queues--; |
4474 |
++ if (bfq_bfqq_constantly_seeky(bfqq)) { |
4475 |
++ BUG_ON(!bfqd-> |
4476 |
++ const_seeky_busy_in_flight_queues); |
4477 |
++ bfqd->const_seeky_busy_in_flight_queues--; |
4478 |
++ } |
4479 |
++ } |
4480 |
++ } |
4481 |
++ |
4482 |
++ if (sync) { |
4483 |
++ bfqd->sync_flight--; |
4484 |
++ RQ_BIC(rq)->ttime.last_end_request = jiffies; |
4485 |
++ } |
4486 |
++ |
4487 |
++ /* |
4488 |
++ * If we are waiting to discover whether the request pattern of the |
4489 |
++ * task associated with the queue is actually isochronous, and |
4490 |
++ * both requisites for this condition to hold are satisfied, then |
4491 |
++ * compute soft_rt_next_start (see the comments to the function |
4492 |
++ * bfq_bfqq_softrt_next_start()). |
4493 |
++ */ |
4494 |
++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && |
4495 |
++ RB_EMPTY_ROOT(&bfqq->sort_list)) |
4496 |
++ bfqq->soft_rt_next_start = |
4497 |
++ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
4498 |
++ |
4499 |
++ /* |
4500 |
++ * If this is the in-service queue, check if it needs to be expired, |
4501 |
++ * or if we want to idle in case it has no pending requests. |
4502 |
++ */ |
4503 |
++ if (bfqd->in_service_queue == bfqq) { |
4504 |
++ if (bfq_bfqq_budget_new(bfqq)) |
4505 |
++ bfq_set_budget_timeout(bfqd); |
4506 |
++ |
4507 |
++ if (bfq_bfqq_must_idle(bfqq)) { |
4508 |
++ bfq_arm_slice_timer(bfqd); |
4509 |
++ goto out; |
4510 |
++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) |
4511 |
++ bfq_bfqq_expire(bfqd, bfqq, false, |
4512 |
++ BFQ_BFQQ_BUDGET_TIMEOUT); |
4513 |
++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && |
4514 |
++ (bfqq->dispatched == 0 || |
4515 |
++ !bfq_bfqq_may_idle(bfqq))) |
4516 |
++ bfq_bfqq_expire(bfqd, bfqq, false, |
4517 |
++ BFQ_BFQQ_NO_MORE_REQUESTS); |
4518 |
++ } |
4519 |
++ |
4520 |
++ if (!bfqd->rq_in_driver) |
4521 |
++ bfq_schedule_dispatch(bfqd); |
4522 |
++ |
4523 |
++out: |
4524 |
++ return; |
4525 |
++} |
4526 |
++ |
4527 |
++static int __bfq_may_queue(struct bfq_queue *bfqq) |
4528 |
++{ |
4529 |
++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { |
4530 |
++ bfq_clear_bfqq_must_alloc(bfqq); |
4531 |
++ return ELV_MQUEUE_MUST; |
4532 |
++ } |
4533 |
++ |
4534 |
++ return ELV_MQUEUE_MAY; |
4535 |
++} |
4536 |
++ |
4537 |
++static int bfq_may_queue(struct request_queue *q, int rw) |
4538 |
++{ |
4539 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
4540 |
++ struct task_struct *tsk = current; |
4541 |
++ struct bfq_io_cq *bic; |
4542 |
++ struct bfq_queue *bfqq; |
4543 |
++ |
4544 |
++ /* |
4545 |
++ * Don't force setup of a queue from here, as a call to may_queue |
4546 |
++ * does not necessarily imply that a request actually will be |
4547 |
++ * queued. So just lookup a possibly existing queue, or return |
4548 |
++ * 'may queue' if that fails. |
4549 |
++ */ |
4550 |
++ bic = bfq_bic_lookup(bfqd, tsk->io_context); |
4551 |
++ if (!bic) |
4552 |
++ return ELV_MQUEUE_MAY; |
4553 |
++ |
4554 |
++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); |
4555 |
++ if (bfqq) |
4556 |
++ return __bfq_may_queue(bfqq); |
4557 |
++ |
4558 |
++ return ELV_MQUEUE_MAY; |
4559 |
++} |
4560 |
++ |
4561 |
++/* |
4562 |
++ * Queue lock held here. |
4563 |
++ */ |
4564 |
++static void bfq_put_request(struct request *rq) |
4565 |
++{ |
4566 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
4567 |
++ |
4568 |
++ if (bfqq) { |
4569 |
++ const int rw = rq_data_dir(rq); |
4570 |
++ |
4571 |
++ BUG_ON(!bfqq->allocated[rw]); |
4572 |
++ bfqq->allocated[rw]--; |
4573 |
++ |
4574 |
++ rq->elv.priv[0] = NULL; |
4575 |
++ rq->elv.priv[1] = NULL; |
4576 |
++ |
4577 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", |
4578 |
++ bfqq, atomic_read(&bfqq->ref)); |
4579 |
++ bfq_put_queue(bfqq); |
4580 |
++ } |
4581 |
++} |
4582 |
++ |
4583 |
++/* |
4584 |
++ * Allocate bfq data structures associated with this request. |
4585 |
++ */ |
4586 |
++static int bfq_set_request(struct request_queue *q, struct request *rq, |
4587 |
++ struct bio *bio, gfp_t gfp_mask) |
4588 |
++{ |
4589 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
4590 |
++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); |
4591 |
++ const int rw = rq_data_dir(rq); |
4592 |
++ const int is_sync = rq_is_sync(rq); |
4593 |
++ struct bfq_queue *bfqq; |
4594 |
++ unsigned long flags; |
4595 |
++ |
4596 |
++ might_sleep_if(gfpflags_allow_blocking(gfp_mask)); |
4597 |
++ |
4598 |
++ bfq_check_ioprio_change(bic, bio); |
4599 |
++ |
4600 |
++ spin_lock_irqsave(q->queue_lock, flags); |
4601 |
++ |
4602 |
++ if (!bic) |
4603 |
++ goto queue_fail; |
4604 |
++ |
4605 |
++ bfq_bic_update_cgroup(bic, bio); |
4606 |
++ |
4607 |
++ bfqq = bic_to_bfqq(bic, is_sync); |
4608 |
++ if (!bfqq || bfqq == &bfqd->oom_bfqq) { |
4609 |
++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); |
4610 |
++ bic_set_bfqq(bic, bfqq, is_sync); |
4611 |
++ if (is_sync) { |
4612 |
++ if (bfqd->large_burst) |
4613 |
++ bfq_mark_bfqq_in_large_burst(bfqq); |
4614 |
++ else |
4615 |
++ bfq_clear_bfqq_in_large_burst(bfqq); |
4616 |
++ } |
4617 |
++ } |
4618 |
++ |
4619 |
++ bfqq->allocated[rw]++; |
4620 |
++ atomic_inc(&bfqq->ref); |
4621 |
++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, |
4622 |
++ atomic_read(&bfqq->ref)); |
4623 |
++ |
4624 |
++ rq->elv.priv[0] = bic; |
4625 |
++ rq->elv.priv[1] = bfqq; |
4626 |
++ |
4627 |
++ spin_unlock_irqrestore(q->queue_lock, flags); |
4628 |
++ |
4629 |
++ return 0; |
4630 |
++ |
4631 |
++queue_fail: |
4632 |
++ bfq_schedule_dispatch(bfqd); |
4633 |
++ spin_unlock_irqrestore(q->queue_lock, flags); |
4634 |
++ |
4635 |
++ return 1; |
4636 |
++} |
4637 |
++ |
4638 |
++static void bfq_kick_queue(struct work_struct *work) |
4639 |
++{ |
4640 |
++ struct bfq_data *bfqd = |
4641 |
++ container_of(work, struct bfq_data, unplug_work); |
4642 |
++ struct request_queue *q = bfqd->queue; |
4643 |
++ |
4644 |
++ spin_lock_irq(q->queue_lock); |
4645 |
++ __blk_run_queue(q); |
4646 |
++ spin_unlock_irq(q->queue_lock); |
4647 |
++} |
4648 |
++ |
4649 |
++/* |
4650 |
++ * Handler of the expiration of the timer running if the in-service queue |
4651 |
++ * is idling inside its time slice. |
4652 |
++ */ |
4653 |
++static void bfq_idle_slice_timer(unsigned long data) |
4654 |
++{ |
4655 |
++ struct bfq_data *bfqd = (struct bfq_data *)data; |
4656 |
++ struct bfq_queue *bfqq; |
4657 |
++ unsigned long flags; |
4658 |
++ enum bfqq_expiration reason; |
4659 |
++ |
4660 |
++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); |
4661 |
++ |
4662 |
++ bfqq = bfqd->in_service_queue; |
4663 |
++ /* |
4664 |
++ * Theoretical race here: the in-service queue can be NULL or |
4665 |
++ * different from the queue that was idling if the timer handler |
4666 |
++ * spins on the queue_lock and a new request arrives for the |
4667 |
++ * current queue and there is a full dispatch cycle that changes |
4668 |
++ * the in-service queue. This can hardly happen, but in the worst |
4669 |
++ * case we just expire a queue too early. |
4670 |
++ */ |
4671 |
++ if (bfqq) { |
4672 |
++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); |
4673 |
++ if (bfq_bfqq_budget_timeout(bfqq)) |
4674 |
++ /* |
4675 |
++ * Also here the queue can be safely expired |
4676 |
++ * for budget timeout without wasting |
4677 |
++ * guarantees |
4678 |
++ */ |
4679 |
++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
4680 |
++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) |
4681 |
++ /* |
4682 |
++ * The queue may not be empty upon timer expiration, |
4683 |
++ * because we may not disable the timer when the |
4684 |
++ * first request of the in-service queue arrives |
4685 |
++ * during disk idling. |
4686 |
++ */ |
4687 |
++ reason = BFQ_BFQQ_TOO_IDLE; |
4688 |
++ else |
4689 |
++ goto schedule_dispatch; |
4690 |
++ |
4691 |
++ bfq_bfqq_expire(bfqd, bfqq, true, reason); |
4692 |
++ } |
4693 |
++ |
4694 |
++schedule_dispatch: |
4695 |
++ bfq_schedule_dispatch(bfqd); |
4696 |
++ |
4697 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); |
4698 |
++} |
4699 |
++ |
4700 |
++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) |
4701 |
++{ |
4702 |
++ del_timer_sync(&bfqd->idle_slice_timer); |
4703 |
++ cancel_work_sync(&bfqd->unplug_work); |
4704 |
++} |
4705 |
++ |
4706 |
++static void __bfq_put_async_bfqq(struct bfq_data *bfqd, |
4707 |
++ struct bfq_queue **bfqq_ptr) |
4708 |
++{ |
4709 |
++ struct bfq_group *root_group = bfqd->root_group; |
4710 |
++ struct bfq_queue *bfqq = *bfqq_ptr; |
4711 |
++ |
4712 |
++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); |
4713 |
++ if (bfqq) { |
4714 |
++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); |
4715 |
++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", |
4716 |
++ bfqq, atomic_read(&bfqq->ref)); |
4717 |
++ bfq_put_queue(bfqq); |
4718 |
++ *bfqq_ptr = NULL; |
4719 |
++ } |
4720 |
++} |
4721 |
++ |
4722 |
++/* |
4723 |
++ * Release all the bfqg references to its async queues. If we are |
4724 |
++ * deallocating the group these queues may still contain requests, so |
4725 |
++ * we reparent them to the root cgroup (i.e., the only one that will |
4726 |
++ * exist for sure until all the requests on a device are gone). |
4727 |
++ */ |
4728 |
++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) |
4729 |
++{ |
4730 |
++ int i, j; |
4731 |
++ |
4732 |
++ for (i = 0; i < 2; i++) |
4733 |
++ for (j = 0; j < IOPRIO_BE_NR; j++) |
4734 |
++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); |
4735 |
++ |
4736 |
++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); |
4737 |
++} |
4738 |
++ |
4739 |
++static void bfq_exit_queue(struct elevator_queue *e) |
4740 |
++{ |
4741 |
++ struct bfq_data *bfqd = e->elevator_data; |
4742 |
++ struct request_queue *q = bfqd->queue; |
4743 |
++ struct bfq_queue *bfqq, *n; |
4744 |
++ |
4745 |
++ bfq_shutdown_timer_wq(bfqd); |
4746 |
++ |
4747 |
++ spin_lock_irq(q->queue_lock); |
4748 |
++ |
4749 |
++ BUG_ON(bfqd->in_service_queue); |
4750 |
++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) |
4751 |
++ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
4752 |
++ |
4753 |
++ spin_unlock_irq(q->queue_lock); |
4754 |
++ |
4755 |
++ bfq_shutdown_timer_wq(bfqd); |
4756 |
++ |
4757 |
++ synchronize_rcu(); |
4758 |
++ |
4759 |
++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
4760 |
++ |
4761 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
4762 |
++ blkcg_deactivate_policy(q, &blkcg_policy_bfq); |
4763 |
++#else |
4764 |
++ kfree(bfqd->root_group); |
4765 |
++#endif |
4766 |
++ |
4767 |
++ kfree(bfqd); |
4768 |
++} |
4769 |
++ |
4770 |
++static void bfq_init_root_group(struct bfq_group *root_group, |
4771 |
++ struct bfq_data *bfqd) |
4772 |
++{ |
4773 |
++ int i; |
4774 |
++ |
4775 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
4776 |
++ root_group->entity.parent = NULL; |
4777 |
++ root_group->my_entity = NULL; |
4778 |
++ root_group->bfqd = bfqd; |
4779 |
++#endif |
4780 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
4781 |
++ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
4782 |
++} |
4783 |
++ |
4784 |
++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
4785 |
++{ |
4786 |
++ struct bfq_data *bfqd; |
4787 |
++ struct elevator_queue *eq; |
4788 |
++ |
4789 |
++ eq = elevator_alloc(q, e); |
4790 |
++ if (!eq) |
4791 |
++ return -ENOMEM; |
4792 |
++ |
4793 |
++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); |
4794 |
++ if (!bfqd) { |
4795 |
++ kobject_put(&eq->kobj); |
4796 |
++ return -ENOMEM; |
4797 |
++ } |
4798 |
++ eq->elevator_data = bfqd; |
4799 |
++ |
4800 |
++ /* |
4801 |
++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. |
4802 |
++ * Grab a permanent reference to it, so that the normal code flow |
4803 |
++ * will not attempt to free it. |
4804 |
++ */ |
4805 |
++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); |
4806 |
++ atomic_inc(&bfqd->oom_bfqq.ref); |
4807 |
++ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; |
4808 |
++ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; |
4809 |
++ bfqd->oom_bfqq.entity.new_weight = |
4810 |
++ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); |
4811 |
++ /* |
4812 |
++ * Trigger weight initialization, according to ioprio, at the |
4813 |
++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio |
4814 |
++ * class won't be changed any more. |
4815 |
++ */ |
4816 |
++ bfqd->oom_bfqq.entity.prio_changed = 1; |
4817 |
++ |
4818 |
++ bfqd->queue = q; |
4819 |
++ |
4820 |
++ spin_lock_irq(q->queue_lock); |
4821 |
++ q->elevator = eq; |
4822 |
++ spin_unlock_irq(q->queue_lock); |
4823 |
++ |
4824 |
++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); |
4825 |
++ if (!bfqd->root_group) |
4826 |
++ goto out_free; |
4827 |
++ bfq_init_root_group(bfqd->root_group, bfqd); |
4828 |
++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); |
4829 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
4830 |
++ bfqd->active_numerous_groups = 0; |
4831 |
++#endif |
4832 |
++ |
4833 |
++ init_timer(&bfqd->idle_slice_timer); |
4834 |
++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; |
4835 |
++ bfqd->idle_slice_timer.data = (unsigned long)bfqd; |
4836 |
++ |
4837 |
++ bfqd->queue_weights_tree = RB_ROOT; |
4838 |
++ bfqd->group_weights_tree = RB_ROOT; |
4839 |
++ |
4840 |
++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); |
4841 |
++ |
4842 |
++ INIT_LIST_HEAD(&bfqd->active_list); |
4843 |
++ INIT_LIST_HEAD(&bfqd->idle_list); |
4844 |
++ INIT_HLIST_HEAD(&bfqd->burst_list); |
4845 |
++ |
4846 |
++ bfqd->hw_tag = -1; |
4847 |
++ |
4848 |
++ bfqd->bfq_max_budget = bfq_default_max_budget; |
4849 |
++ |
4850 |
++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; |
4851 |
++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; |
4852 |
++ bfqd->bfq_back_max = bfq_back_max; |
4853 |
++ bfqd->bfq_back_penalty = bfq_back_penalty; |
4854 |
++ bfqd->bfq_slice_idle = bfq_slice_idle; |
4855 |
++ bfqd->bfq_class_idle_last_service = 0; |
4856 |
++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; |
4857 |
++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; |
4858 |
++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; |
4859 |
++ |
4860 |
++ bfqd->bfq_requests_within_timer = 120; |
4861 |
++ |
4862 |
++ bfqd->bfq_large_burst_thresh = 11; |
4863 |
++ bfqd->bfq_burst_interval = msecs_to_jiffies(500); |
4864 |
++ |
4865 |
++ bfqd->low_latency = true; |
4866 |
++ |
4867 |
++ bfqd->bfq_wr_coeff = 20; |
4868 |
++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); |
4869 |
++ bfqd->bfq_wr_max_time = 0; |
4870 |
++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); |
4871 |
++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); |
4872 |
++ bfqd->bfq_wr_max_softrt_rate = 7000; /* |
4873 |
++ * Approximate rate required |
4874 |
++ * to playback or record a |
4875 |
++ * high-definition compressed |
4876 |
++ * video. |
4877 |
++ */ |
4878 |
++ bfqd->wr_busy_queues = 0; |
4879 |
++ bfqd->busy_in_flight_queues = 0; |
4880 |
++ bfqd->const_seeky_busy_in_flight_queues = 0; |
4881 |
++ |
4882 |
++ /* |
4883 |
++ * Begin by assuming, optimistically, that the device peak rate is |
4884 |
++ * equal to the highest reference rate. |
4885 |
++ */ |
4886 |
++ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * |
4887 |
++ T_fast[blk_queue_nonrot(bfqd->queue)]; |
4888 |
++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; |
4889 |
++ bfqd->device_speed = BFQ_BFQD_FAST; |
4890 |
++ |
4891 |
++ return 0; |
4892 |
++ |
4893 |
++out_free: |
4894 |
++ kfree(bfqd); |
4895 |
++ kobject_put(&eq->kobj); |
4896 |
++ return -ENOMEM; |
4897 |
++} |
4898 |
++ |
4899 |
++static void bfq_slab_kill(void) |
4900 |
++{ |
4901 |
++ if (bfq_pool) |
4902 |
++ kmem_cache_destroy(bfq_pool); |
4903 |
++} |
4904 |
++ |
4905 |
++static int __init bfq_slab_setup(void) |
4906 |
++{ |
4907 |
++ bfq_pool = KMEM_CACHE(bfq_queue, 0); |
4908 |
++ if (!bfq_pool) |
4909 |
++ return -ENOMEM; |
4910 |
++ return 0; |
4911 |
++} |
4912 |
++ |
4913 |
/*
 * Format @var as an unsigned decimal number followed by a newline into
 * @page. Returns the number of characters written (excluding the
 * terminating NUL).
 *
 * The conversion is "%u" because @var is unsigned: with "%d" values
 * above INT_MAX would be shown as negative numbers.
 */
static ssize_t bfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}
4917 |
++ |
4918 |
++static ssize_t bfq_var_store(unsigned long *var, const char *page, |
4919 |
++ size_t count) |
4920 |
++{ |
4921 |
++ unsigned long new_val; |
4922 |
++ int ret = kstrtoul(page, 10, &new_val); |
4923 |
++ |
4924 |
++ if (ret == 0) |
4925 |
++ *var = new_val; |
4926 |
++ |
4927 |
++ return count; |
4928 |
++} |
4929 |
++ |
4930 |
++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) |
4931 |
++{ |
4932 |
++ struct bfq_data *bfqd = e->elevator_data; |
4933 |
++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? |
4934 |
++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : |
4935 |
++ jiffies_to_msecs(bfq_wr_duration(bfqd))); |
4936 |
++} |
4937 |
++ |
4938 |
++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) |
4939 |
++{ |
4940 |
++ struct bfq_queue *bfqq; |
4941 |
++ struct bfq_data *bfqd = e->elevator_data; |
4942 |
++ ssize_t num_char = 0; |
4943 |
++ |
4944 |
++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", |
4945 |
++ bfqd->queued); |
4946 |
++ |
4947 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
4948 |
++ |
4949 |
++ num_char += sprintf(page + num_char, "Active:\n"); |
4950 |
++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { |
4951 |
++ num_char += sprintf(page + num_char, |
4952 |
++ "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n", |
4953 |
++ bfqq->pid, |
4954 |
++ bfqq->entity.weight, |
4955 |
++ bfqq->queued[0], |
4956 |
++ bfqq->queued[1], |
4957 |
++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), |
4958 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time)); |
4959 |
++ } |
4960 |
++ |
4961 |
++ num_char += sprintf(page + num_char, "Idle:\n"); |
4962 |
++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { |
4963 |
++ num_char += sprintf(page + num_char, |
4964 |
++ "pid%d: weight %hu, dur %d/%u\n", |
4965 |
++ bfqq->pid, |
4966 |
++ bfqq->entity.weight, |
4967 |
++ jiffies_to_msecs(jiffies - |
4968 |
++ bfqq->last_wr_start_finish), |
4969 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time)); |
4970 |
++ } |
4971 |
++ |
4972 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
4973 |
++ |
4974 |
++ return num_char; |
4975 |
++} |
4976 |
++ |
4977 |
++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ |
4978 |
++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ |
4979 |
++{ \ |
4980 |
++ struct bfq_data *bfqd = e->elevator_data; \ |
4981 |
++ unsigned int __data = __VAR; \ |
4982 |
++ if (__CONV) \ |
4983 |
++ __data = jiffies_to_msecs(__data); \ |
4984 |
++ return bfq_var_show(__data, (page)); \ |
4985 |
++} |
4986 |
++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); |
4987 |
++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); |
4988 |
++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); |
4989 |
++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); |
4990 |
++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); |
4991 |
++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); |
4992 |
++SHOW_FUNCTION(bfq_max_budget_async_rq_show, |
4993 |
++ bfqd->bfq_max_budget_async_rq, 0); |
4994 |
++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); |
4995 |
++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); |
4996 |
++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); |
4997 |
++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); |
4998 |
++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); |
4999 |
++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); |
5000 |
++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, |
5001 |
++ 1); |
5002 |
++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); |
5003 |
++#undef SHOW_FUNCTION |
5004 |
++ |
5005 |
++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
5006 |
++static ssize_t \ |
5007 |
++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ |
5008 |
++{ \ |
5009 |
++ struct bfq_data *bfqd = e->elevator_data; \ |
5010 |
++ unsigned long uninitialized_var(__data); \ |
5011 |
++ int ret = bfq_var_store(&__data, (page), count); \ |
5012 |
++ if (__data < (MIN)) \ |
5013 |
++ __data = (MIN); \ |
5014 |
++ else if (__data > (MAX)) \ |
5015 |
++ __data = (MAX); \ |
5016 |
++ if (__CONV) \ |
5017 |
++ *(__PTR) = msecs_to_jiffies(__data); \ |
5018 |
++ else \ |
5019 |
++ *(__PTR) = __data; \ |
5020 |
++ return ret; \ |
5021 |
++} |
5022 |
++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, |
5023 |
++ INT_MAX, 1); |
5024 |
++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, |
5025 |
++ INT_MAX, 1); |
5026 |
++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); |
5027 |
++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, |
5028 |
++ INT_MAX, 0); |
5029 |
++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); |
5030 |
++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, |
5031 |
++ 1, INT_MAX, 0); |
5032 |
++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, |
5033 |
++ INT_MAX, 1); |
5034 |
++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); |
5035 |
++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); |
5036 |
++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, |
5037 |
++ 1); |
5038 |
++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, |
5039 |
++ INT_MAX, 1); |
5040 |
++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, |
5041 |
++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); |
5042 |
++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, |
5043 |
++ INT_MAX, 0); |
5044 |
++#undef STORE_FUNCTION |
5045 |
++ |
5046 |
/*
 * sysfs store handler for "weights". Writes are accepted and ignored
 * for the moment: the whole input is reported as consumed.
 */
static ssize_t bfq_weights_store(struct elevator_queue *e,
				 const char *page, size_t count)
{
	ssize_t consumed = count;

	return consumed;
}
5052 |
++ |
5053 |
++static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) |
5054 |
++{ |
5055 |
++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
5056 |
++ |
5057 |
++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) |
5058 |
++ return bfq_calc_max_budget(bfqd->peak_rate, timeout); |
5059 |
++ else |
5060 |
++ return bfq_default_max_budget; |
5061 |
++} |
5062 |
++ |
5063 |
++static ssize_t bfq_max_budget_store(struct elevator_queue *e, |
5064 |
++ const char *page, size_t count) |
5065 |
++{ |
5066 |
++ struct bfq_data *bfqd = e->elevator_data; |
5067 |
++ unsigned long uninitialized_var(__data); |
5068 |
++ int ret = bfq_var_store(&__data, (page), count); |
5069 |
++ |
5070 |
++ if (__data == 0) |
5071 |
++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
5072 |
++ else { |
5073 |
++ if (__data > INT_MAX) |
5074 |
++ __data = INT_MAX; |
5075 |
++ bfqd->bfq_max_budget = __data; |
5076 |
++ } |
5077 |
++ |
5078 |
++ bfqd->bfq_user_max_budget = __data; |
5079 |
++ |
5080 |
++ return ret; |
5081 |
++} |
5082 |
++ |
5083 |
++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, |
5084 |
++ const char *page, size_t count) |
5085 |
++{ |
5086 |
++ struct bfq_data *bfqd = e->elevator_data; |
5087 |
++ unsigned long uninitialized_var(__data); |
5088 |
++ int ret = bfq_var_store(&__data, (page), count); |
5089 |
++ |
5090 |
++ if (__data < 1) |
5091 |
++ __data = 1; |
5092 |
++ else if (__data > INT_MAX) |
5093 |
++ __data = INT_MAX; |
5094 |
++ |
5095 |
++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); |
5096 |
++ if (bfqd->bfq_user_max_budget == 0) |
5097 |
++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
5098 |
++ |
5099 |
++ return ret; |
5100 |
++} |
5101 |
++ |
5102 |
++static ssize_t bfq_low_latency_store(struct elevator_queue *e, |
5103 |
++ const char *page, size_t count) |
5104 |
++{ |
5105 |
++ struct bfq_data *bfqd = e->elevator_data; |
5106 |
++ unsigned long uninitialized_var(__data); |
5107 |
++ int ret = bfq_var_store(&__data, (page), count); |
5108 |
++ |
5109 |
++ if (__data > 1) |
5110 |
++ __data = 1; |
5111 |
++ if (__data == 0 && bfqd->low_latency != 0) |
5112 |
++ bfq_end_wr(bfqd); |
5113 |
++ bfqd->low_latency = __data; |
5114 |
++ |
5115 |
++ return ret; |
5116 |
++} |
5117 |
++ |
5118 |
++#define BFQ_ATTR(name) \ |
5119 |
++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) |
5120 |
++ |
5121 |
++static struct elv_fs_entry bfq_attrs[] = { |
5122 |
++ BFQ_ATTR(fifo_expire_sync), |
5123 |
++ BFQ_ATTR(fifo_expire_async), |
5124 |
++ BFQ_ATTR(back_seek_max), |
5125 |
++ BFQ_ATTR(back_seek_penalty), |
5126 |
++ BFQ_ATTR(slice_idle), |
5127 |
++ BFQ_ATTR(max_budget), |
5128 |
++ BFQ_ATTR(max_budget_async_rq), |
5129 |
++ BFQ_ATTR(timeout_sync), |
5130 |
++ BFQ_ATTR(timeout_async), |
5131 |
++ BFQ_ATTR(low_latency), |
5132 |
++ BFQ_ATTR(wr_coeff), |
5133 |
++ BFQ_ATTR(wr_max_time), |
5134 |
++ BFQ_ATTR(wr_rt_max_time), |
5135 |
++ BFQ_ATTR(wr_min_idle_time), |
5136 |
++ BFQ_ATTR(wr_min_inter_arr_async), |
5137 |
++ BFQ_ATTR(wr_max_softrt_rate), |
5138 |
++ BFQ_ATTR(weights), |
5139 |
++ __ATTR_NULL |
5140 |
++}; |
5141 |
++ |
5142 |
++static struct elevator_type iosched_bfq = { |
5143 |
++ .ops = { |
5144 |
++ .elevator_merge_fn = bfq_merge, |
5145 |
++ .elevator_merged_fn = bfq_merged_request, |
5146 |
++ .elevator_merge_req_fn = bfq_merged_requests, |
5147 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5148 |
++ .elevator_bio_merged_fn = bfq_bio_merged, |
5149 |
++#endif |
5150 |
++ .elevator_allow_merge_fn = bfq_allow_merge, |
5151 |
++ .elevator_dispatch_fn = bfq_dispatch_requests, |
5152 |
++ .elevator_add_req_fn = bfq_insert_request, |
5153 |
++ .elevator_activate_req_fn = bfq_activate_request, |
5154 |
++ .elevator_deactivate_req_fn = bfq_deactivate_request, |
5155 |
++ .elevator_completed_req_fn = bfq_completed_request, |
5156 |
++ .elevator_former_req_fn = elv_rb_former_request, |
5157 |
++ .elevator_latter_req_fn = elv_rb_latter_request, |
5158 |
++ .elevator_init_icq_fn = bfq_init_icq, |
5159 |
++ .elevator_exit_icq_fn = bfq_exit_icq, |
5160 |
++ .elevator_set_req_fn = bfq_set_request, |
5161 |
++ .elevator_put_req_fn = bfq_put_request, |
5162 |
++ .elevator_may_queue_fn = bfq_may_queue, |
5163 |
++ .elevator_init_fn = bfq_init_queue, |
5164 |
++ .elevator_exit_fn = bfq_exit_queue, |
5165 |
++ }, |
5166 |
++ .icq_size = sizeof(struct bfq_io_cq), |
5167 |
++ .icq_align = __alignof__(struct bfq_io_cq), |
5168 |
++ .elevator_attrs = bfq_attrs, |
5169 |
++ .elevator_name = "bfq", |
5170 |
++ .elevator_owner = THIS_MODULE, |
5171 |
++}; |
5172 |
++ |
5173 |
++static int __init bfq_init(void) |
5174 |
++{ |
5175 |
++ int ret; |
5176 |
++ |
5177 |
++ /* |
5178 |
++ * Can be 0 on HZ < 1000 setups. |
5179 |
++ */ |
5180 |
++ if (bfq_slice_idle == 0) |
5181 |
++ bfq_slice_idle = 1; |
5182 |
++ |
5183 |
++ if (bfq_timeout_async == 0) |
5184 |
++ bfq_timeout_async = 1; |
5185 |
++ |
5186 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5187 |
++ ret = blkcg_policy_register(&blkcg_policy_bfq); |
5188 |
++ if (ret) |
5189 |
++ return ret; |
5190 |
++#endif |
5191 |
++ |
5192 |
++ ret = -ENOMEM; |
5193 |
++ if (bfq_slab_setup()) |
5194 |
++ goto err_pol_unreg; |
5195 |
++ |
5196 |
++ /* |
5197 |
++ * Times to load large popular applications for the typical systems |
5198 |
++ * installed on the reference devices (see the comments before the |
5199 |
++ * definitions of the two arrays). |
5200 |
++ */ |
5201 |
++ T_slow[0] = msecs_to_jiffies(2600); |
5202 |
++ T_slow[1] = msecs_to_jiffies(1000); |
5203 |
++ T_fast[0] = msecs_to_jiffies(5500); |
5204 |
++ T_fast[1] = msecs_to_jiffies(2000); |
5205 |
++ |
5206 |
++ /* |
5207 |
++ * Thresholds that determine the switch between speed classes (see |
5208 |
++ * the comments before the definition of the array). |
5209 |
++ */ |
5210 |
++ device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; |
5211 |
++ device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; |
5212 |
++ |
5213 |
++ ret = elv_register(&iosched_bfq); |
5214 |
++ if (ret) |
5215 |
++ goto err_pol_unreg; |
5216 |
++ |
5217 |
++ pr_info("BFQ I/O-scheduler: v7r11"); |
5218 |
++ |
5219 |
++ return 0; |
5220 |
++ |
5221 |
++err_pol_unreg: |
5222 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5223 |
++ blkcg_policy_unregister(&blkcg_policy_bfq); |
5224 |
++#endif |
5225 |
++ return ret; |
5226 |
++} |
5227 |
++ |
5228 |
++static void __exit bfq_exit(void) |
5229 |
++{ |
5230 |
++ elv_unregister(&iosched_bfq); |
5231 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5232 |
++ blkcg_policy_unregister(&blkcg_policy_bfq); |
5233 |
++#endif |
5234 |
++ bfq_slab_kill(); |
5235 |
++} |
5236 |
++ |
5237 |
++module_init(bfq_init); |
5238 |
++module_exit(bfq_exit); |
5239 |
++ |
5240 |
++MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); |
5241 |
++MODULE_LICENSE("GPL"); |
5242 |
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c |
5243 |
+new file mode 100644 |
5244 |
+index 0000000..a64fec1 |
5245 |
+--- /dev/null |
5246 |
++++ b/block/bfq-sched.c |
5247 |
+@@ -0,0 +1,1200 @@ |
5248 |
++/* |
5249 |
++ * BFQ: Hierarchical B-WF2Q+ scheduler. |
5250 |
++ * |
5251 |
++ * Based on ideas and code from CFQ: |
5252 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
5253 |
++ * |
5254 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
5255 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
5256 |
++ * |
5257 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
5258 |
++ */ |
5259 |
++ |
5260 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5261 |
/*
 * Walk from @entity up through its ancestors by following ->parent.
 * @entity itself is the cursor: it is advanced at every step and is
 * NULL when the loop ends.
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * Same walk, but the parent is saved in @parent before the loop body
 * runs, so the walk continues from the saved pointer even if the body
 * changes entity->parent.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)


/* Defined further down; needed by bfq_update_next_in_service(). */
static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
						 int extract,
						 struct bfq_data *bfqd);

static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
5273 |
++ |
5274 |
++static void bfq_update_budget(struct bfq_entity *next_in_service) |
5275 |
++{ |
5276 |
++ struct bfq_entity *bfqg_entity; |
5277 |
++ struct bfq_group *bfqg; |
5278 |
++ struct bfq_sched_data *group_sd; |
5279 |
++ |
5280 |
++ BUG_ON(!next_in_service); |
5281 |
++ |
5282 |
++ group_sd = next_in_service->sched_data; |
5283 |
++ |
5284 |
++ bfqg = container_of(group_sd, struct bfq_group, sched_data); |
5285 |
++ /* |
5286 |
++ * bfq_group's my_entity field is not NULL only if the group |
5287 |
++ * is not the root group. We must not touch the root entity |
5288 |
++ * as it must never become an in-service entity. |
5289 |
++ */ |
5290 |
++ bfqg_entity = bfqg->my_entity; |
5291 |
++ if (bfqg_entity) |
5292 |
++ bfqg_entity->budget = next_in_service->budget; |
5293 |
++} |
5294 |
++ |
5295 |
++static int bfq_update_next_in_service(struct bfq_sched_data *sd) |
5296 |
++{ |
5297 |
++ struct bfq_entity *next_in_service; |
5298 |
++ |
5299 |
++ if (sd->in_service_entity) |
5300 |
++ /* will update/requeue at the end of service */ |
5301 |
++ return 0; |
5302 |
++ |
5303 |
++ /* |
5304 |
++ * NOTE: this can be improved in many ways, such as returning |
5305 |
++ * 1 (and thus propagating upwards the update) only when the |
5306 |
++ * budget changes, or caching the bfqq that will be scheduled |
5307 |
++ * next from this subtree. By now we worry more about |
5308 |
++ * correctness than about performance... |
5309 |
++ */ |
5310 |
++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL); |
5311 |
++ sd->next_in_service = next_in_service; |
5312 |
++ |
5313 |
++ if (next_in_service) |
5314 |
++ bfq_update_budget(next_in_service); |
5315 |
++ |
5316 |
++ return 1; |
5317 |
++} |
5318 |
++ |
5319 |
++static void bfq_check_next_in_service(struct bfq_sched_data *sd, |
5320 |
++ struct bfq_entity *entity) |
5321 |
++{ |
5322 |
++ BUG_ON(sd->next_in_service != entity); |
5323 |
++} |
5324 |
++#else |
5325 |
/*
 * Variants used when CONFIG_BFQ_GROUP_IOSCHED is not set: the loops
 * visit @entity only, since the cursor is set to NULL after the first
 * iteration.
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)

/* Nothing to propagate in this configuration. */
static void bfq_update_budget(struct bfq_entity *next_in_service)
{
}

/* Nothing to update in this configuration: always reports 0. */
static int bfq_update_next_in_service(struct bfq_sched_data *sd)
{
	return 0;
}

/* No check is performed in this configuration. */
static void bfq_check_next_in_service(struct bfq_sched_data *sd,
				      struct bfq_entity *entity)
{
}
5344 |
++#endif |
5345 |
++ |
5346 |
++/* |
5347 |
++ * Shift for timestamp calculations. This actually limits the maximum |
5348 |
++ * service allowed in one timestamp delta (small shift values increase it), |
5349 |
++ * the maximum total weight that can be used for the queues in the system |
5350 |
++ * (big shift values increase it), and the period of virtual time |
5351 |
++ * wraparounds. |
5352 |
++ */ |
5353 |
++#define WFQ_SERVICE_SHIFT 22 |
5354 |
++ |
5355 |
++/** |
5356 |
++ * bfq_gt - compare two timestamps. |
5357 |
++ * @a: first ts. |
5358 |
++ * @b: second ts. |
5359 |
++ * |
5360 |
++ * Return @a > @b, dealing with wrapping correctly. |
5361 |
++ */ |
5362 |
++static int bfq_gt(u64 a, u64 b) |
5363 |
++{ |
5364 |
++ return (s64)(a - b) > 0; |
5365 |
++} |
5366 |
++ |
5367 |
++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) |
5368 |
++{ |
5369 |
++ struct bfq_queue *bfqq = NULL; |
5370 |
++ |
5371 |
++ BUG_ON(!entity); |
5372 |
++ |
5373 |
++ if (!entity->my_sched_data) |
5374 |
++ bfqq = container_of(entity, struct bfq_queue, entity); |
5375 |
++ |
5376 |
++ return bfqq; |
5377 |
++} |
5378 |
++ |
5379 |
++ |
5380 |
++/** |
5381 |
++ * bfq_delta - map service into the virtual time domain. |
5382 |
++ * @service: amount of service. |
5383 |
++ * @weight: scale factor (weight of an entity or weight sum). |
5384 |
++ */ |
5385 |
++static u64 bfq_delta(unsigned long service, unsigned long weight) |
5386 |
++{ |
5387 |
++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; |
5388 |
++ |
5389 |
++ do_div(d, weight); |
5390 |
++ return d; |
5391 |
++} |
5392 |
++ |
5393 |
++/** |
5394 |
++ * bfq_calc_finish - assign the finish time to an entity. |
5395 |
++ * @entity: the entity to act upon. |
5396 |
++ * @service: the service to be charged to the entity. |
5397 |
++ */ |
5398 |
++static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) |
5399 |
++{ |
5400 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5401 |
++ |
5402 |
++ BUG_ON(entity->weight == 0); |
5403 |
++ |
5404 |
++ entity->finish = entity->start + |
5405 |
++ bfq_delta(service, entity->weight); |
5406 |
++ |
5407 |
++ if (bfqq) { |
5408 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
5409 |
++ "calc_finish: serv %lu, w %d", |
5410 |
++ service, entity->weight); |
5411 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
5412 |
++ "calc_finish: start %llu, finish %llu, delta %llu", |
5413 |
++ entity->start, entity->finish, |
5414 |
++ bfq_delta(service, entity->weight)); |
5415 |
++ } |
5416 |
++} |
5417 |
++ |
5418 |
++/** |
5419 |
++ * bfq_entity_of - get an entity from a node. |
5420 |
++ * @node: the node field of the entity. |
5421 |
++ * |
5422 |
++ * Convert a node pointer to the relative entity. This is used only |
5423 |
++ * to simplify the logic of some functions and not as the generic |
5424 |
++ * conversion mechanism because, e.g., in the tree walking functions, |
5425 |
++ * the check for a %NULL value would be redundant. |
5426 |
++ */ |
5427 |
++static struct bfq_entity *bfq_entity_of(struct rb_node *node) |
5428 |
++{ |
5429 |
++ struct bfq_entity *entity = NULL; |
5430 |
++ |
5431 |
++ if (node) |
5432 |
++ entity = rb_entry(node, struct bfq_entity, rb_node); |
5433 |
++ |
5434 |
++ return entity; |
5435 |
++} |
5436 |
++ |
5437 |
++/** |
5438 |
++ * bfq_extract - remove an entity from a tree. |
5439 |
++ * @root: the tree root. |
5440 |
++ * @entity: the entity to remove. |
5441 |
++ */ |
5442 |
++static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) |
5443 |
++{ |
5444 |
++ BUG_ON(entity->tree != root); |
5445 |
++ |
5446 |
++ entity->tree = NULL; |
5447 |
++ rb_erase(&entity->rb_node, root); |
5448 |
++} |
5449 |
++ |
5450 |
++/** |
5451 |
++ * bfq_idle_extract - extract an entity from the idle tree. |
5452 |
++ * @st: the service tree of the owning @entity. |
5453 |
++ * @entity: the entity being removed. |
5454 |
++ */ |
5455 |
++static void bfq_idle_extract(struct bfq_service_tree *st, |
5456 |
++ struct bfq_entity *entity) |
5457 |
++{ |
5458 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5459 |
++ struct rb_node *next; |
5460 |
++ |
5461 |
++ BUG_ON(entity->tree != &st->idle); |
5462 |
++ |
5463 |
++ if (entity == st->first_idle) { |
5464 |
++ next = rb_next(&entity->rb_node); |
5465 |
++ st->first_idle = bfq_entity_of(next); |
5466 |
++ } |
5467 |
++ |
5468 |
++ if (entity == st->last_idle) { |
5469 |
++ next = rb_prev(&entity->rb_node); |
5470 |
++ st->last_idle = bfq_entity_of(next); |
5471 |
++ } |
5472 |
++ |
5473 |
++ bfq_extract(&st->idle, entity); |
5474 |
++ |
5475 |
++ if (bfqq) |
5476 |
++ list_del(&bfqq->bfqq_list); |
5477 |
++} |
5478 |
++ |
5479 |
++/** |
5480 |
++ * bfq_insert - generic tree insertion. |
5481 |
++ * @root: tree root. |
5482 |
++ * @entity: entity to insert. |
5483 |
++ * |
5484 |
++ * This is used for the idle and the active tree, since they are both |
5485 |
++ * ordered by finish time. |
5486 |
++ */ |
5487 |
++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) |
5488 |
++{ |
5489 |
++ struct bfq_entity *entry; |
5490 |
++ struct rb_node **node = &root->rb_node; |
5491 |
++ struct rb_node *parent = NULL; |
5492 |
++ |
5493 |
++ BUG_ON(entity->tree); |
5494 |
++ |
5495 |
++ while (*node) { |
5496 |
++ parent = *node; |
5497 |
++ entry = rb_entry(parent, struct bfq_entity, rb_node); |
5498 |
++ |
5499 |
++ if (bfq_gt(entry->finish, entity->finish)) |
5500 |
++ node = &parent->rb_left; |
5501 |
++ else |
5502 |
++ node = &parent->rb_right; |
5503 |
++ } |
5504 |
++ |
5505 |
++ rb_link_node(&entity->rb_node, parent, node); |
5506 |
++ rb_insert_color(&entity->rb_node, root); |
5507 |
++ |
5508 |
++ entity->tree = root; |
5509 |
++} |
5510 |
++ |
5511 |
++/** |
5512 |
++ * bfq_update_min - update the min_start field of an entity. |
5513 |
++ * @entity: the entity to update. |
5514 |
++ * @node: one of its children. |
5515 |
++ * |
5516 |
++ * This function is called when @entity may store an invalid value for |
5517 |
++ * min_start due to updates to the active tree. The function assumes |
5518 |
++ * that the subtree rooted at @node (which may be its left or its right |
5519 |
++ * child) has a valid min_start value. |
5520 |
++ */ |
5521 |
++static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) |
5522 |
++{ |
5523 |
++ struct bfq_entity *child; |
5524 |
++ |
5525 |
++ if (node) { |
5526 |
++ child = rb_entry(node, struct bfq_entity, rb_node); |
5527 |
++ if (bfq_gt(entity->min_start, child->min_start)) |
5528 |
++ entity->min_start = child->min_start; |
5529 |
++ } |
5530 |
++} |
5531 |
++ |
5532 |
++/** |
5533 |
++ * bfq_update_active_node - recalculate min_start. |
5534 |
++ * @node: the node to update. |
5535 |
++ * |
5536 |
++ * @node may have changed position or one of its children may have moved, |
5537 |
++ * this function updates its min_start value. The left and right subtrees |
5538 |
++ * are assumed to hold a correct min_start value. |
5539 |
++ */ |
5540 |
++static void bfq_update_active_node(struct rb_node *node) |
5541 |
++{ |
5542 |
++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); |
5543 |
++ |
5544 |
++ entity->min_start = entity->start; |
5545 |
++ bfq_update_min(entity, node->rb_right); |
5546 |
++ bfq_update_min(entity, node->rb_left); |
5547 |
++} |
5548 |
++ |
5549 |
++/** |
5550 |
++ * bfq_update_active_tree - update min_start for the whole active tree. |
5551 |
++ * @node: the starting node. |
5552 |
++ * |
5553 |
++ * @node must be the deepest modified node after an update. This function |
5554 |
++ * updates its min_start using the values held by its children, assuming |
5555 |
++ * that they did not change, and then updates all the nodes that may have |
5556 |
++ * changed in the path to the root. The only nodes that may have changed |
5557 |
++ * are the ones in the path or their siblings. |
5558 |
++ */ |
5559 |
++static void bfq_update_active_tree(struct rb_node *node) |
5560 |
++{ |
5561 |
++ struct rb_node *parent; |
5562 |
++ |
5563 |
++up: |
5564 |
++ bfq_update_active_node(node); |
5565 |
++ |
5566 |
++ parent = rb_parent(node); |
5567 |
++ if (!parent) |
5568 |
++ return; |
5569 |
++ |
5570 |
++ if (node == parent->rb_left && parent->rb_right) |
5571 |
++ bfq_update_active_node(parent->rb_right); |
5572 |
++ else if (parent->rb_left) |
5573 |
++ bfq_update_active_node(parent->rb_left); |
5574 |
++ |
5575 |
++ node = parent; |
5576 |
++ goto up; |
5577 |
++} |
5578 |
++ |
5579 |
++static void bfq_weights_tree_add(struct bfq_data *bfqd, |
5580 |
++ struct bfq_entity *entity, |
5581 |
++ struct rb_root *root); |
5582 |
++ |
5583 |
++static void bfq_weights_tree_remove(struct bfq_data *bfqd, |
5584 |
++ struct bfq_entity *entity, |
5585 |
++ struct rb_root *root); |
5586 |
++ |
5587 |
++ |
5588 |
++/** |
5589 |
++ * bfq_active_insert - insert an entity in the active tree of its |
5590 |
++ * group/device. |
5591 |
++ * @st: the service tree of the entity. |
5592 |
++ * @entity: the entity being inserted. |
5593 |
++ * |
5594 |
++ * The active tree is ordered by finish time, but an extra key is kept |
5595 |
++ * per each node, containing the minimum value for the start times of |
5596 |
++ * its children (and the node itself), so it's possible to search for |
5597 |
++ * the eligible node with the lowest finish time in logarithmic time. |
5598 |
++ */ |
5599 |
++static void bfq_active_insert(struct bfq_service_tree *st, |
5600 |
++ struct bfq_entity *entity) |
5601 |
++{ |
5602 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5603 |
++ struct rb_node *node = &entity->rb_node; |
5604 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5605 |
++ struct bfq_sched_data *sd = NULL; |
5606 |
++ struct bfq_group *bfqg = NULL; |
5607 |
++ struct bfq_data *bfqd = NULL; |
5608 |
++#endif |
5609 |
++ |
5610 |
++ bfq_insert(&st->active, entity); |
5611 |
++ |
5612 |
++ if (node->rb_left) |
5613 |
++ node = node->rb_left; |
5614 |
++ else if (node->rb_right) |
5615 |
++ node = node->rb_right; |
5616 |
++ |
5617 |
++ bfq_update_active_tree(node); |
5618 |
++ |
5619 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5620 |
++ sd = entity->sched_data; |
5621 |
++ bfqg = container_of(sd, struct bfq_group, sched_data); |
5622 |
++ BUG_ON(!bfqg); |
5623 |
++ bfqd = (struct bfq_data *)bfqg->bfqd; |
5624 |
++#endif |
5625 |
++ if (bfqq) |
5626 |
++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); |
5627 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5628 |
++ else { /* bfq_group */ |
5629 |
++ BUG_ON(!bfqd); |
5630 |
++ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); |
5631 |
++ } |
5632 |
++ if (bfqg != bfqd->root_group) { |
5633 |
++ BUG_ON(!bfqg); |
5634 |
++ BUG_ON(!bfqd); |
5635 |
++ bfqg->active_entities++; |
5636 |
++ if (bfqg->active_entities == 2) |
5637 |
++ bfqd->active_numerous_groups++; |
5638 |
++ } |
5639 |
++#endif |
5640 |
++} |
5641 |
++ |
5642 |
++/** |
5643 |
++ * bfq_ioprio_to_weight - calc a weight from an ioprio. |
5644 |
++ * @ioprio: the ioprio value to convert. |
5645 |
++ */ |
5646 |
++static unsigned short bfq_ioprio_to_weight(int ioprio) |
5647 |
++{ |
5648 |
++ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); |
5649 |
++ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; |
5650 |
++} |
5651 |
++ |
5652 |
++/** |
5653 |
++ * bfq_weight_to_ioprio - calc an ioprio from a weight. |
5654 |
++ * @weight: the weight value to convert. |
5655 |
++ * |
5656 |
++ * To preserve as much as possible the old only-ioprio user interface, |
5657 |
++ * 0 is used as an escape ioprio value for weights (numerically) equal or |
5658 |
++ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. |
5659 |
++ */ |
5660 |
++static unsigned short bfq_weight_to_ioprio(int weight) |
5661 |
++{ |
5662 |
++ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); |
5663 |
++ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? |
5664 |
++ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; |
5665 |
++} |
5666 |
++ |
5667 |
++static void bfq_get_entity(struct bfq_entity *entity) |
5668 |
++{ |
5669 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5670 |
++ |
5671 |
++ if (bfqq) { |
5672 |
++ atomic_inc(&bfqq->ref); |
5673 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", |
5674 |
++ bfqq, atomic_read(&bfqq->ref)); |
5675 |
++ } |
5676 |
++} |
5677 |
++ |
5678 |
++/** |
5679 |
++ * bfq_find_deepest - find the deepest node that an extraction can modify. |
5680 |
++ * @node: the node being removed. |
5681 |
++ * |
5682 |
++ * Do the first step of an extraction in an rb tree, looking for the |
5683 |
++ * node that will replace @node, and returning the deepest node that |
5684 |
++ * the following modifications to the tree can touch. If @node is the |
5685 |
++ * last node in the tree return %NULL. |
5686 |
++ */ |
5687 |
++static struct rb_node *bfq_find_deepest(struct rb_node *node) |
5688 |
++{ |
5689 |
++ struct rb_node *deepest; |
5690 |
++ |
5691 |
++ if (!node->rb_right && !node->rb_left) |
5692 |
++ deepest = rb_parent(node); |
5693 |
++ else if (!node->rb_right) |
5694 |
++ deepest = node->rb_left; |
5695 |
++ else if (!node->rb_left) |
5696 |
++ deepest = node->rb_right; |
5697 |
++ else { |
5698 |
++ deepest = rb_next(node); |
5699 |
++ if (deepest->rb_right) |
5700 |
++ deepest = deepest->rb_right; |
5701 |
++ else if (rb_parent(deepest) != node) |
5702 |
++ deepest = rb_parent(deepest); |
5703 |
++ } |
5704 |
++ |
5705 |
++ return deepest; |
5706 |
++} |
5707 |
++ |
5708 |
++/** |
5709 |
++ * bfq_active_extract - remove an entity from the active tree. |
5710 |
++ * @st: the service_tree containing the tree. |
5711 |
++ * @entity: the entity being removed. |
5712 |
++ */ |
5713 |
++static void bfq_active_extract(struct bfq_service_tree *st, |
5714 |
++ struct bfq_entity *entity) |
5715 |
++{ |
5716 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5717 |
++ struct rb_node *node; |
5718 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5719 |
++ struct bfq_sched_data *sd = NULL; |
5720 |
++ struct bfq_group *bfqg = NULL; |
5721 |
++ struct bfq_data *bfqd = NULL; |
5722 |
++#endif |
5723 |
++ |
5724 |
++ node = bfq_find_deepest(&entity->rb_node); |
5725 |
++ bfq_extract(&st->active, entity); |
5726 |
++ |
5727 |
++ if (node) |
5728 |
++ bfq_update_active_tree(node); |
5729 |
++ |
5730 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5731 |
++ sd = entity->sched_data; |
5732 |
++ bfqg = container_of(sd, struct bfq_group, sched_data); |
5733 |
++ BUG_ON(!bfqg); |
5734 |
++ bfqd = (struct bfq_data *)bfqg->bfqd; |
5735 |
++#endif |
5736 |
++ if (bfqq) |
5737 |
++ list_del(&bfqq->bfqq_list); |
5738 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5739 |
++ else { /* bfq_group */ |
5740 |
++ BUG_ON(!bfqd); |
5741 |
++ bfq_weights_tree_remove(bfqd, entity, |
5742 |
++ &bfqd->group_weights_tree); |
5743 |
++ } |
5744 |
++ if (bfqg != bfqd->root_group) { |
5745 |
++ BUG_ON(!bfqg); |
5746 |
++ BUG_ON(!bfqd); |
5747 |
++ BUG_ON(!bfqg->active_entities); |
5748 |
++ bfqg->active_entities--; |
5749 |
++ if (bfqg->active_entities == 1) { |
5750 |
++ BUG_ON(!bfqd->active_numerous_groups); |
5751 |
++ bfqd->active_numerous_groups--; |
5752 |
++ } |
5753 |
++ } |
5754 |
++#endif |
5755 |
++} |
5756 |
++ |
5757 |
++/** |
5758 |
++ * bfq_idle_insert - insert an entity into the idle tree. |
5759 |
++ * @st: the service tree containing the tree. |
5760 |
++ * @entity: the entity to insert. |
5761 |
++ */ |
5762 |
++static void bfq_idle_insert(struct bfq_service_tree *st, |
5763 |
++ struct bfq_entity *entity) |
5764 |
++{ |
5765 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5766 |
++ struct bfq_entity *first_idle = st->first_idle; |
5767 |
++ struct bfq_entity *last_idle = st->last_idle; |
5768 |
++ |
5769 |
++ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) |
5770 |
++ st->first_idle = entity; |
5771 |
++ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) |
5772 |
++ st->last_idle = entity; |
5773 |
++ |
5774 |
++ bfq_insert(&st->idle, entity); |
5775 |
++ |
5776 |
++ if (bfqq) |
5777 |
++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); |
5778 |
++} |
5779 |
++ |
5780 |
++/** |
5781 |
++ * bfq_forget_entity - remove an entity from the wfq trees. |
5782 |
++ * @st: the service tree. |
5783 |
++ * @entity: the entity being removed. |
5784 |
++ * |
5785 |
++ * Update the device status and forget everything about @entity, putting |
5786 |
++ * the device reference to it, if it is a queue. Entities belonging to |
5787 |
++ * groups are not refcounted. |
5788 |
++ */ |
5789 |
++static void bfq_forget_entity(struct bfq_service_tree *st, |
5790 |
++ struct bfq_entity *entity) |
5791 |
++{ |
5792 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5793 |
++ struct bfq_sched_data *sd; |
5794 |
++ |
5795 |
++ BUG_ON(!entity->on_st); |
5796 |
++ |
5797 |
++ entity->on_st = 0; |
5798 |
++ st->wsum -= entity->weight; |
5799 |
++ if (bfqq) { |
5800 |
++ sd = entity->sched_data; |
5801 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", |
5802 |
++ bfqq, atomic_read(&bfqq->ref)); |
5803 |
++ bfq_put_queue(bfqq); |
5804 |
++ } |
5805 |
++} |
5806 |
++ |
5807 |
++/** |
5808 |
++ * bfq_put_idle_entity - release the idle tree ref of an entity. |
5809 |
++ * @st: service tree for the entity. |
5810 |
++ * @entity: the entity being released. |
5811 |
++ */ |
5812 |
++static void bfq_put_idle_entity(struct bfq_service_tree *st, |
5813 |
++ struct bfq_entity *entity) |
5814 |
++{ |
5815 |
++ bfq_idle_extract(st, entity); |
5816 |
++ bfq_forget_entity(st, entity); |
5817 |
++} |
5818 |
++ |
5819 |
++/** |
5820 |
++ * bfq_forget_idle - update the idle tree if necessary. |
5821 |
++ * @st: the service tree to act upon. |
5822 |
++ * |
5823 |
++ * To preserve the global O(log N) complexity we only remove one entry here; |
5824 |
++ * as the idle tree will not grow indefinitely this can be done safely. |
5825 |
++ */ |
5826 |
++static void bfq_forget_idle(struct bfq_service_tree *st) |
5827 |
++{ |
5828 |
++ struct bfq_entity *first_idle = st->first_idle; |
5829 |
++ struct bfq_entity *last_idle = st->last_idle; |
5830 |
++ |
5831 |
++ if (RB_EMPTY_ROOT(&st->active) && last_idle && |
5832 |
++ !bfq_gt(last_idle->finish, st->vtime)) { |
5833 |
++ /* |
5834 |
++ * Forget the whole idle tree, increasing the vtime past |
5835 |
++ * the last finish time of idle entities. |
5836 |
++ */ |
5837 |
++ st->vtime = last_idle->finish; |
5838 |
++ } |
5839 |
++ |
5840 |
++ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) |
5841 |
++ bfq_put_idle_entity(st, first_idle); |
5842 |
++} |
5843 |
++ |
5844 |
++static struct bfq_service_tree * |
5845 |
++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, |
5846 |
++ struct bfq_entity *entity) |
5847 |
++{ |
5848 |
++ struct bfq_service_tree *new_st = old_st; |
5849 |
++ |
5850 |
++ if (entity->prio_changed) { |
5851 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5852 |
++ unsigned short prev_weight, new_weight; |
5853 |
++ struct bfq_data *bfqd = NULL; |
5854 |
++ struct rb_root *root; |
5855 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5856 |
++ struct bfq_sched_data *sd; |
5857 |
++ struct bfq_group *bfqg; |
5858 |
++#endif |
5859 |
++ |
5860 |
++ if (bfqq) |
5861 |
++ bfqd = bfqq->bfqd; |
5862 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5863 |
++ else { |
5864 |
++ sd = entity->my_sched_data; |
5865 |
++ bfqg = container_of(sd, struct bfq_group, sched_data); |
5866 |
++ BUG_ON(!bfqg); |
5867 |
++ bfqd = (struct bfq_data *)bfqg->bfqd; |
5868 |
++ BUG_ON(!bfqd); |
5869 |
++ } |
5870 |
++#endif |
5871 |
++ |
5872 |
++ BUG_ON(old_st->wsum < entity->weight); |
5873 |
++ old_st->wsum -= entity->weight; |
5874 |
++ |
5875 |
++ if (entity->new_weight != entity->orig_weight) { |
5876 |
++ if (entity->new_weight < BFQ_MIN_WEIGHT || |
5877 |
++ entity->new_weight > BFQ_MAX_WEIGHT) { |
5878 |
++ printk(KERN_CRIT "update_weight_prio: " |
5879 |
++ "new_weight %d\n", |
5880 |
++ entity->new_weight); |
5881 |
++ BUG(); |
5882 |
++ } |
5883 |
++ entity->orig_weight = entity->new_weight; |
5884 |
++ if (bfqq) |
5885 |
++ bfqq->ioprio = |
5886 |
++ bfq_weight_to_ioprio(entity->orig_weight); |
5887 |
++ } |
5888 |
++ |
5889 |
++ if (bfqq) |
5890 |
++ bfqq->ioprio_class = bfqq->new_ioprio_class; |
5891 |
++ entity->prio_changed = 0; |
5892 |
++ |
5893 |
++ /* |
5894 |
++ * NOTE: here we may be changing the weight too early, |
5895 |
++ * this will cause unfairness. The correct approach |
5896 |
++ * would have required additional complexity to defer |
5897 |
++ * weight changes to the proper time instants (i.e., |
5898 |
++ * when entity->finish <= old_st->vtime). |
5899 |
++ */ |
5900 |
++ new_st = bfq_entity_service_tree(entity); |
5901 |
++ |
5902 |
++ prev_weight = entity->weight; |
5903 |
++ new_weight = entity->orig_weight * |
5904 |
++ (bfqq ? bfqq->wr_coeff : 1); |
5905 |
++ /* |
5906 |
++ * If the weight of the entity changes, remove the entity |
5907 |
++ * from its old weight counter (if there is a counter |
5908 |
++ * associated with the entity), and add it to the counter |
5909 |
++ * associated with its new weight. |
5910 |
++ */ |
5911 |
++ if (prev_weight != new_weight) { |
5912 |
++ root = bfqq ? &bfqd->queue_weights_tree : |
5913 |
++ &bfqd->group_weights_tree; |
5914 |
++ bfq_weights_tree_remove(bfqd, entity, root); |
5915 |
++ } |
5916 |
++ entity->weight = new_weight; |
5917 |
++ /* |
5918 |
++ * Add the entity to its weights tree only if it is |
5919 |
++ * not associated with a weight-raised queue. |
5920 |
++ */ |
5921 |
++ if (prev_weight != new_weight && |
5922 |
++ (bfqq ? bfqq->wr_coeff == 1 : 1)) |
5923 |
++ /* If we get here, root has been initialized. */ |
5924 |
++ bfq_weights_tree_add(bfqd, entity, root); |
5925 |
++ |
5926 |
++ new_st->wsum += entity->weight; |
5927 |
++ |
5928 |
++ if (new_st != old_st) |
5929 |
++ entity->start = new_st->vtime; |
5930 |
++ } |
5931 |
++ |
5932 |
++ return new_st; |
5933 |
++} |
5934 |
++ |
5935 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5936 |
++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); |
5937 |
++#endif |
5938 |
++ |
5939 |
++/** |
5940 |
++ * bfq_bfqq_served - update the scheduler status after selection for |
5941 |
++ * service. |
5942 |
++ * @bfqq: the queue being served. |
5943 |
++ * @served: bytes to transfer. |
5944 |
++ * |
5945 |
++ * NOTE: this can be optimized, as the timestamps of upper level entities |
5946 |
++ * are synchronized every time a new bfqq is selected for service. By now, |
5947 |
++ * we keep it to better check consistency. |
5948 |
++ */ |
5949 |
++static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) |
5950 |
++{ |
5951 |
++ struct bfq_entity *entity = &bfqq->entity; |
5952 |
++ struct bfq_service_tree *st; |
5953 |
++ |
5954 |
++ for_each_entity(entity) { |
5955 |
++ st = bfq_entity_service_tree(entity); |
5956 |
++ |
5957 |
++ entity->service += served; |
5958 |
++ BUG_ON(entity->service > entity->budget); |
5959 |
++ BUG_ON(st->wsum == 0); |
5960 |
++ |
5961 |
++ st->vtime += bfq_delta(served, st->wsum); |
5962 |
++ bfq_forget_idle(st); |
5963 |
++ } |
5964 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
5965 |
++ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); |
5966 |
++#endif |
5967 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); |
5968 |
++} |
5969 |
++ |
5970 |
++/** |
5971 |
++ * bfq_bfqq_charge_full_budget - set the service to the entity budget. |
5972 |
++ * @bfqq: the queue that needs a service update. |
5973 |
++ * |
5974 |
++ * When it's not possible to be fair in the service domain, because |
5975 |
++ * a queue is not consuming its budget fast enough (the meaning of |
5976 |
++ * fast depends on the timeout parameter), we charge it a full |
5977 |
++ * budget. In this way we should obtain a sort of time-domain |
5978 |
++ * fairness among all the seeky/slow queues. |
5979 |
++ */ |
5980 |
++static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) |
5981 |
++{ |
5982 |
++ struct bfq_entity *entity = &bfqq->entity; |
5983 |
++ |
5984 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); |
5985 |
++ |
5986 |
++ bfq_bfqq_served(bfqq, entity->budget - entity->service); |
5987 |
++} |
5988 |
++ |
5989 |
++/** |
5990 |
++ * __bfq_activate_entity - activate an entity. |
5991 |
++ * @entity: the entity being activated. |
5992 |
++ * |
5993 |
++ * Called whenever an entity is activated, i.e., it is not active and one |
5994 |
++ * of its children receives a new request, or has to be reactivated due to |
5995 |
++ * budget exhaustion. It uses the current budget of the entity (and the |
5996 |
++ * service received if @entity is active) of the queue to calculate its |
5997 |
++ * timestamps. |
5998 |
++ */ |
5999 |
++static void __bfq_activate_entity(struct bfq_entity *entity) |
6000 |
++{ |
6001 |
++ struct bfq_sched_data *sd = entity->sched_data; |
6002 |
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
6003 |
++ |
6004 |
++ if (entity == sd->in_service_entity) { |
6005 |
++ BUG_ON(entity->tree); |
6006 |
++ /* |
6007 |
++ * If we are requeueing the current entity we have |
6008 |
++ * to take care of not charging to it service it has |
6009 |
++ * not received. |
6010 |
++ */ |
6011 |
++ bfq_calc_finish(entity, entity->service); |
6012 |
++ entity->start = entity->finish; |
6013 |
++ sd->in_service_entity = NULL; |
6014 |
++ } else if (entity->tree == &st->active) { |
6015 |
++ /* |
6016 |
++ * Requeueing an entity due to a change of some |
6017 |
++ * next_in_service entity below it. We reuse the |
6018 |
++ * old start time. |
6019 |
++ */ |
6020 |
++ bfq_active_extract(st, entity); |
6021 |
++ } else if (entity->tree == &st->idle) { |
6022 |
++ /* |
6023 |
++ * Must be on the idle tree, bfq_idle_extract() will |
6024 |
++ * check for that. |
6025 |
++ */ |
6026 |
++ bfq_idle_extract(st, entity); |
6027 |
++ entity->start = bfq_gt(st->vtime, entity->finish) ? |
6028 |
++ st->vtime : entity->finish; |
6029 |
++ } else { |
6030 |
++ /* |
6031 |
++ * The finish time of the entity may be invalid, and |
6032 |
++ * it is in the past for sure, otherwise the queue |
6033 |
++ * would have been on the idle tree. |
6034 |
++ */ |
6035 |
++ entity->start = st->vtime; |
6036 |
++ st->wsum += entity->weight; |
6037 |
++ bfq_get_entity(entity); |
6038 |
++ |
6039 |
++ BUG_ON(entity->on_st); |
6040 |
++ entity->on_st = 1; |
6041 |
++ } |
6042 |
++ |
6043 |
++ st = __bfq_entity_update_weight_prio(st, entity); |
6044 |
++ bfq_calc_finish(entity, entity->budget); |
6045 |
++ bfq_active_insert(st, entity); |
6046 |
++} |
6047 |
++ |
6048 |
++/** |
6049 |
++ * bfq_activate_entity - activate an entity and its ancestors if necessary. |
6050 |
++ * @entity: the entity to activate. |
6051 |
++ * |
6052 |
++ * Activate @entity and all the entities on the path from it to the root. |
6053 |
++ */ |
6054 |
++static void bfq_activate_entity(struct bfq_entity *entity) |
6055 |
++{ |
6056 |
++ struct bfq_sched_data *sd; |
6057 |
++ |
6058 |
++ for_each_entity(entity) { |
6059 |
++ __bfq_activate_entity(entity); |
6060 |
++ |
6061 |
++ sd = entity->sched_data; |
6062 |
++ if (!bfq_update_next_in_service(sd)) |
6063 |
++ /* |
6064 |
++ * No need to propagate the activation to the |
6065 |
++ * upper entities, as they will be updated when |
6066 |
++ * the in-service entity is rescheduled. |
6067 |
++ */ |
6068 |
++ break; |
6069 |
++ } |
6070 |
++} |
6071 |
++ |
6072 |
++/** |
6073 |
++ * __bfq_deactivate_entity - deactivate an entity from its service tree. |
6074 |
++ * @entity: the entity to deactivate. |
6075 |
++ * @requeue: if false, the entity will not be put into the idle tree. |
6076 |
++ * |
6077 |
++ * Deactivate an entity, independently from its previous state. If the |
6078 |
++ * entity was not on a service tree just return, otherwise if it is on |
6079 |
++ * any scheduler tree, extract it from that tree, and if necessary |
6080 |
++ * and if the caller set @requeue, put it on the idle tree. |
6081 |
++ * |
6082 |
++ * Return %1 if the caller should update the entity hierarchy, i.e., |
6083 |
++ * if the entity was in service or if it was the next_in_service for |
6084 |
++ * its sched_data; return %0 otherwise. |
6085 |
++ */ |
6086 |
++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
6087 |
++{ |
6088 |
++ struct bfq_sched_data *sd = entity->sched_data; |
6089 |
++ struct bfq_service_tree *st; |
6090 |
++ int was_in_service; |
6091 |
++ int ret = 0; |
6092 |
++ |
6093 |
++ if (sd == NULL || !entity->on_st) /* never activated, or inactive */ |
6094 |
++ return 0; |
6095 |
++ |
6096 |
++ st = bfq_entity_service_tree(entity); |
6097 |
++ was_in_service = entity == sd->in_service_entity; |
6098 |
++ |
6099 |
++ BUG_ON(was_in_service && entity->tree); |
6100 |
++ |
6101 |
++ if (was_in_service) { |
6102 |
++ bfq_calc_finish(entity, entity->service); |
6103 |
++ sd->in_service_entity = NULL; |
6104 |
++ } else if (entity->tree == &st->active) |
6105 |
++ bfq_active_extract(st, entity); |
6106 |
++ else if (entity->tree == &st->idle) |
6107 |
++ bfq_idle_extract(st, entity); |
6108 |
++ else if (entity->tree) |
6109 |
++ BUG(); |
6110 |
++ |
6111 |
++ if (was_in_service || sd->next_in_service == entity) |
6112 |
++ ret = bfq_update_next_in_service(sd); |
6113 |
++ |
6114 |
++ if (!requeue || !bfq_gt(entity->finish, st->vtime)) |
6115 |
++ bfq_forget_entity(st, entity); |
6116 |
++ else |
6117 |
++ bfq_idle_insert(st, entity); |
6118 |
++ |
6119 |
++ BUG_ON(sd->in_service_entity == entity); |
6120 |
++ BUG_ON(sd->next_in_service == entity); |
6121 |
++ |
6122 |
++ return ret; |
6123 |
++} |
6124 |
++ |
6125 |
++/** |
6126 |
++ * bfq_deactivate_entity - deactivate an entity. |
6127 |
++ * @entity: the entity to deactivate. |
6128 |
++ * @requeue: true if the entity can be put on the idle tree |
6129 |
++ */ |
6130 |
++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
6131 |
++{ |
6132 |
++ struct bfq_sched_data *sd; |
6133 |
++ struct bfq_entity *parent; |
6134 |
++ |
6135 |
++ for_each_entity_safe(entity, parent) { |
6136 |
++ sd = entity->sched_data; |
6137 |
++ |
6138 |
++ if (!__bfq_deactivate_entity(entity, requeue)) |
6139 |
++ /* |
6140 |
++ * The parent entity is still backlogged, and |
6141 |
++ * we don't need to update it as it is still |
6142 |
++ * in service. |
6143 |
++ */ |
6144 |
++ break; |
6145 |
++ |
6146 |
++ if (sd->next_in_service) |
6147 |
++ /* |
6148 |
++ * The parent entity is still backlogged and |
6149 |
++ * the budgets on the path towards the root |
6150 |
++ * need to be updated. |
6151 |
++ */ |
6152 |
++ goto update; |
6153 |
++ |
6154 |
++ /* |
6155 |
++ * If we get here the parent is no longer backlogged and |
6156 |
++ * we want to propagate the dequeue upwards. |
6157 |
++ */ |
6158 |
++ requeue = 1; |
6159 |
++ } |
6160 |
++ |
6161 |
++ return; |
6162 |
++ |
6163 |
++update: |
6164 |
++ entity = parent; |
6165 |
++ for_each_entity(entity) { |
6166 |
++ __bfq_activate_entity(entity); |
6167 |
++ |
6168 |
++ sd = entity->sched_data; |
6169 |
++ if (!bfq_update_next_in_service(sd)) |
6170 |
++ break; |
6171 |
++ } |
6172 |
++} |
6173 |
++ |
6174 |
++/** |
6175 |
++ * bfq_update_vtime - update vtime if necessary. |
6176 |
++ * @st: the service tree to act upon. |
6177 |
++ * |
6178 |
++ * If necessary update the service tree vtime to have at least one |
6179 |
++ * eligible entity, skipping to its start time. Assumes that the |
6180 |
++ * active tree of the device is not empty. |
6181 |
++ * |
6182 |
++ * NOTE: this hierarchical implementation updates vtimes quite often, |
6183 |
++ * we may end up with reactivated processes getting timestamps after a |
6184 |
++ * vtime skip done because we needed a ->first_active entity on some |
6185 |
++ * intermediate node. |
6186 |
++ */ |
6187 |
++static void bfq_update_vtime(struct bfq_service_tree *st) |
6188 |
++{ |
6189 |
++ struct bfq_entity *entry; |
6190 |
++ struct rb_node *node = st->active.rb_node; |
6191 |
++ |
6192 |
++ entry = rb_entry(node, struct bfq_entity, rb_node); |
6193 |
++ if (bfq_gt(entry->min_start, st->vtime)) { |
6194 |
++ st->vtime = entry->min_start; |
6195 |
++ bfq_forget_idle(st); |
6196 |
++ } |
6197 |
++} |
6198 |
++ |
6199 |
++/** |
6200 |
++ * bfq_first_active_entity - find the eligible entity with |
6201 |
++ * the smallest finish time |
6202 |
++ * @st: the service tree to select from. |
6203 |
++ * |
6204 |
++ * This function searches the first schedulable entity, starting from the |
6205 |
++ * root of the tree and going on the left every time on this side there is |
6206 |
++ * a subtree with at least one eligible (start <= vtime) entity. The path on |
6207 |
++ * the right is followed only if a) the left subtree contains no eligible |
6208 |
++ * entities and b) no eligible entity has been found yet. |
6209 |
++ */ |
6210 |
++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) |
6211 |
++{ |
6212 |
++ struct bfq_entity *entry, *first = NULL; |
6213 |
++ struct rb_node *node = st->active.rb_node; |
6214 |
++ |
6215 |
++ while (node) { |
6216 |
++ entry = rb_entry(node, struct bfq_entity, rb_node); |
6217 |
++left: |
6218 |
++ if (!bfq_gt(entry->start, st->vtime)) |
6219 |
++ first = entry; |
6220 |
++ |
6221 |
++ BUG_ON(bfq_gt(entry->min_start, st->vtime)); |
6222 |
++ |
6223 |
++ if (node->rb_left) { |
6224 |
++ entry = rb_entry(node->rb_left, |
6225 |
++ struct bfq_entity, rb_node); |
6226 |
++ if (!bfq_gt(entry->min_start, st->vtime)) { |
6227 |
++ node = node->rb_left; |
6228 |
++ goto left; |
6229 |
++ } |
6230 |
++ } |
6231 |
++ if (first) |
6232 |
++ break; |
6233 |
++ node = node->rb_right; |
6234 |
++ } |
6235 |
++ |
6236 |
++ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); |
6237 |
++ return first; |
6238 |
++} |
6239 |
++ |
6240 |
++/** |
6241 |
++ * __bfq_lookup_next_entity - return the first eligible entity in @st. |
6242 |
++ * @st: the service tree. |
6243 |
++ * |
6244 |
++ * Update the virtual time in @st and return the first eligible entity |
6245 |
++ * it contains. |
6246 |
++ */ |
6247 |
++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, |
6248 |
++ bool force) |
6249 |
++{ |
6250 |
++ struct bfq_entity *entity, *new_next_in_service = NULL; |
6251 |
++ |
6252 |
++ if (RB_EMPTY_ROOT(&st->active)) |
6253 |
++ return NULL; |
6254 |
++ |
6255 |
++ bfq_update_vtime(st); |
6256 |
++ entity = bfq_first_active_entity(st); |
6257 |
++ BUG_ON(bfq_gt(entity->start, st->vtime)); |
6258 |
++ |
6259 |
++ /* |
6260 |
++ * If the chosen entity does not match with the sched_data's |
6261 |
++ * next_in_service and we are forcedly serving the IDLE priority |
6262 |
++ * class tree, bubble up budget update. |
6263 |
++ */ |
6264 |
++ if (unlikely(force && entity != entity->sched_data->next_in_service)) { |
6265 |
++ new_next_in_service = entity; |
6266 |
++ for_each_entity(new_next_in_service) |
6267 |
++ bfq_update_budget(new_next_in_service); |
6268 |
++ } |
6269 |
++ |
6270 |
++ return entity; |
6271 |
++} |
6272 |
++ |
6273 |
++/** |
6274 |
++ * bfq_lookup_next_entity - return the first eligible entity in @sd. |
6275 |
++ * @sd: the sched_data. |
6276 |
++ * @extract: if true the returned entity will be also extracted from @sd. |
6277 |
++ * |
6278 |
++ * NOTE: since we cache the next_in_service entity at each level of the |
6279 |
++ * hierarchy, the complexity of the lookup can be decreased with |
6280 |
++ * absolutely no effort just returning the cached next_in_service value; |
6281 |
++ * we prefer to do full lookups to test the consistency of the data |
6282 |
++ * structures. |
6283 |
++ */ |
6284 |
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
6285 |
++ int extract, |
6286 |
++ struct bfq_data *bfqd) |
6287 |
++{ |
6288 |
++ struct bfq_service_tree *st = sd->service_tree; |
6289 |
++ struct bfq_entity *entity; |
6290 |
++ int i = 0; |
6291 |
++ |
6292 |
++ BUG_ON(sd->in_service_entity); |
6293 |
++ |
6294 |
++ if (bfqd && |
6295 |
++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { |
6296 |
++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, |
6297 |
++ true); |
6298 |
++ if (entity) { |
6299 |
++ i = BFQ_IOPRIO_CLASSES - 1; |
6300 |
++ bfqd->bfq_class_idle_last_service = jiffies; |
6301 |
++ sd->next_in_service = entity; |
6302 |
++ } |
6303 |
++ } |
6304 |
++ for (; i < BFQ_IOPRIO_CLASSES; i++) { |
6305 |
++ entity = __bfq_lookup_next_entity(st + i, false); |
6306 |
++ if (entity) { |
6307 |
++ if (extract) { |
6308 |
++ bfq_check_next_in_service(sd, entity); |
6309 |
++ bfq_active_extract(st + i, entity); |
6310 |
++ sd->in_service_entity = entity; |
6311 |
++ sd->next_in_service = NULL; |
6312 |
++ } |
6313 |
++ break; |
6314 |
++ } |
6315 |
++ } |
6316 |
++ |
6317 |
++ return entity; |
6318 |
++} |
6319 |
++ |
6320 |
++/* |
6321 |
++ * Get next queue for service. |
6322 |
++ */ |
6323 |
++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) |
6324 |
++{ |
6325 |
++ struct bfq_entity *entity = NULL; |
6326 |
++ struct bfq_sched_data *sd; |
6327 |
++ struct bfq_queue *bfqq; |
6328 |
++ |
6329 |
++ BUG_ON(bfqd->in_service_queue); |
6330 |
++ |
6331 |
++ if (bfqd->busy_queues == 0) |
6332 |
++ return NULL; |
6333 |
++ |
6334 |
++ sd = &bfqd->root_group->sched_data; |
6335 |
++ for (; sd ; sd = entity->my_sched_data) { |
6336 |
++ entity = bfq_lookup_next_entity(sd, 1, bfqd); |
6337 |
++ BUG_ON(!entity); |
6338 |
++ entity->service = 0; |
6339 |
++ } |
6340 |
++ |
6341 |
++ bfqq = bfq_entity_to_bfqq(entity); |
6342 |
++ BUG_ON(!bfqq); |
6343 |
++ |
6344 |
++ return bfqq; |
6345 |
++} |
6346 |
++ |
6347 |
++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) |
6348 |
++{ |
6349 |
++ if (bfqd->in_service_bic) { |
6350 |
++ put_io_context(bfqd->in_service_bic->icq.ioc); |
6351 |
++ bfqd->in_service_bic = NULL; |
6352 |
++ } |
6353 |
++ |
6354 |
++ bfqd->in_service_queue = NULL; |
6355 |
++ del_timer(&bfqd->idle_slice_timer); |
6356 |
++} |
6357 |
++ |
6358 |
++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
6359 |
++ int requeue) |
6360 |
++{ |
6361 |
++ struct bfq_entity *entity = &bfqq->entity; |
6362 |
++ |
6363 |
++ if (bfqq == bfqd->in_service_queue) |
6364 |
++ __bfq_bfqd_reset_in_service(bfqd); |
6365 |
++ |
6366 |
++ bfq_deactivate_entity(entity, requeue); |
6367 |
++} |
6368 |
++ |
6369 |
++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
6370 |
++{ |
6371 |
++ struct bfq_entity *entity = &bfqq->entity; |
6372 |
++ |
6373 |
++ bfq_activate_entity(entity); |
6374 |
++} |
6375 |
++ |
6376 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
6377 |
++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); |
6378 |
++#endif |
6379 |
++ |
6380 |
++/* |
6381 |
++ * Called when the bfqq no longer has requests pending; remove it from |
6382 |
++ * the service tree. |
6383 |
++ */ |
6384 |
++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
6385 |
++ int requeue) |
6386 |
++{ |
6387 |
++ BUG_ON(!bfq_bfqq_busy(bfqq)); |
6388 |
++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
6389 |
++ |
6390 |
++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); |
6391 |
++ |
6392 |
++ bfq_clear_bfqq_busy(bfqq); |
6393 |
++ |
6394 |
++ BUG_ON(bfqd->busy_queues == 0); |
6395 |
++ bfqd->busy_queues--; |
6396 |
++ |
6397 |
++ if (!bfqq->dispatched) { |
6398 |
++ bfq_weights_tree_remove(bfqd, &bfqq->entity, |
6399 |
++ &bfqd->queue_weights_tree); |
6400 |
++ if (!blk_queue_nonrot(bfqd->queue)) { |
6401 |
++ BUG_ON(!bfqd->busy_in_flight_queues); |
6402 |
++ bfqd->busy_in_flight_queues--; |
6403 |
++ if (bfq_bfqq_constantly_seeky(bfqq)) { |
6404 |
++ BUG_ON(!bfqd-> |
6405 |
++ const_seeky_busy_in_flight_queues); |
6406 |
++ bfqd->const_seeky_busy_in_flight_queues--; |
6407 |
++ } |
6408 |
++ } |
6409 |
++ } |
6410 |
++ if (bfqq->wr_coeff > 1) |
6411 |
++ bfqd->wr_busy_queues--; |
6412 |
++ |
6413 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
6414 |
++ bfqg_stats_update_dequeue(bfqq_group(bfqq)); |
6415 |
++#endif |
6416 |
++ |
6417 |
++ bfq_deactivate_bfqq(bfqd, bfqq, requeue); |
6418 |
++} |
6419 |
++ |
6420 |
++/* |
6421 |
++ * Called when an inactive queue receives a new request. |
6422 |
++ */ |
6423 |
++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
6424 |
++{ |
6425 |
++ BUG_ON(bfq_bfqq_busy(bfqq)); |
6426 |
++ BUG_ON(bfqq == bfqd->in_service_queue); |
6427 |
++ |
6428 |
++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); |
6429 |
++ |
6430 |
++ bfq_activate_bfqq(bfqd, bfqq); |
6431 |
++ |
6432 |
++ bfq_mark_bfqq_busy(bfqq); |
6433 |
++ bfqd->busy_queues++; |
6434 |
++ |
6435 |
++ if (!bfqq->dispatched) { |
6436 |
++ if (bfqq->wr_coeff == 1) |
6437 |
++ bfq_weights_tree_add(bfqd, &bfqq->entity, |
6438 |
++ &bfqd->queue_weights_tree); |
6439 |
++ if (!blk_queue_nonrot(bfqd->queue)) { |
6440 |
++ bfqd->busy_in_flight_queues++; |
6441 |
++ if (bfq_bfqq_constantly_seeky(bfqq)) |
6442 |
++ bfqd->const_seeky_busy_in_flight_queues++; |
6443 |
++ } |
6444 |
++ } |
6445 |
++ if (bfqq->wr_coeff > 1) |
6446 |
++ bfqd->wr_busy_queues++; |
6447 |
++} |
6448 |
+diff --git a/block/bfq.h b/block/bfq.h |
6449 |
+new file mode 100644 |
6450 |
+index 0000000..485d0c9 |
6451 |
+--- /dev/null |
6452 |
++++ b/block/bfq.h |
6453 |
+@@ -0,0 +1,801 @@ |
6454 |
++/* |
6455 |
++ * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. |
6456 |
++ * |
6457 |
++ * Based on ideas and code from CFQ: |
6458 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
6459 |
++ * |
6460 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
6461 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
6462 |
++ * |
6463 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
6464 |
++ */ |
6465 |
++ |
6466 |
++#ifndef _BFQ_H |
6467 |
++#define _BFQ_H |
6468 |
++ |
6469 |
++#include <linux/blktrace_api.h> |
6470 |
++#include <linux/hrtimer.h> |
6471 |
++#include <linux/ioprio.h> |
6472 |
++#include <linux/rbtree.h> |
6473 |
++#include <linux/blk-cgroup.h> |
6474 |
++ |
6475 |
++#define BFQ_IOPRIO_CLASSES 3 |
6476 |
++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) |
6477 |
++ |
6478 |
++#define BFQ_MIN_WEIGHT 1 |
6479 |
++#define BFQ_MAX_WEIGHT 1000 |
6480 |
++#define BFQ_WEIGHT_CONVERSION_COEFF 10 |
6481 |
++ |
6482 |
++#define BFQ_DEFAULT_QUEUE_IOPRIO 4 |
6483 |
++ |
6484 |
++#define BFQ_DEFAULT_GRP_WEIGHT 10 |
6485 |
++#define BFQ_DEFAULT_GRP_IOPRIO 0 |
6486 |
++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE |
6487 |
++ |
6488 |
++struct bfq_entity; |
6489 |
++ |
6490 |
++/** |
6491 |
++ * struct bfq_service_tree - per ioprio_class service tree. |
6492 |
++ * @active: tree for active entities (i.e., those backlogged). |
6493 |
++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). |
6494 |
++ * @first_idle: idle entity with minimum F_i. |
6495 |
++ * @last_idle: idle entity with maximum F_i. |
6496 |
++ * @vtime: scheduler virtual time. |
6497 |
++ * @wsum: scheduler weight sum; active and idle entities contribute to it. |
6498 |
++ * |
6499 |
++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each |
6500 |
++ * ioprio_class has its own independent scheduler, and so its own |
6501 |
++ * bfq_service_tree. All the fields are protected by the queue lock |
6502 |
++ * of the containing bfqd. |
6503 |
++ */ |
6504 |
++struct bfq_service_tree { |
6505 |
++ struct rb_root active; |
6506 |
++ struct rb_root idle; |
6507 |
++ |
6508 |
++ struct bfq_entity *first_idle; |
6509 |
++ struct bfq_entity *last_idle; |
6510 |
++ |
6511 |
++ u64 vtime; |
6512 |
++ unsigned long wsum; |
6513 |
++}; |
6514 |
++ |
6515 |
++/** |
6516 |
++ * struct bfq_sched_data - multi-class scheduler. |
6517 |
++ * @in_service_entity: entity in service. |
6518 |
++ * @next_in_service: head-of-the-line entity in the scheduler. |
6519 |
++ * @service_tree: array of service trees, one per ioprio_class. |
6520 |
++ * |
6521 |
++ * bfq_sched_data is the basic scheduler queue. It supports three |
6522 |
++ * ioprio_classes, and can be used either as a toplevel queue or as |
6523 |
++ * an intermediate queue on a hierarchical setup. |
6524 |
++ * @next_in_service points to the active entity of the sched_data |
6525 |
++ * service trees that will be scheduled next. |
6526 |
++ * |
6527 |
++ * The supported ioprio_classes are the same as in CFQ, in descending |
6528 |
++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. |
6529 |
++ * Requests from higher priority queues are served before all the |
6530 |
++ * requests from lower priority queues; among requests of the same |
6531 |
++ * ioprio_class, entities are scheduled according to B-WF2Q+. |
6532 |
++ * All the fields are protected by the queue lock of the containing bfqd. |
6533 |
++ */ |
6534 |
++struct bfq_sched_data { |
6535 |
++ struct bfq_entity *in_service_entity; |
6536 |
++ struct bfq_entity *next_in_service; |
6537 |
++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; |
6538 |
++}; |
6539 |
++ |
6540 |
++/** |
6541 |
++ * struct bfq_weight_counter - counter of the number of all active entities |
6542 |
++ * with a given weight. |
6543 |
++ * @weight: weight of the entities that this counter refers to. |
6544 |
++ * @num_active: number of active entities with this weight. |
6545 |
++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree |
6546 |
++ * and @group_weights_tree). |
6547 |
++ */ |
6548 |
++struct bfq_weight_counter { |
6549 |
++ short int weight; |
6550 |
++ unsigned int num_active; |
6551 |
++ struct rb_node weights_node; |
6552 |
++}; |
6553 |
++ |
6554 |
++/** |
6555 |
++ * struct bfq_entity - schedulable entity. |
6556 |
++ * @rb_node: service_tree member. |
6557 |
++ * @weight_counter: pointer to the weight counter associated with this entity. |
6558 |
++ * @on_st: flag, true if the entity is on a tree (either the active or |
6559 |
++ * the idle one of its service_tree). |
6560 |
++ * @finish: B-WF2Q+ finish timestamp (aka F_i). |
6561 |
++ * @start: B-WF2Q+ start timestamp (aka S_i). |
6562 |
++ * @tree: tree the entity is enqueued into; %NULL if not on a tree. |
6563 |
++ * @min_start: minimum start time of the (active) subtree rooted at |
6564 |
++ * this entity; used for O(log N) lookups into active trees. |
6565 |
++ * @service: service received during the last round of service. |
6566 |
++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. |
6567 |
++ * @weight: weight of the queue |
6568 |
++ * @parent: parent entity, for hierarchical scheduling. |
6569 |
++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the |
6570 |
++ * associated scheduler queue, %NULL on leaf nodes. |
6571 |
++ * @sched_data: the scheduler queue this entity belongs to. |
6572 |
++ * @ioprio: the ioprio in use. |
6573 |
++ * @new_weight: when a weight change is requested, the new weight value. |
6574 |
++ * @orig_weight: original weight, used to implement weight boosting |
6575 |
++ * @prio_changed: flag, true when the user requested a weight, ioprio or |
6576 |
++ * ioprio_class change. |
6577 |
++ * |
6578 |
++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the |
6579 |
++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each |
6580 |
++ * entity belongs to the sched_data of the parent group in the cgroup |
6581 |
++ * hierarchy. Non-leaf entities also have their own sched_data, stored |
6582 |
++ * in @my_sched_data. |
6583 |
++ * |
6584 |
++ * Each entity stores independently its priority values; this would |
6585 |
++ * allow different weights on different devices, but this |
6586 |
++ * functionality is not exported to userspace by now. Priorities and |
6587 |
++ * weights are updated lazily, first storing the new values into the |
6588 |
++ * new_* fields, then setting the @prio_changed flag. As soon as |
6589 |
++ * there is a transition in the entity state that allows the priority |
6590 |
++ * update to take place the effective and the requested priority |
6591 |
++ * values are synchronized. |
6592 |
++ * |
6593 |
++ * Unless cgroups are used, the weight value is calculated from the |
6594 |
++ * ioprio to export the same interface as CFQ. When dealing with |
6595 |
++ * ``well-behaved'' queues (i.e., queues that do not spend too much |
6596 |
++ * time to consume their budget and have true sequential behavior, and |
6597 |
++ * when there are no external factors breaking anticipation) the |
6598 |
++ * relative weights at each level of the cgroups hierarchy should be |
6599 |
++ * guaranteed. All the fields are protected by the queue lock of the |
6600 |
++ * containing bfqd. |
6601 |
++ */ |
6602 |
++struct bfq_entity { |
6603 |
++ struct rb_node rb_node; |
6604 |
++ struct bfq_weight_counter *weight_counter; |
6605 |
++ |
6606 |
++ int on_st; |
6607 |
++ |
6608 |
++ u64 finish; |
6609 |
++ u64 start; |
6610 |
++ |
6611 |
++ struct rb_root *tree; |
6612 |
++ |
6613 |
++ u64 min_start; |
6614 |
++ |
6615 |
++ int service, budget; |
6616 |
++ unsigned short weight, new_weight; |
6617 |
++ unsigned short orig_weight; |
6618 |
++ |
6619 |
++ struct bfq_entity *parent; |
6620 |
++ |
6621 |
++ struct bfq_sched_data *my_sched_data; |
6622 |
++ struct bfq_sched_data *sched_data; |
6623 |
++ |
6624 |
++ int prio_changed; |
6625 |
++}; |
6626 |
++ |
6627 |
++struct bfq_group; |
6628 |
++ |
6629 |
++/** |
6630 |
++ * struct bfq_queue - leaf schedulable entity. |
6631 |
++ * @ref: reference counter. |
6632 |
++ * @bfqd: parent bfq_data. |
6633 |
++ * @new_ioprio: when an ioprio change is requested, the new ioprio value. |
6634 |
++ * @ioprio_class: the ioprio_class in use. |
6635 |
++ * @new_ioprio_class: when an ioprio_class change is requested, the new |
6636 |
++ * ioprio_class value. |
6637 |
++ * @new_bfqq: shared bfq_queue if queue is cooperating with |
6638 |
++ * one or more other queues. |
6639 |
++ * @sort_list: sorted list of pending requests. |
6640 |
++ * @next_rq: if fifo isn't expired, next request to serve. |
6641 |
++ * @queued: nr of requests queued in @sort_list. |
6642 |
++ * @allocated: currently allocated requests. |
6643 |
++ * @meta_pending: pending metadata requests. |
6644 |
++ * @fifo: fifo list of requests in sort_list. |
6645 |
++ * @entity: entity representing this queue in the scheduler. |
6646 |
++ * @max_budget: maximum budget allowed from the feedback mechanism. |
6647 |
++ * @budget_timeout: budget expiration (in jiffies). |
6648 |
++ * @dispatched: number of requests on the dispatch list or inside driver. |
6649 |
++ * @flags: status flags. |
6650 |
++ * @bfqq_list: node for active/idle bfqq list inside our bfqd. |
6651 |
++ * @burst_list_node: node for the device's burst list. |
6652 |
++ * @seek_samples: number of seeks sampled |
6653 |
++ * @seek_total: sum of the distances of the seeks sampled |
6654 |
++ * @seek_mean: mean seek distance |
6655 |
++ * @last_request_pos: position of the last request enqueued |
6656 |
++ * @requests_within_timer: number of consecutive pairs of request completion |
6657 |
++ * and arrival, such that the queue becomes idle |
6658 |
++ * after the completion, but the next request arrives |
6659 |
++ * within an idle time slice; used only if the queue's |
6660 |
++ * IO_bound has been cleared. |
6661 |
++ * @pid: pid of the process owning the queue, used for logging purposes. |
6662 |
++ * @last_wr_start_finish: start time of the current weight-raising period if |
6663 |
++ * the @bfq-queue is being weight-raised, otherwise |
6664 |
++ * finish time of the last weight-raising period |
6665 |
++ * @wr_cur_max_time: current max raising time for this queue |
6666 |
++ * @soft_rt_next_start: minimum time instant such that, only if a new |
6667 |
++ * request is enqueued after this time instant in an |
6668 |
++ * idle @bfq_queue with no outstanding requests, then |
6669 |
++ * the task associated with the queue is deemed as |
6670 |
++ * soft real-time (see the comments to the function |
6671 |
++ * bfq_bfqq_softrt_next_start()) |
6672 |
++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from |
6673 |
++ * idle to backlogged |
6674 |
++ * @service_from_backlogged: cumulative service received by the @bfq_queue |
6675 |
++ * since the last transition from idle to |
6676 |
++ * backlogged |
6677 |
++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the |
6678 |
++ * queue is shared |
6679 |
++ * |
6680 |
++ * A bfq_queue is a leaf request queue; it can be associated with an |
6681 |
++ * io_context or more, if it is async or shared between cooperating |
6682 |
++ * processes. @cgroup holds a reference to the cgroup, to be sure that it |
6683 |
++ * does not disappear while a bfqq still references it (mostly to avoid |
6684 |
++ * races between request issuing and task migration followed by cgroup |
6685 |
++ * destruction). |
6686 |
++ * All the fields are protected by the queue lock of the containing bfqd. |
6687 |
++ */ |
6688 |
++struct bfq_queue { |
6689 |
++ atomic_t ref; |
6690 |
++ struct bfq_data *bfqd; |
6691 |
++ |
6692 |
++ unsigned short ioprio, new_ioprio; |
6693 |
++ unsigned short ioprio_class, new_ioprio_class; |
6694 |
++ |
6695 |
++ /* fields for cooperating queues handling */ |
6696 |
++ struct bfq_queue *new_bfqq; |
6697 |
++ struct rb_node pos_node; |
6698 |
++ struct rb_root *pos_root; |
6699 |
++ |
6700 |
++ struct rb_root sort_list; |
6701 |
++ struct request *next_rq; |
6702 |
++ int queued[2]; |
6703 |
++ int allocated[2]; |
6704 |
++ int meta_pending; |
6705 |
++ struct list_head fifo; |
6706 |
++ |
6707 |
++ struct bfq_entity entity; |
6708 |
++ |
6709 |
++ int max_budget; |
6710 |
++ unsigned long budget_timeout; |
6711 |
++ |
6712 |
++ int dispatched; |
6713 |
++ |
6714 |
++ unsigned int flags; |
6715 |
++ |
6716 |
++ struct list_head bfqq_list; |
6717 |
++ |
6718 |
++ struct hlist_node burst_list_node; |
6719 |
++ |
6720 |
++ unsigned int seek_samples; |
6721 |
++ u64 seek_total; |
6722 |
++ sector_t seek_mean; |
6723 |
++ sector_t last_request_pos; |
6724 |
++ |
6725 |
++ unsigned int requests_within_timer; |
6726 |
++ |
6727 |
++ pid_t pid; |
6728 |
++ struct bfq_io_cq *bic; |
6729 |
++ |
6730 |
++ /* weight-raising fields */ |
6731 |
++ unsigned long wr_cur_max_time; |
6732 |
++ unsigned long soft_rt_next_start; |
6733 |
++ unsigned long last_wr_start_finish; |
6734 |
++ unsigned int wr_coeff; |
6735 |
++ unsigned long last_idle_bklogged; |
6736 |
++ unsigned long service_from_backlogged; |
6737 |
++}; |
6738 |
++ |
6739 |
++/** |
6740 |
++ * struct bfq_ttime - per process thinktime stats. |
6741 |
++ * @ttime_total: total process thinktime |
6742 |
++ * @ttime_samples: number of thinktime samples |
6743 |
++ * @ttime_mean: average process thinktime |
6744 |
++ */ |
6745 |
++struct bfq_ttime { |
6746 |
++ unsigned long last_end_request; |
6747 |
++ |
6748 |
++ unsigned long ttime_total; |
6749 |
++ unsigned long ttime_samples; |
6750 |
++ unsigned long ttime_mean; |
6751 |
++}; |
6752 |
++ |
6753 |
++/** |
6754 |
++ * struct bfq_io_cq - per (request_queue, io_context) structure. |
6755 |
++ * @icq: associated io_cq structure |
6756 |
++ * @bfqq: array of two process queues, the sync and the async |
6757 |
++ * @ttime: associated @bfq_ttime struct |
6758 |
++ * @ioprio: per (request_queue, blkcg) ioprio. |
6759 |
++ * @blkcg_id: id of the blkcg the related io_cq belongs to. |
6760 |
++ */ |
6761 |
++struct bfq_io_cq { |
6762 |
++ struct io_cq icq; /* must be the first member */ |
6763 |
++ struct bfq_queue *bfqq[2]; |
6764 |
++ struct bfq_ttime ttime; |
6765 |
++ int ioprio; |
6766 |
++ |
6767 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
6768 |
++ uint64_t blkcg_id; /* the current blkcg ID */ |
6769 |
++#endif |
6770 |
++}; |
6771 |
++ |
6772 |
++enum bfq_device_speed { |
6773 |
++ BFQ_BFQD_FAST, |
6774 |
++ BFQ_BFQD_SLOW, |
6775 |
++}; |
6776 |
++ |
6777 |
++/** |
6778 |
++ * struct bfq_data - per device data structure. |
6779 |
++ * @queue: request queue for the managed device. |
6780 |
++ * @root_group: root bfq_group for the device. |
6781 |
++ * @active_numerous_groups: number of bfq_groups containing more than one |
6782 |
++ * active @bfq_entity. |
6783 |
++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by |
6784 |
++ * weight. Used to keep track of whether all @bfq_queues |
6785 |
++ * have the same weight. The tree contains one counter |
6786 |
++ * for each distinct weight associated to some active |
6787 |
++ * and not weight-raised @bfq_queue (see the comments to |
6788 |
++ * the functions bfq_weights_tree_[add|remove] for |
6789 |
++ * further details). |
6790 |
++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted |
6791 |
++ * by weight. Used to keep track of whether all |
6792 |
++ * @bfq_groups have the same weight. The tree contains |
6793 |
++ * one counter for each distinct weight associated to |
6794 |
++ * some active @bfq_group (see the comments to the |
6795 |
++ * functions bfq_weights_tree_[add|remove] for further |
6796 |
++ * details). |
6797 |
++ * @busy_queues: number of bfq_queues containing requests (including the |
6798 |
++ * queue in service, even if it is idling). |
6799 |
++ * @busy_in_flight_queues: number of @bfq_queues containing pending or |
6800 |
++ * in-flight requests, plus the @bfq_queue in |
6801 |
++ * service, even if idle but waiting for the |
6802 |
++ * possible arrival of its next sync request. This |
6803 |
++ * field is updated only if the device is rotational, |
6804 |
++ * but used only if the device is also NCQ-capable. |
6805 |
++ * The reason why the field is updated also for non- |
6806 |
++ * NCQ-capable rotational devices is related to the |
6807 |
++ * fact that the value of @hw_tag may be set also |
6808 |
++ * later than when busy_in_flight_queues may need to |
6809 |
++ * be incremented for the first time(s). Taking also |
6810 |
++ * this possibility into account, to avoid unbalanced |
6811 |
++ * increments/decrements, would imply more overhead |
6812 |
++ * than just updating busy_in_flight_queues |
6813 |
++ * regardless of the value of @hw_tag. |
6814 |
++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues |
6815 |
++ * (that is, seeky queues that expired |
6816 |
++ * for budget timeout at least once) |
6817 |
++ * containing pending or in-flight |
6818 |
++ * requests, including the in-service |
6819 |
++ * @bfq_queue if constantly seeky. This |
6820 |
++ * field is updated only if the device |
6821 |
++ * is rotational, but used only if the |
6822 |
++ * device is also NCQ-capable (see the |
6823 |
++ * comments to @busy_in_flight_queues). |
6824 |
++ * @wr_busy_queues: number of weight-raised busy @bfq_queues. |
6825 |
++ * @queued: number of queued requests. |
6826 |
++ * @rq_in_driver: number of requests dispatched and waiting for completion. |
6827 |
++ * @sync_flight: number of sync requests in the driver. |
6828 |
++ * @max_rq_in_driver: max number of reqs in driver in the last |
6829 |
++ * @hw_tag_samples completed requests. |
6830 |
++ * @hw_tag_samples: nr of samples used to calculate hw_tag. |
6831 |
++ * @hw_tag: flag set to one if the driver is showing a queueing behavior. |
6832 |
++ * @budgets_assigned: number of budgets assigned. |
6833 |
++ * @idle_slice_timer: timer set when idling for the next sequential request |
6834 |
++ * from the queue in service. |
6835 |
++ * @unplug_work: delayed work to restart dispatching on the request queue. |
6836 |
++ * @in_service_queue: bfq_queue in service. |
6837 |
++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. |
6838 |
++ * @last_position: on-disk position of the last served request. |
6839 |
++ * @last_budget_start: beginning of the last budget. |
6840 |
++ * @last_idling_start: beginning of the last idle slice. |
6841 |
++ * @peak_rate: peak transfer rate observed for a budget. |
6842 |
++ * @peak_rate_samples: number of samples used to calculate @peak_rate. |
6843 |
++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before |
6844 |
++ * rescheduling. |
6845 |
++ * @active_list: list of all the bfq_queues active on the device. |
6846 |
++ * @idle_list: list of all the bfq_queues idle on the device. |
6847 |
++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires |
6848 |
++ * requests are served in fifo order. |
6849 |
++ * @bfq_back_penalty: weight of backward seeks wrt forward ones. |
6850 |
++ * @bfq_back_max: maximum allowed backward seek. |
6851 |
++ * @bfq_slice_idle: maximum idling time. |
6852 |
++ * @bfq_user_max_budget: user-configured max budget value |
6853 |
++ * (0 for auto-tuning). |
6854 |
++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to |
6855 |
++ * async queues. |
6856 |
++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to |
6857 |
++ * prevent seeky queues from imposing long latencies on well |
6858 |
++ * behaved ones (this also implies that seeky queues cannot |
6859 |
++ * receive guarantees in the service domain; after a timeout |
6860 |
++ * they are charged for the whole allocated budget, to try |
6861 |
++ * to preserve a behavior reasonably fair among them, but |
6862 |
++ * without service-domain guarantees). |
6863 |
++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is |
6864 |
++ * no longer granted any weight-raising. |
6865 |
++ * @bfq_failed_cooperations: number of consecutive failed cooperation |
6866 |
++ * chances after which weight-raising is restored |
6867 |
++ * to a queue subject to more than bfq_coop_thresh |
6868 |
++ * queue merges. |
6869 |
++ * @bfq_requests_within_timer: number of consecutive requests that must be |
6870 |
++ * issued within the idle time slice to set |
6871 |
++ * again idling to a queue which was marked as |
6872 |
++ * non-I/O-bound (see the definition of the |
6873 |
++ * IO_bound flag for further details). |
6874 |
++ * @last_ins_in_burst: last time at which a queue entered the current |
6875 |
++ * burst of queues being activated shortly after |
6876 |
++ * each other; for more details about this and the |
6877 |
++ * following parameters related to a burst of |
6878 |
++ * activations, see the comments to the function |
6879 |
++ * @bfq_handle_burst. |
6880 |
++ * @bfq_burst_interval: reference time interval used to decide whether a |
6881 |
++ * queue has been activated shortly after |
6882 |
++ * @last_ins_in_burst. |
6883 |
++ * @burst_size: number of queues in the current burst of queue activations. |
6884 |
++ * @bfq_large_burst_thresh: maximum burst size above which the current |
6885 |
++ * queue-activation burst is deemed as 'large'. |
6886 |
++ * @large_burst: true if a large queue-activation burst is in progress. |
6887 |
++ * @burst_list: head of the burst list (as for the above fields, more details |
6888 |
++ * in the comments to the function bfq_handle_burst). |
6889 |
++ * @low_latency: if set to true, low-latency heuristics are enabled. |
6890 |
++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised |
6891 |
++ * queue is multiplied. |
6892 |
++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). |
6893 |
++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. |
6894 |
++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising |
6895 |
++ * may be reactivated for a queue (in jiffies). |
6896 |
++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals |
6897 |
++ * after which weight-raising may be |
6898 |
++ * reactivated for an already busy queue |
6899 |
++ * (in jiffies). |
6900 |
++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, |
6901 |
++ * in sectors per second. |
6902 |
++ * @RT_prod: cached value of the product R*T used for computing the maximum |
6903 |
++ * duration of the weight raising automatically. |
6904 |
++ * @device_speed: device-speed class for the low-latency heuristic. |
6905 |
++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. |
6906 |
++ * |
6907 |
++ * All the fields are protected by the @queue lock. |
6908 |
++ */ |
6909 |
++struct bfq_data { |
6910 |
++ struct request_queue *queue; |
6911 |
++ |
6912 |
++ struct bfq_group *root_group; |
6913 |
++ |
6914 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
6915 |
++ int active_numerous_groups; |
6916 |
++#endif |
6917 |
++ |
6918 |
++ struct rb_root queue_weights_tree; |
6919 |
++ struct rb_root group_weights_tree; |
6920 |
++ |
6921 |
++ int busy_queues; |
6922 |
++ int busy_in_flight_queues; |
6923 |
++ int const_seeky_busy_in_flight_queues; |
6924 |
++ int wr_busy_queues; |
6925 |
++ int queued; |
6926 |
++ int rq_in_driver; |
6927 |
++ int sync_flight; |
6928 |
++ |
6929 |
++ int max_rq_in_driver; |
6930 |
++ int hw_tag_samples; |
6931 |
++ int hw_tag; |
6932 |
++ |
6933 |
++ int budgets_assigned; |
6934 |
++ |
6935 |
++ struct timer_list idle_slice_timer; |
6936 |
++ struct work_struct unplug_work; |
6937 |
++ |
6938 |
++ struct bfq_queue *in_service_queue; |
6939 |
++ struct bfq_io_cq *in_service_bic; |
6940 |
++ |
6941 |
++ sector_t last_position; |
6942 |
++ |
6943 |
++ ktime_t last_budget_start; |
6944 |
++ ktime_t last_idling_start; |
6945 |
++ int peak_rate_samples; |
6946 |
++ u64 peak_rate; |
6947 |
++ int bfq_max_budget; |
6948 |
++ |
6949 |
++ struct list_head active_list; |
6950 |
++ struct list_head idle_list; |
6951 |
++ |
6952 |
++ unsigned int bfq_fifo_expire[2]; |
6953 |
++ unsigned int bfq_back_penalty; |
6954 |
++ unsigned int bfq_back_max; |
6955 |
++ unsigned int bfq_slice_idle; |
6956 |
++ u64 bfq_class_idle_last_service; |
6957 |
++ |
6958 |
++ int bfq_user_max_budget; |
6959 |
++ int bfq_max_budget_async_rq; |
6960 |
++ unsigned int bfq_timeout[2]; |
6961 |
++ |
6962 |
++ unsigned int bfq_coop_thresh; |
6963 |
++ unsigned int bfq_failed_cooperations; |
6964 |
++ unsigned int bfq_requests_within_timer; |
6965 |
++ |
6966 |
++ unsigned long last_ins_in_burst; |
6967 |
++ unsigned long bfq_burst_interval; |
6968 |
++ int burst_size; |
6969 |
++ unsigned long bfq_large_burst_thresh; |
6970 |
++ bool large_burst; |
6971 |
++ struct hlist_head burst_list; |
6972 |
++ |
6973 |
++ bool low_latency; |
6974 |
++ |
6975 |
++ /* parameters of the low_latency heuristics */ |
6976 |
++ unsigned int bfq_wr_coeff; |
6977 |
++ unsigned int bfq_wr_max_time; |
6978 |
++ unsigned int bfq_wr_rt_max_time; |
6979 |
++ unsigned int bfq_wr_min_idle_time; |
6980 |
++ unsigned long bfq_wr_min_inter_arr_async; |
6981 |
++ unsigned int bfq_wr_max_softrt_rate; |
6982 |
++ u64 RT_prod; |
6983 |
++ enum bfq_device_speed device_speed; |
6984 |
++ |
6985 |
++ struct bfq_queue oom_bfqq; |
6986 |
++}; |
6987 |
++ |
6988 |
++enum bfqq_state_flags { |
6989 |
++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ |
6990 |
++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ |
6991 |
++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ |
6992 |
++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ |
6993 |
++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ |
6994 |
++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ |
6995 |
++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ |
6996 |
++ BFQ_BFQQ_FLAG_IO_bound, /* |
6997 |
++ * bfqq has timed-out at least once |
6998 |
++ * having consumed at most 2/10 of |
6999 |
++ * its budget |
7000 |
++ */ |
7001 |
++ BFQ_BFQQ_FLAG_in_large_burst, /* |
7002 |
++ * bfqq activated in a large burst, |
7003 |
++ * see comments to bfq_handle_burst. |
7004 |
++ */ |
7005 |
++ BFQ_BFQQ_FLAG_constantly_seeky, /* |
7006 |
++ * bfqq has proved to be slow and |
7007 |
++ * seeky until budget timeout |
7008 |
++ */ |
7009 |
++ BFQ_BFQQ_FLAG_softrt_update, /* |
7010 |
++ * may need softrt-next-start |
7011 |
++ * update |
7012 |
++ */ |
7013 |
++}; |
7014 |
++ |
7015 |
++#define BFQ_BFQQ_FNS(name) \ |
7016 |
++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ |
7017 |
++{ \ |
7018 |
++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ |
7019 |
++} \ |
7020 |
++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ |
7021 |
++{ \ |
7022 |
++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ |
7023 |
++} \ |
7024 |
++static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ |
7025 |
++{ \ |
7026 |
++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ |
7027 |
++} |
7028 |
++ |
7029 |
++BFQ_BFQQ_FNS(busy); |
7030 |
++BFQ_BFQQ_FNS(wait_request); |
7031 |
++BFQ_BFQQ_FNS(must_alloc); |
7032 |
++BFQ_BFQQ_FNS(fifo_expire); |
7033 |
++BFQ_BFQQ_FNS(idle_window); |
7034 |
++BFQ_BFQQ_FNS(sync); |
7035 |
++BFQ_BFQQ_FNS(budget_new); |
7036 |
++BFQ_BFQQ_FNS(IO_bound); |
7037 |
++BFQ_BFQQ_FNS(in_large_burst); |
7038 |
++BFQ_BFQQ_FNS(constantly_seeky); |
7039 |
++BFQ_BFQQ_FNS(softrt_update); |
7040 |
++#undef BFQ_BFQQ_FNS |
7041 |
++ |
7042 |
++/* Logging facilities. */ |
7043 |
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ |
7044 |
++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) |
7045 |
++ |
7046 |
++#define bfq_log(bfqd, fmt, args...) \ |
7047 |
++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) |
7048 |
++ |
7049 |
++/* Expiration reasons. */ |
7050 |
++enum bfqq_expiration { |
7051 |
++ BFQ_BFQQ_TOO_IDLE = 0, /* |
7052 |
++ * queue has been idling for |
7053 |
++ * too long |
7054 |
++ */ |
7055 |
++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ |
7056 |
++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ |
7057 |
++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ |
7058 |
++}; |
7059 |
++ |
7060 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
7061 |
++ |
7062 |
++struct bfqg_stats { |
7063 |
++ /* total bytes transferred */ |
7064 |
++ struct blkg_rwstat service_bytes; |
7065 |
++ /* total IOs serviced, post merge */ |
7066 |
++ struct blkg_rwstat serviced; |
7067 |
++ /* number of ios merged */ |
7068 |
++ struct blkg_rwstat merged; |
7069 |
++ /* total time spent on device in ns, may not be accurate w/ queueing */ |
7070 |
++ struct blkg_rwstat service_time; |
7071 |
++ /* total time spent waiting in scheduler queue in ns */ |
7072 |
++ struct blkg_rwstat wait_time; |
7073 |
++ /* number of IOs queued up */ |
7074 |
++ struct blkg_rwstat queued; |
7075 |
++ /* total sectors transferred */ |
7076 |
++ struct blkg_stat sectors; |
7077 |
++ /* total disk time and nr sectors dispatched by this group */ |
7078 |
++ struct blkg_stat time; |
7079 |
++ /* time not charged to this cgroup */ |
7080 |
++ struct blkg_stat unaccounted_time; |
7081 |
++ /* sum of number of ios queued across all samples */ |
7082 |
++ struct blkg_stat avg_queue_size_sum; |
7083 |
++ /* count of samples taken for average */ |
7084 |
++ struct blkg_stat avg_queue_size_samples; |
7085 |
++ /* how many times this group has been removed from service tree */ |
7086 |
++ struct blkg_stat dequeue; |
7087 |
++ /* total time spent waiting for it to be assigned a timeslice. */ |
7088 |
++ struct blkg_stat group_wait_time; |
7089 |
++ /* time spent idling for this blkcg_gq */ |
7090 |
++ struct blkg_stat idle_time; |
7091 |
++ /* total time with empty current active q with other requests queued */ |
7092 |
++ struct blkg_stat empty_time; |
7093 |
++ /* fields after this shouldn't be cleared on stat reset */ |
7094 |
++ uint64_t start_group_wait_time; |
7095 |
++ uint64_t start_idle_time; |
7096 |
++ uint64_t start_empty_time; |
7097 |
++ uint16_t flags; |
7098 |
++}; |
7099 |
++ |
7100 |
++/* |
7101 |
++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. |
7102 |
++ * |
7103 |
++ * @ps: @blkcg_policy_storage that this structure inherits |
7104 |
++ * @weight: weight of the bfq_group |
7105 |
++ */ |
7106 |
++struct bfq_group_data { |
7107 |
++ /* must be the first member */ |
7108 |
++ struct blkcg_policy_data pd; |
7109 |
++ |
7110 |
++ unsigned short weight; |
7111 |
++}; |
7112 |
++ |
7113 |
++/** |
7114 |
++ * struct bfq_group - per (device, cgroup) data structure. |
7115 |
++ * @entity: schedulable entity to insert into the parent group sched_data. |
7116 |
++ * @sched_data: own sched_data, to contain child entities (they may be |
7117 |
++ * both bfq_queues and bfq_groups). |
7118 |
++ * @bfqd: the bfq_data for the device this group acts upon. |
7119 |
++ * @async_bfqq: array of async queues for all the tasks belonging to |
7120 |
++ * the group, one queue per ioprio value per ioprio_class, |
7121 |
++ * except for the idle class that has only one queue. |
7122 |
++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). |
7123 |
++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used |
7124 |
++ * to avoid too many special cases during group creation/ |
7125 |
++ * migration. |
7126 |
++ * @active_entities: number of active entities belonging to the group; |
7127 |
++ * unused for the root group. Used to know whether there |
7128 |
++ * are groups with more than one active @bfq_entity |
7129 |
++ * (see the comments to the function |
7130 |
++ * bfq_bfqq_must_not_expire()). |
7131 |
++ * |
7132 |
++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup |
7133 |
++ * there is a set of bfq_groups, each one collecting the lower-level |
7134 |
++ * entities belonging to the group that are acting on the same device. |
7135 |
++ * |
7136 |
++ * Locking works as follows: |
7137 |
++ * o @bfqd is protected by the queue lock, RCU is used to access it |
7138 |
++ * from the readers. |
7139 |
++ * o All the other fields are protected by the @bfqd queue lock. |
7140 |
++ */ |
7141 |
++struct bfq_group { |
7142 |
++ /* must be the first member */ |
7143 |
++ struct blkg_policy_data pd; |
7144 |
++ |
7145 |
++ struct bfq_entity entity; |
7146 |
++ struct bfq_sched_data sched_data; |
7147 |
++ |
7148 |
++ void *bfqd; |
7149 |
++ |
7150 |
++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
7151 |
++ struct bfq_queue *async_idle_bfqq; |
7152 |
++ |
7153 |
++ struct bfq_entity *my_entity; |
7154 |
++ |
7155 |
++ int active_entities; |
7156 |
++ |
7157 |
++ struct bfqg_stats stats; |
7158 |
++ struct bfqg_stats dead_stats; /* stats pushed from dead children */ |
7159 |
++}; |
7160 |
++ |
7161 |
++#else |
7162 |
++struct bfq_group { |
7163 |
++ struct bfq_sched_data sched_data; |
7164 |
++ |
7165 |
++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
7166 |
++ struct bfq_queue *async_idle_bfqq; |
7167 |
++}; |
7168 |
++#endif |
7169 |
++ |
7170 |
++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); |
7171 |
++ |
7172 |
++static struct bfq_service_tree * |
7173 |
++bfq_entity_service_tree(struct bfq_entity *entity) |
7174 |
++{ |
7175 |
++ struct bfq_sched_data *sched_data = entity->sched_data; |
7176 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
7177 |
++ unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : |
7178 |
++ BFQ_DEFAULT_GRP_CLASS; |
7179 |
++ |
7180 |
++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); |
7181 |
++ BUG_ON(sched_data == NULL); |
7182 |
++ |
7183 |
++ return sched_data->service_tree + idx; |
7184 |
++} |
7185 |
++ |
7186 |
++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) |
7187 |
++{ |
7188 |
++ return bic->bfqq[is_sync]; |
7189 |
++} |
7190 |
++ |
7191 |
++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, |
7192 |
++ bool is_sync) |
7193 |
++{ |
7194 |
++ bic->bfqq[is_sync] = bfqq; |
7195 |
++} |
7196 |
++ |
7197 |
++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) |
7198 |
++{ |
7199 |
++ return bic->icq.q->elevator->elevator_data; |
7200 |
++} |
7201 |
++ |
7202 |
++/** |
7203 |
++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. |
7204 |
++ * @ptr: a pointer to a bfqd. |
7205 |
++ * @flags: storage for the flags to be saved. |
7206 |
++ * |
7207 |
++ * This function allows bfqg->bfqd to be protected by the |
7208 |
++ * queue lock of the bfqd they reference; the pointer is dereferenced |
7209 |
++ * under RCU, so the storage for bfqd is assured to be safe as long |
7210 |
++ * as the RCU read side critical section does not end. After the |
7211 |
++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be |
7212 |
++ * sure that no other writer accessed it. If we raced with a writer, |
7213 |
++ * the function returns NULL, with the queue unlocked, otherwise it |
7214 |
++ * returns the dereferenced pointer, with the queue locked. |
7215 |
++ */ |
7216 |
++static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) |
7217 |
++{ |
7218 |
++ struct bfq_data *bfqd; |
7219 |
++ |
7220 |
++ rcu_read_lock(); |
7221 |
++ bfqd = rcu_dereference(*(struct bfq_data **)ptr); |
7222 |
++ |
7223 |
++ if (bfqd != NULL) { |
7224 |
++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); |
7225 |
++ if (ptr == NULL) |
7226 |
++ printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); |
7227 |
++ else if (*ptr == bfqd) |
7228 |
++ goto out; |
7229 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
7230 |
++ } |
7231 |
++ |
7232 |
++ bfqd = NULL; |
7233 |
++out: |
7234 |
++ rcu_read_unlock(); |
7235 |
++ return bfqd; |
7236 |
++} |
7237 |
++ |
7238 |
++static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) |
7239 |
++{ |
7240 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
7241 |
++} |
7242 |
++ |
7243 |
++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); |
7244 |
++static void bfq_put_queue(struct bfq_queue *bfqq); |
7245 |
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); |
7246 |
++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
7247 |
++ struct bio *bio, int is_sync, |
7248 |
++ struct bfq_io_cq *bic, gfp_t gfp_mask); |
7249 |
++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, |
7250 |
++ struct bfq_group *bfqg); |
7251 |
++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); |
7252 |
++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
7253 |
++ |
7254 |
++#endif /* _BFQ_H */ |
7255 |
+-- |
7256 |
+1.9.1 |
7257 |
+ |
7258 |
|
7259 |
diff --git a/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch |
7260 |
new file mode 100644 |
7261 |
index 0000000..eb23acc |
7262 |
--- /dev/null |
7263 |
+++ b/5003_block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for-4.7.patch |
7264 |
@@ -0,0 +1,1101 @@ |
7265 |
+From d93e55da4df8c5e7c33379780ad7d2fdb02e0568 Mon Sep 17 00:00:00 2001 |
7266 |
+From: Mauro Andreolini <mauro.andreolini@×××××××.it> |
7267 |
+Date: Sun, 6 Sep 2015 16:09:05 +0200 |
7268 |
+Subject: [PATCH 3/4] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for |
7269 |
+ 4.7.0 |
7270 |
+ |
7271 |
+A set of processes may happen to perform interleaved reads, i.e., requests |
7272 |
+whose union would give rise to a sequential read pattern. There are two |
7273 |
+typical cases: in the first case, processes read fixed-size chunks of |
7274 |
+data at a fixed distance from each other, while in the second case processes |
7275 |
+may read variable-size chunks at variable distances. The latter case occurs |
7276 |
+for example with QEMU, which splits the I/O generated by the guest into |
7277 |
+multiple chunks, and lets these chunks be served by a pool of cooperating |
7278 |
+processes, iteratively assigning the next chunk of I/O to the first |
7279 |
+available process. CFQ uses actual queue merging for the first type of |
7280 |
+processes, whereas it uses preemption to get a sequential read pattern out |
7281 |
+of the read requests performed by the second type of processes. In the end |
7282 |
+it uses two different mechanisms to achieve the same goal: boosting the |
7283 |
+throughput with interleaved I/O. |
7284 |
+ |
7285 |
+This patch introduces Early Queue Merge (EQM), a unified mechanism to get a |
7286 |
+sequential read pattern with both types of processes. The main idea is |
7287 |
+checking newly arrived requests against the next request of the active queue |
7288 |
+both in case of actual request insert and in case of request merge. By doing |
7289 |
+so, both the types of processes can be handled by just merging their queues. |
7290 |
+EQM is then simpler and more compact than the pair of mechanisms used in |
7291 |
+CFQ. |
7292 |
+ |
7293 |
+Finally, EQM also preserves the typical low-latency properties of BFQ, by |
7294 |
+properly restoring the weight-raising state of a queue when it gets back to |
7295 |
+a non-merged state. |
7296 |
+ |
7297 |
+Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it> |
7298 |
+Signed-off-by: Arianna Avanzini <avanzini@××××××.com> |
7299 |
+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
7300 |
+Signed-off-by: Linus Walleij <linus.walleij@××××××.org> |
7301 |
+--- |
7302 |
+ block/bfq-cgroup.c | 4 + |
7303 |
+ block/bfq-iosched.c | 687 ++++++++++++++++++++++++++++++++++++++++++++++++++-- |
7304 |
+ block/bfq.h | 66 +++++ |
7305 |
+ 3 files changed, 743 insertions(+), 14 deletions(-) |
7306 |
+ |
7307 |
+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c |
7308 |
+index 8610cd6..5ee99ec 100644 |
7309 |
+--- a/block/bfq-cgroup.c |
7310 |
++++ b/block/bfq-cgroup.c |
7311 |
+@@ -437,6 +437,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) |
7312 |
+ */ |
7313 |
+ bfqg->bfqd = bfqd; |
7314 |
+ bfqg->active_entities = 0; |
7315 |
++ bfqg->rq_pos_tree = RB_ROOT; |
7316 |
+ } |
7317 |
+ |
7318 |
+ static void bfq_pd_free(struct blkg_policy_data *pd) |
7319 |
+@@ -530,6 +531,8 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
7320 |
+ return bfqg; |
7321 |
+ } |
7322 |
+ |
7323 |
++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
7324 |
++ |
7325 |
+ /** |
7326 |
+ * bfq_bfqq_move - migrate @bfqq to @bfqg. |
7327 |
+ * @bfqd: queue descriptor. |
7328 |
+@@ -577,6 +580,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
7329 |
+ bfqg_get(bfqg); |
7330 |
+ |
7331 |
+ if (busy) { |
7332 |
++ bfq_pos_tree_add_move(bfqd, bfqq); |
7333 |
+ if (resume) |
7334 |
+ bfq_activate_bfqq(bfqd, bfqq); |
7335 |
+ } |
7336 |
+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
7337 |
+index f9787a6..d1f648d 100644 |
7338 |
+--- a/block/bfq-iosched.c |
7339 |
++++ b/block/bfq-iosched.c |
7340 |
+@@ -296,6 +296,72 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, |
7341 |
+ } |
7342 |
+ } |
7343 |
+ |
7344 |
++static struct bfq_queue * |
7345 |
++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, |
7346 |
++ sector_t sector, struct rb_node **ret_parent, |
7347 |
++ struct rb_node ***rb_link) |
7348 |
++{ |
7349 |
++ struct rb_node **p, *parent; |
7350 |
++ struct bfq_queue *bfqq = NULL; |
7351 |
++ |
7352 |
++ parent = NULL; |
7353 |
++ p = &root->rb_node; |
7354 |
++ while (*p) { |
7355 |
++ struct rb_node **n; |
7356 |
++ |
7357 |
++ parent = *p; |
7358 |
++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
7359 |
++ |
7360 |
++ /* |
7361 |
++ * Sort strictly based on sector. Smallest to the left, |
7362 |
++ * largest to the right. |
7363 |
++ */ |
7364 |
++ if (sector > blk_rq_pos(bfqq->next_rq)) |
7365 |
++ n = &(*p)->rb_right; |
7366 |
++ else if (sector < blk_rq_pos(bfqq->next_rq)) |
7367 |
++ n = &(*p)->rb_left; |
7368 |
++ else |
7369 |
++ break; |
7370 |
++ p = n; |
7371 |
++ bfqq = NULL; |
7372 |
++ } |
7373 |
++ |
7374 |
++ *ret_parent = parent; |
7375 |
++ if (rb_link) |
7376 |
++ *rb_link = p; |
7377 |
++ |
7378 |
++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", |
7379 |
++ (long long unsigned)sector, |
7380 |
++ bfqq ? bfqq->pid : 0); |
7381 |
++ |
7382 |
++ return bfqq; |
7383 |
++} |
7384 |
++ |
7385 |
++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
7386 |
++{ |
7387 |
++ struct rb_node **p, *parent; |
7388 |
++ struct bfq_queue *__bfqq; |
7389 |
++ |
7390 |
++ if (bfqq->pos_root) { |
7391 |
++ rb_erase(&bfqq->pos_node, bfqq->pos_root); |
7392 |
++ bfqq->pos_root = NULL; |
7393 |
++ } |
7394 |
++ |
7395 |
++ if (bfq_class_idle(bfqq)) |
7396 |
++ return; |
7397 |
++ if (!bfqq->next_rq) |
7398 |
++ return; |
7399 |
++ |
7400 |
++ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; |
7401 |
++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, |
7402 |
++ blk_rq_pos(bfqq->next_rq), &parent, &p); |
7403 |
++ if (!__bfqq) { |
7404 |
++ rb_link_node(&bfqq->pos_node, parent, p); |
7405 |
++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); |
7406 |
++ } else |
7407 |
++ bfqq->pos_root = NULL; |
7408 |
++} |
7409 |
++ |
7410 |
+ /* |
7411 |
+ * Tell whether there are active queues or groups with differentiated weights. |
7412 |
+ */ |
7413 |
+@@ -528,6 +594,57 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) |
7414 |
+ return dur; |
7415 |
+ } |
7416 |
+ |
7417 |
++static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) |
7418 |
++{ |
7419 |
++ return bfqq->bic ? bfqq->bic->cooperations : 0; |
7420 |
++} |
7421 |
++ |
7422 |
++static void |
7423 |
++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
7424 |
++{ |
7425 |
++ if (bic->saved_idle_window) |
7426 |
++ bfq_mark_bfqq_idle_window(bfqq); |
7427 |
++ else |
7428 |
++ bfq_clear_bfqq_idle_window(bfqq); |
7429 |
++ if (bic->saved_IO_bound) |
7430 |
++ bfq_mark_bfqq_IO_bound(bfqq); |
7431 |
++ else |
7432 |
++ bfq_clear_bfqq_IO_bound(bfqq); |
7433 |
++ /* Assuming that the flag in_large_burst is already correctly set */ |
7434 |
++ if (bic->wr_time_left && bfqq->bfqd->low_latency && |
7435 |
++ !bfq_bfqq_in_large_burst(bfqq) && |
7436 |
++ bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { |
7437 |
++ /* |
7438 |
++ * Start a weight raising period with the duration given by |
7439 |
++ * the raising_time_left snapshot. |
7440 |
++ */ |
7441 |
++ if (bfq_bfqq_busy(bfqq)) |
7442 |
++ bfqq->bfqd->wr_busy_queues++; |
7443 |
++ bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; |
7444 |
++ bfqq->wr_cur_max_time = bic->wr_time_left; |
7445 |
++ bfqq->last_wr_start_finish = jiffies; |
7446 |
++ bfqq->entity.prio_changed = 1; |
7447 |
++ } |
7448 |
++ /* |
7449 |
++ * Clear wr_time_left to prevent bfq_bfqq_save_state() from |
7450 |
++ * getting confused about the queue's need of a weight-raising |
7451 |
++ * period. |
7452 |
++ */ |
7453 |
++ bic->wr_time_left = 0; |
7454 |
++} |
7455 |
++ |
7456 |
++static int bfqq_process_refs(struct bfq_queue *bfqq) |
7457 |
++{ |
7458 |
++ int process_refs, io_refs; |
7459 |
++ |
7460 |
++ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); |
7461 |
++ |
7462 |
++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
7463 |
++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
7464 |
++ BUG_ON(process_refs < 0); |
7465 |
++ return process_refs; |
7466 |
++} |
7467 |
++ |
7468 |
+ /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ |
7469 |
+ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
7470 |
+ { |
7471 |
+@@ -764,8 +881,14 @@ static void bfq_add_request(struct request *rq) |
7472 |
+ BUG_ON(!next_rq); |
7473 |
+ bfqq->next_rq = next_rq; |
7474 |
+ |
7475 |
++ /* |
7476 |
++ * Adjust priority tree position, if next_rq changes. |
7477 |
++ */ |
7478 |
++ if (prev != bfqq->next_rq) |
7479 |
++ bfq_pos_tree_add_move(bfqd, bfqq); |
7480 |
++ |
7481 |
+ if (!bfq_bfqq_busy(bfqq)) { |
7482 |
+- bool soft_rt, in_burst, |
7483 |
++ bool soft_rt, coop_or_in_burst, |
7484 |
+ idle_for_long_time = time_is_before_jiffies( |
7485 |
+ bfqq->budget_timeout + |
7486 |
+ bfqd->bfq_wr_min_idle_time); |
7487 |
+@@ -793,11 +916,12 @@ static void bfq_add_request(struct request *rq) |
7488 |
+ bfqd->last_ins_in_burst = jiffies; |
7489 |
+ } |
7490 |
+ |
7491 |
+- in_burst = bfq_bfqq_in_large_burst(bfqq); |
7492 |
++ coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || |
7493 |
++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; |
7494 |
+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && |
7495 |
+- !in_burst && |
7496 |
++ !coop_or_in_burst && |
7497 |
+ time_is_before_jiffies(bfqq->soft_rt_next_start); |
7498 |
+- interactive = !in_burst && idle_for_long_time; |
7499 |
++ interactive = !coop_or_in_burst && idle_for_long_time; |
7500 |
+ entity->budget = max_t(unsigned long, bfqq->max_budget, |
7501 |
+ bfq_serv_to_charge(next_rq, bfqq)); |
7502 |
+ |
7503 |
+@@ -816,6 +940,9 @@ static void bfq_add_request(struct request *rq) |
7504 |
+ if (!bfqd->low_latency) |
7505 |
+ goto add_bfqq_busy; |
7506 |
+ |
7507 |
++ if (bfq_bfqq_just_split(bfqq)) |
7508 |
++ goto set_prio_changed; |
7509 |
++ |
7510 |
+ /* |
7511 |
+ * If the queue: |
7512 |
+ * - is not being boosted, |
7513 |
+@@ -840,7 +967,7 @@ static void bfq_add_request(struct request *rq) |
7514 |
+ } else if (old_wr_coeff > 1) { |
7515 |
+ if (interactive) |
7516 |
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
7517 |
+- else if (in_burst || |
7518 |
++ else if (coop_or_in_burst || |
7519 |
+ (bfqq->wr_cur_max_time == |
7520 |
+ bfqd->bfq_wr_rt_max_time && |
7521 |
+ !soft_rt)) { |
7522 |
+@@ -905,6 +1032,7 @@ static void bfq_add_request(struct request *rq) |
7523 |
+ bfqd->bfq_wr_rt_max_time; |
7524 |
+ } |
7525 |
+ } |
7526 |
++set_prio_changed: |
7527 |
+ if (old_wr_coeff != bfqq->wr_coeff) |
7528 |
+ entity->prio_changed = 1; |
7529 |
+ add_bfqq_busy: |
7530 |
+@@ -1047,6 +1175,15 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, |
7531 |
+ bfqd->last_position); |
7532 |
+ BUG_ON(!next_rq); |
7533 |
+ bfqq->next_rq = next_rq; |
7534 |
++ /* |
7535 |
++ * If next_rq changes, update both the queue's budget to |
7536 |
++ * fit the new request and the queue's position in its |
7537 |
++ * rq_pos_tree. |
7538 |
++ */ |
7539 |
++ if (prev != bfqq->next_rq) { |
7540 |
++ bfq_updated_next_req(bfqd, bfqq); |
7541 |
++ bfq_pos_tree_add_move(bfqd, bfqq); |
7542 |
++ } |
7543 |
+ } |
7544 |
+ } |
7545 |
+ |
7546 |
+@@ -1129,11 +1266,346 @@ static void bfq_end_wr(struct bfq_data *bfqd) |
7547 |
+ spin_unlock_irq(bfqd->queue->queue_lock); |
7548 |
+ } |
7549 |
+ |
7550 |
++static sector_t bfq_io_struct_pos(void *io_struct, bool request) |
7551 |
++{ |
7552 |
++ if (request) |
7553 |
++ return blk_rq_pos(io_struct); |
7554 |
++ else |
7555 |
++ return ((struct bio *)io_struct)->bi_iter.bi_sector; |
7556 |
++} |
7557 |
++ |
7558 |
++static int bfq_rq_close_to_sector(void *io_struct, bool request, |
7559 |
++ sector_t sector) |
7560 |
++{ |
7561 |
++ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= |
7562 |
++ BFQQ_SEEK_THR; |
7563 |
++} |
7564 |
++ |
7565 |
++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, |
7566 |
++ struct bfq_queue *bfqq, |
7567 |
++ sector_t sector) |
7568 |
++{ |
7569 |
++ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; |
7570 |
++ struct rb_node *parent, *node; |
7571 |
++ struct bfq_queue *__bfqq; |
7572 |
++ |
7573 |
++ if (RB_EMPTY_ROOT(root)) |
7574 |
++ return NULL; |
7575 |
++ |
7576 |
++ /* |
7577 |
++ * First, if we find a request starting at the end of the last |
7578 |
++ * request, choose it. |
7579 |
++ */ |
7580 |
++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); |
7581 |
++ if (__bfqq) |
7582 |
++ return __bfqq; |
7583 |
++ |
7584 |
++ /* |
7585 |
++ * If the exact sector wasn't found, the parent of the NULL leaf |
7586 |
++ * will contain the closest sector (rq_pos_tree sorted by |
7587 |
++ * next_request position). |
7588 |
++ */ |
7589 |
++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
7590 |
++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) |
7591 |
++ return __bfqq; |
7592 |
++ |
7593 |
++ if (blk_rq_pos(__bfqq->next_rq) < sector) |
7594 |
++ node = rb_next(&__bfqq->pos_node); |
7595 |
++ else |
7596 |
++ node = rb_prev(&__bfqq->pos_node); |
7597 |
++ if (!node) |
7598 |
++ return NULL; |
7599 |
++ |
7600 |
++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); |
7601 |
++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) |
7602 |
++ return __bfqq; |
7603 |
++ |
7604 |
++ return NULL; |
7605 |
++} |
7606 |
++ |
7607 |
++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, |
7608 |
++ struct bfq_queue *cur_bfqq, |
7609 |
++ sector_t sector) |
7610 |
++{ |
7611 |
++ struct bfq_queue *bfqq; |
7612 |
++ |
7613 |
++ /* |
7614 |
++ * We shall notice if some of the queues are cooperating, |
7615 |
++ * e.g., working closely on the same area of the device. In |
7616 |
++ * that case, we can group them together and: 1) don't waste |
7617 |
++ * time idling, and 2) serve the union of their requests in |
7618 |
++ * the best possible order for throughput. |
7619 |
++ */ |
7620 |
++ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); |
7621 |
++ if (!bfqq || bfqq == cur_bfqq) |
7622 |
++ return NULL; |
7623 |
++ |
7624 |
++ return bfqq; |
7625 |
++} |
7626 |
++ |
7627 |
++static struct bfq_queue * |
7628 |
++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
7629 |
++{ |
7630 |
++ int process_refs, new_process_refs; |
7631 |
++ struct bfq_queue *__bfqq; |
7632 |
++ |
7633 |
++ /* |
7634 |
++ * If there are no process references on the new_bfqq, then it is |
7635 |
++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
7636 |
++ * may have dropped their last reference (not just their last process |
7637 |
++ * reference). |
7638 |
++ */ |
7639 |
++ if (!bfqq_process_refs(new_bfqq)) |
7640 |
++ return NULL; |
7641 |
++ |
7642 |
++ /* Avoid a circular list and skip interim queue merges. */ |
7643 |
++ while ((__bfqq = new_bfqq->new_bfqq)) { |
7644 |
++ if (__bfqq == bfqq) |
7645 |
++ return NULL; |
7646 |
++ new_bfqq = __bfqq; |
7647 |
++ } |
7648 |
++ |
7649 |
++ process_refs = bfqq_process_refs(bfqq); |
7650 |
++ new_process_refs = bfqq_process_refs(new_bfqq); |
7651 |
++ /* |
7652 |
++ * If the process for the bfqq has gone away, there is no |
7653 |
++ * sense in merging the queues. |
7654 |
++ */ |
7655 |
++ if (process_refs == 0 || new_process_refs == 0) |
7656 |
++ return NULL; |
7657 |
++ |
7658 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
7659 |
++ new_bfqq->pid); |
7660 |
++ |
7661 |
++ /* |
7662 |
++ * Merging is just a redirection: the requests of the process |
7663 |
++ * owning one of the two queues are redirected to the other queue. |
7664 |
++ * The latter queue, in its turn, is set as shared if this is the |
7665 |
++ * first time that the requests of some process are redirected to |
7666 |
++ * it. |
7667 |
++ * |
7668 |
++ * We redirect bfqq to new_bfqq and not the opposite, because we |
7669 |
++ * are in the context of the process owning bfqq, hence we have |
7670 |
++ * the io_cq of this process. So we can immediately configure this |
7671 |
++ * io_cq to redirect the requests of the process to new_bfqq. |
7672 |
++ * |
7673 |
++ * NOTE, even if new_bfqq coincides with the in-service queue, the |
7674 |
++ * io_cq of new_bfqq is not available, because, if the in-service |
7675 |
++ * queue is shared, bfqd->in_service_bic may not point to the |
7676 |
++ * io_cq of the in-service queue. |
7677 |
++ * Redirecting the requests of the process owning bfqq to the |
7678 |
++ * currently in-service queue is in any case the best option, as |
7679 |
++ * we feed the in-service queue with new requests close to the |
7680 |
++ * last request served and, by doing so, hopefully increase the |
7681 |
++ * throughput. |
7682 |
++ */ |
7683 |
++ bfqq->new_bfqq = new_bfqq; |
7684 |
++ atomic_add(process_refs, &new_bfqq->ref); |
7685 |
++ return new_bfqq; |
7686 |
++} |
7687 |
++ |
7688 |
++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, |
7689 |
++ struct bfq_queue *new_bfqq) |
7690 |
++{ |
7691 |
++ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || |
7692 |
++ (bfqq->ioprio_class != new_bfqq->ioprio_class)) |
7693 |
++ return false; |
7694 |
++ |
7695 |
++ /* |
7696 |
++ * If either of the queues has already been detected as seeky, |
7697 |
++ * then merging it with the other queue is unlikely to lead to |
7698 |
++ * sequential I/O. |
7699 |
++ */ |
7700 |
++ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) |
7701 |
++ return false; |
7702 |
++ |
7703 |
++ /* |
7704 |
++ * Interleaved I/O is known to be done by (some) applications |
7705 |
++ * only for reads, so it does not make sense to merge async |
7706 |
++ * queues. |
7707 |
++ */ |
7708 |
++ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) |
7709 |
++ return false; |
7710 |
++ |
7711 |
++ return true; |
7712 |
++} |
7713 |
++ |
7714 |
++/* |
7715 |
++ * Attempt to schedule a merge of bfqq with the currently in-service queue |
7716 |
++ * or with a close queue among the scheduled queues. |
7717 |
++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue |
7718 |
++ * structure otherwise. |
7719 |
++ * |
7720 |
++ * The OOM queue is not allowed to participate to cooperation: in fact, since |
7721 |
++ * the requests temporarily redirected to the OOM queue could be redirected |
7722 |
++ * again to dedicated queues at any time, the state needed to correctly |
7723 |
++ * handle merging with the OOM queue would be quite complex and expensive |
7724 |
++ * to maintain. Besides, in such a critical condition as an out of memory, |
7725 |
++ * the benefits of queue merging may be little relevant, or even negligible. |
7726 |
++ */ |
7727 |
++static struct bfq_queue * |
7728 |
++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
7729 |
++ void *io_struct, bool request) |
7730 |
++{ |
7731 |
++ struct bfq_queue *in_service_bfqq, *new_bfqq; |
7732 |
++ |
7733 |
++ if (bfqq->new_bfqq) |
7734 |
++ return bfqq->new_bfqq; |
7735 |
++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) |
7736 |
++ return NULL; |
7737 |
++ /* If device has only one backlogged bfq_queue, don't search. */ |
7738 |
++ if (bfqd->busy_queues == 1) |
7739 |
++ return NULL; |
7740 |
++ |
7741 |
++ in_service_bfqq = bfqd->in_service_queue; |
7742 |
++ |
7743 |
++ if (!in_service_bfqq || in_service_bfqq == bfqq || |
7744 |
++ !bfqd->in_service_bic || |
7745 |
++ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) |
7746 |
++ goto check_scheduled; |
7747 |
++ |
7748 |
++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && |
7749 |
++ bfqq->entity.parent == in_service_bfqq->entity.parent && |
7750 |
++ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { |
7751 |
++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); |
7752 |
++ if (new_bfqq) |
7753 |
++ return new_bfqq; |
7754 |
++ } |
7755 |
++ /* |
7756 |
++ * Check whether there is a cooperator among currently scheduled |
7757 |
++ * queues. The only thing we need is that the bio/request is not |
7758 |
++ * NULL, as we need it to establish whether a cooperator exists. |
7759 |
++ */ |
7760 |
++check_scheduled: |
7761 |
++ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, |
7762 |
++ bfq_io_struct_pos(io_struct, request)); |
7763 |
++ |
7764 |
++ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); |
7765 |
++ |
7766 |
++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && |
7767 |
++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) |
7768 |
++ return bfq_setup_merge(bfqq, new_bfqq); |
7769 |
++ |
7770 |
++ return NULL; |
7771 |
++} |
7772 |
++ |
7773 |
++static void bfq_bfqq_save_state(struct bfq_queue *bfqq) |
7774 |
++{ |
7775 |
++ /* |
7776 |
++ * If !bfqq->bic, the queue is already shared or its requests |
7777 |
++ * have already been redirected to a shared queue; both idle window |
7778 |
++ * and weight raising state have already been saved. Do nothing. |
7779 |
++ */ |
7780 |
++ if (!bfqq->bic) |
7781 |
++ return; |
7782 |
++ if (bfqq->bic->wr_time_left) |
7783 |
++ /* |
7784 |
++ * This is the queue of a just-started process, and would |
7785 |
++ * deserve weight raising: we set wr_time_left to the full |
7786 |
++ * weight-raising duration to trigger weight-raising when |
7787 |
++ * and if the queue is split and the first request of the |
7788 |
++ * queue is enqueued. |
7789 |
++ */ |
7790 |
++ bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); |
7791 |
++ else if (bfqq->wr_coeff > 1) { |
7792 |
++ unsigned long wr_duration = |
7793 |
++ jiffies - bfqq->last_wr_start_finish; |
7794 |
++ /* |
7795 |
++ * It may happen that a queue's weight raising period lasts |
7796 |
++ * longer than its wr_cur_max_time, as weight raising is |
7797 |
++ * handled only when a request is enqueued or dispatched (it |
7798 |
++ * does not use any timer). If the weight raising period is |
7799 |
++ * about to end, don't save it. |
7800 |
++ */ |
7801 |
++ if (bfqq->wr_cur_max_time <= wr_duration) |
7802 |
++ bfqq->bic->wr_time_left = 0; |
7803 |
++ else |
7804 |
++ bfqq->bic->wr_time_left = |
7805 |
++ bfqq->wr_cur_max_time - wr_duration; |
7806 |
++ /* |
7807 |
++ * The bfq_queue is becoming shared or the requests of the |
7808 |
++ * process owning the queue are being redirected to a shared |
7809 |
++ * queue. Stop the weight raising period of the queue, as in |
7810 |
++ * both cases it should not be owned by an interactive or |
7811 |
++ * soft real-time application. |
7812 |
++ */ |
7813 |
++ bfq_bfqq_end_wr(bfqq); |
7814 |
++ } else |
7815 |
++ bfqq->bic->wr_time_left = 0; |
7816 |
++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); |
7817 |
++ bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); |
7818 |
++ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); |
7819 |
++ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); |
7820 |
++ bfqq->bic->cooperations++; |
7821 |
++ bfqq->bic->failed_cooperations = 0; |
7822 |
++} |
7823 |
++ |
7824 |
++static void bfq_get_bic_reference(struct bfq_queue *bfqq) |
7825 |
++{ |
7826 |
++ /* |
7827 |
++ * If bfqq->bic has a non-NULL value, the bic to which it belongs |
7828 |
++ * is about to begin using a shared bfq_queue. |
7829 |
++ */ |
7830 |
++ if (bfqq->bic) |
7831 |
++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); |
7832 |
++} |
7833 |
++ |
7834 |
++static void |
7835 |
++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
7836 |
++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
7837 |
++{ |
7838 |
++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", |
7839 |
++ (long unsigned)new_bfqq->pid); |
7840 |
++ /* Save weight raising and idle window of the merged queues */ |
7841 |
++ bfq_bfqq_save_state(bfqq); |
7842 |
++ bfq_bfqq_save_state(new_bfqq); |
7843 |
++ if (bfq_bfqq_IO_bound(bfqq)) |
7844 |
++ bfq_mark_bfqq_IO_bound(new_bfqq); |
7845 |
++ bfq_clear_bfqq_IO_bound(bfqq); |
7846 |
++ /* |
7847 |
++ * Grab a reference to the bic, to prevent it from being destroyed |
7848 |
++ * before being possibly touched by a bfq_split_bfqq(). |
7849 |
++ */ |
7850 |
++ bfq_get_bic_reference(bfqq); |
7851 |
++ bfq_get_bic_reference(new_bfqq); |
7852 |
++ /* |
7853 |
++ * Merge queues (that is, let bic redirect its requests to new_bfqq) |
7854 |
++ */ |
7855 |
++ bic_set_bfqq(bic, new_bfqq, 1); |
7856 |
++ bfq_mark_bfqq_coop(new_bfqq); |
7857 |
++ /* |
7858 |
++ * new_bfqq now belongs to at least two bics (it is a shared queue): |
7859 |
++ * set new_bfqq->bic to NULL. bfqq either: |
7860 |
++ * - does not belong to any bic any more, and hence bfqq->bic must |
7861 |
++ * be set to NULL, or |
7862 |
++ * - is a queue whose owning bics have already been redirected to a |
7863 |
++ * different queue, hence the queue is destined to not belong to |
7864 |
++ * any bic soon and bfqq->bic is already NULL (therefore the next |
7865 |
++ * assignment causes no harm). |
7866 |
++ */ |
7867 |
++ new_bfqq->bic = NULL; |
7868 |
++ bfqq->bic = NULL; |
7869 |
++ bfq_put_queue(bfqq); |
7870 |
++} |
7871 |
++ |
7872 |
++static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) |
7873 |
++{ |
7874 |
++ struct bfq_io_cq *bic = bfqq->bic; |
7875 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
7876 |
++ |
7877 |
++ if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { |
7878 |
++ bic->failed_cooperations++; |
7879 |
++ if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) |
7880 |
++ bic->cooperations = 0; |
7881 |
++ } |
7882 |
++} |
7883 |
++ |
7884 |
+ static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
7885 |
+ struct bio *bio) |
7886 |
+ { |
7887 |
+ struct bfq_data *bfqd = q->elevator->elevator_data; |
7888 |
+ struct bfq_io_cq *bic; |
7889 |
++ struct bfq_queue *bfqq, *new_bfqq; |
7890 |
+ |
7891 |
+ /* |
7892 |
+ * Disallow merge of a sync bio into an async request. |
7893 |
+@@ -1150,7 +1622,26 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
7894 |
+ if (!bic) |
7895 |
+ return 0; |
7896 |
+ |
7897 |
+- return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); |
7898 |
++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
7899 |
++ /* |
7900 |
++ * We take advantage of this function to perform an early merge |
7901 |
++ * of the queues of possible cooperating processes. |
7902 |
++ */ |
7903 |
++ if (bfqq) { |
7904 |
++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); |
7905 |
++ if (new_bfqq) { |
7906 |
++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); |
7907 |
++ /* |
7908 |
++ * If we get here, the bio will be queued in the |
7909 |
++ * shared queue, i.e., new_bfqq, so use new_bfqq |
7910 |
++ * to decide whether bio and rq can be merged. |
7911 |
++ */ |
7912 |
++ bfqq = new_bfqq; |
7913 |
++ } else |
7914 |
++ bfq_bfqq_increase_failed_cooperations(bfqq); |
7915 |
++ } |
7916 |
++ |
7917 |
++ return bfqq == RQ_BFQQ(rq); |
7918 |
+ } |
7919 |
+ |
7920 |
+ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
7921 |
+@@ -1349,6 +1840,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
7922 |
+ |
7923 |
+ __bfq_bfqd_reset_in_service(bfqd); |
7924 |
+ |
7925 |
++ /* |
7926 |
++ * If this bfqq is shared between multiple processes, check |
7927 |
++ * to make sure that those processes are still issuing I/Os |
7928 |
++ * within the mean seek distance. If not, it may be time to |
7929 |
++ * break the queues apart again. |
7930 |
++ */ |
7931 |
++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) |
7932 |
++ bfq_mark_bfqq_split_coop(bfqq); |
7933 |
++ |
7934 |
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
7935 |
+ /* |
7936 |
+ * Overloading budget_timeout field to store the time |
7937 |
+@@ -1357,8 +1857,13 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
7938 |
+ */ |
7939 |
+ bfqq->budget_timeout = jiffies; |
7940 |
+ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
7941 |
+- } else |
7942 |
++ } else { |
7943 |
+ bfq_activate_bfqq(bfqd, bfqq); |
7944 |
++ /* |
7945 |
++ * Resort priority tree of potential close cooperators. |
7946 |
++ */ |
7947 |
++ bfq_pos_tree_add_move(bfqd, bfqq); |
7948 |
++ } |
7949 |
+ } |
7950 |
+ |
7951 |
+ /** |
7952 |
+@@ -2242,10 +2747,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
7953 |
+ /* |
7954 |
+ * If the queue was activated in a burst, or |
7955 |
+ * too much time has elapsed from the beginning |
7956 |
+- * of this weight-raising period, then end weight |
7957 |
+- * raising. |
7958 |
++ * of this weight-raising period, or the queue has |
7959 |
++ * exceeded the acceptable number of cooperations, |
7960 |
++ * then end weight raising. |
7961 |
+ */ |
7962 |
+ if (bfq_bfqq_in_large_burst(bfqq) || |
7963 |
++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || |
7964 |
+ time_is_before_jiffies(bfqq->last_wr_start_finish + |
7965 |
+ bfqq->wr_cur_max_time)) { |
7966 |
+ bfqq->last_wr_start_finish = jiffies; |
7967 |
+@@ -2474,6 +2981,25 @@ static void bfq_put_queue(struct bfq_queue *bfqq) |
7968 |
+ #endif |
7969 |
+ } |
7970 |
+ |
7971 |
++static void bfq_put_cooperator(struct bfq_queue *bfqq) |
7972 |
++{ |
7973 |
++ struct bfq_queue *__bfqq, *next; |
7974 |
++ |
7975 |
++ /* |
7976 |
++ * If this queue was scheduled to merge with another queue, be |
7977 |
++ * sure to drop the reference taken on that queue (and others in |
7978 |
++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. |
7979 |
++ */ |
7980 |
++ __bfqq = bfqq->new_bfqq; |
7981 |
++ while (__bfqq) { |
7982 |
++ if (__bfqq == bfqq) |
7983 |
++ break; |
7984 |
++ next = __bfqq->new_bfqq; |
7985 |
++ bfq_put_queue(__bfqq); |
7986 |
++ __bfqq = next; |
7987 |
++ } |
7988 |
++} |
7989 |
++ |
7990 |
+ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
7991 |
+ { |
7992 |
+ if (bfqq == bfqd->in_service_queue) { |
7993 |
+@@ -2484,6 +3010,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
7994 |
+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, |
7995 |
+ atomic_read(&bfqq->ref)); |
7996 |
+ |
7997 |
++ bfq_put_cooperator(bfqq); |
7998 |
++ |
7999 |
+ bfq_put_queue(bfqq); |
8000 |
+ } |
8001 |
+ |
8002 |
+@@ -2492,6 +3020,25 @@ static void bfq_init_icq(struct io_cq *icq) |
8003 |
+ struct bfq_io_cq *bic = icq_to_bic(icq); |
8004 |
+ |
8005 |
+ bic->ttime.last_end_request = jiffies; |
8006 |
++ /* |
8007 |
++ * A newly created bic indicates that the process has just |
8008 |
++ * started doing I/O, and is probably mapping into memory its |
8009 |
++ * executable and libraries: it definitely needs weight raising. |
8010 |
++ * There is however the possibility that the process performs, |
8011 |
++ * for a while, I/O close to some other process. EQM intercepts |
8012 |
++ * this behavior and may merge the queue corresponding to the |
8013 |
++ * process with some other queue, BEFORE the weight of the queue |
8014 |
++ * is raised. Merged queues are not weight-raised (they are assumed |
8015 |
++ * to belong to processes that benefit only from high throughput). |
8016 |
++ * If the merge is basically the consequence of an accident, then |
8017 |
++ * the queue will be split soon and will get back its old weight. |
8018 |
++ * It is then important to write down somewhere that this queue |
8019 |
++ * does need weight raising, even if it did not make it to get its |
8020 |
++ * weight raised before being merged. To this purpose, we overload |
8021 |
++ * the field raising_time_left and assign 1 to it, to mark the queue |
8022 |
++ * as needing weight raising. |
8023 |
++ */ |
8024 |
++ bic->wr_time_left = 1; |
8025 |
+ } |
8026 |
+ |
8027 |
+ static void bfq_exit_icq(struct io_cq *icq) |
8028 |
+@@ -2505,6 +3052,13 @@ static void bfq_exit_icq(struct io_cq *icq) |
8029 |
+ } |
8030 |
+ |
8031 |
+ if (bic->bfqq[BLK_RW_SYNC]) { |
8032 |
++ /* |
8033 |
++ * If the bic is using a shared queue, put the reference |
8034 |
++ * taken on the io_context when the bic started using a |
8035 |
++ * shared bfq_queue. |
8036 |
++ */ |
8037 |
++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) |
8038 |
++ put_io_context(icq->ioc); |
8039 |
+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); |
8040 |
+ bic->bfqq[BLK_RW_SYNC] = NULL; |
8041 |
+ } |
8042 |
+@@ -2809,6 +3363,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, |
8043 |
+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) |
8044 |
+ return; |
8045 |
+ |
8046 |
++ /* Idle window just restored, statistics are meaningless. */ |
8047 |
++ if (bfq_bfqq_just_split(bfqq)) |
8048 |
++ return; |
8049 |
++ |
8050 |
+ enable_idle = bfq_bfqq_idle_window(bfqq); |
8051 |
+ |
8052 |
+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || |
8053 |
+@@ -2856,6 +3414,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
8054 |
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
8055 |
+ !BFQQ_SEEKY(bfqq)) |
8056 |
+ bfq_update_idle_window(bfqd, bfqq, bic); |
8057 |
++ bfq_clear_bfqq_just_split(bfqq); |
8058 |
+ |
8059 |
+ bfq_log_bfqq(bfqd, bfqq, |
8060 |
+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", |
8061 |
+@@ -2920,12 +3479,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
8062 |
+ static void bfq_insert_request(struct request_queue *q, struct request *rq) |
8063 |
+ { |
8064 |
+ struct bfq_data *bfqd = q->elevator->elevator_data; |
8065 |
+- struct bfq_queue *bfqq = RQ_BFQQ(rq); |
8066 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; |
8067 |
+ |
8068 |
+ assert_spin_locked(bfqd->queue->queue_lock); |
8069 |
+ |
8070 |
++ /* |
8071 |
++ * An unplug may trigger a requeue of a request from the device |
8072 |
++ * driver: make sure we are in process context while trying to |
8073 |
++ * merge two bfq_queues. |
8074 |
++ */ |
8075 |
++ if (!in_interrupt()) { |
8076 |
++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); |
8077 |
++ if (new_bfqq) { |
8078 |
++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) |
8079 |
++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); |
8080 |
++ /* |
8081 |
++ * Release the request's reference to the old bfqq |
8082 |
++ * and make sure one is taken to the shared queue. |
8083 |
++ */ |
8084 |
++ new_bfqq->allocated[rq_data_dir(rq)]++; |
8085 |
++ bfqq->allocated[rq_data_dir(rq)]--; |
8086 |
++ atomic_inc(&new_bfqq->ref); |
8087 |
++ bfq_put_queue(bfqq); |
8088 |
++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) |
8089 |
++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), |
8090 |
++ bfqq, new_bfqq); |
8091 |
++ rq->elv.priv[1] = new_bfqq; |
8092 |
++ bfqq = new_bfqq; |
8093 |
++ } else |
8094 |
++ bfq_bfqq_increase_failed_cooperations(bfqq); |
8095 |
++ } |
8096 |
++ |
8097 |
+ bfq_add_request(rq); |
8098 |
+ |
8099 |
++ /* |
8100 |
++ * Here a newly-created bfq_queue has already started a weight-raising |
8101 |
++ * period: clear raising_time_left to prevent bfq_bfqq_save_state() |
8102 |
++ * from assigning it a full weight-raising period. See the detailed |
8103 |
++ * comments about this field in bfq_init_icq(). |
8104 |
++ */ |
8105 |
++ if (bfqq->bic) |
8106 |
++ bfqq->bic->wr_time_left = 0; |
8107 |
+ rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; |
8108 |
+ list_add_tail(&rq->queuelist, &bfqq->fifo); |
8109 |
+ |
8110 |
+@@ -3094,6 +3688,32 @@ static void bfq_put_request(struct request *rq) |
8111 |
+ } |
8112 |
+ |
8113 |
+ /* |
8114 |
++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this |
8115 |
++ * was the last process referring to said bfqq. |
8116 |
++ */ |
8117 |
++static struct bfq_queue * |
8118 |
++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) |
8119 |
++{ |
8120 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); |
8121 |
++ |
8122 |
++ put_io_context(bic->icq.ioc); |
8123 |
++ |
8124 |
++ if (bfqq_process_refs(bfqq) == 1) { |
8125 |
++ bfqq->pid = current->pid; |
8126 |
++ bfq_clear_bfqq_coop(bfqq); |
8127 |
++ bfq_clear_bfqq_split_coop(bfqq); |
8128 |
++ return bfqq; |
8129 |
++ } |
8130 |
++ |
8131 |
++ bic_set_bfqq(bic, NULL, 1); |
8132 |
++ |
8133 |
++ bfq_put_cooperator(bfqq); |
8134 |
++ |
8135 |
++ bfq_put_queue(bfqq); |
8136 |
++ return NULL; |
8137 |
++} |
8138 |
++ |
8139 |
++/* |
8140 |
+ * Allocate bfq data structures associated with this request. |
8141 |
+ */ |
8142 |
+ static int bfq_set_request(struct request_queue *q, struct request *rq, |
8143 |
+@@ -3105,6 +3725,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, |
8144 |
+ const int is_sync = rq_is_sync(rq); |
8145 |
+ struct bfq_queue *bfqq; |
8146 |
+ unsigned long flags; |
8147 |
++ bool split = false; |
8148 |
+ |
8149 |
+ might_sleep_if(gfpflags_allow_blocking(gfp_mask)); |
8150 |
+ |
8151 |
+@@ -3117,15 +3738,30 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, |
8152 |
+ |
8153 |
+ bfq_bic_update_cgroup(bic, bio); |
8154 |
+ |
8155 |
++new_queue: |
8156 |
+ bfqq = bic_to_bfqq(bic, is_sync); |
8157 |
+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { |
8158 |
+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); |
8159 |
+ bic_set_bfqq(bic, bfqq, is_sync); |
8160 |
+- if (is_sync) { |
8161 |
+- if (bfqd->large_burst) |
8162 |
++ if (split && is_sync) { |
8163 |
++ if ((bic->was_in_burst_list && bfqd->large_burst) || |
8164 |
++ bic->saved_in_large_burst) |
8165 |
+ bfq_mark_bfqq_in_large_burst(bfqq); |
8166 |
+- else |
8167 |
+- bfq_clear_bfqq_in_large_burst(bfqq); |
8168 |
++ else { |
8169 |
++ bfq_clear_bfqq_in_large_burst(bfqq); |
8170 |
++ if (bic->was_in_burst_list) |
8171 |
++ hlist_add_head(&bfqq->burst_list_node, |
8172 |
++ &bfqd->burst_list); |
8173 |
++ } |
8174 |
++ } |
8175 |
++ } else { |
8176 |
++ /* If the queue was seeky for too long, break it apart. */ |
8177 |
++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { |
8178 |
++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); |
8179 |
++ bfqq = bfq_split_bfqq(bic, bfqq); |
8180 |
++ split = true; |
8181 |
++ if (!bfqq) |
8182 |
++ goto new_queue; |
8183 |
+ } |
8184 |
+ } |
8185 |
+ |
8186 |
+@@ -3137,6 +3773,26 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, |
8187 |
+ rq->elv.priv[0] = bic; |
8188 |
+ rq->elv.priv[1] = bfqq; |
8189 |
+ |
8190 |
++ /* |
8191 |
++ * If a bfq_queue has only one process reference, it is owned |
8192 |
++ * by only one bfq_io_cq: we can set the bic field of the |
8193 |
++ * bfq_queue to the address of that structure. Also, if the |
8194 |
++ * queue has just been split, mark a flag so that the |
8195 |
++ * information is available to the other scheduler hooks. |
8196 |
++ */ |
8197 |
++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { |
8198 |
++ bfqq->bic = bic; |
8199 |
++ if (split) { |
8200 |
++ bfq_mark_bfqq_just_split(bfqq); |
8201 |
++ /* |
8202 |
++ * If the queue has just been split from a shared |
8203 |
++ * queue, restore the idle window and the possible |
8204 |
++ * weight raising period. |
8205 |
++ */ |
8206 |
++ bfq_bfqq_resume_state(bfqq, bic); |
8207 |
++ } |
8208 |
++ } |
8209 |
++ |
8210 |
+ spin_unlock_irqrestore(q->queue_lock, flags); |
8211 |
+ |
8212 |
+ return 0; |
8213 |
+@@ -3290,6 +3946,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, |
8214 |
+ root_group->my_entity = NULL; |
8215 |
+ root_group->bfqd = bfqd; |
8216 |
+ #endif |
8217 |
++ root_group->rq_pos_tree = RB_ROOT; |
8218 |
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
8219 |
+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
8220 |
+ } |
8221 |
+@@ -3370,6 +4027,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
8222 |
+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; |
8223 |
+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; |
8224 |
+ |
8225 |
++ bfqd->bfq_coop_thresh = 2; |
8226 |
++ bfqd->bfq_failed_cooperations = 7000; |
8227 |
+ bfqd->bfq_requests_within_timer = 120; |
8228 |
+ |
8229 |
+ bfqd->bfq_large_burst_thresh = 11; |
8230 |
+diff --git a/block/bfq.h b/block/bfq.h |
8231 |
+index 485d0c9..f73c942 100644 |
8232 |
+--- a/block/bfq.h |
8233 |
++++ b/block/bfq.h |
8234 |
+@@ -183,6 +183,8 @@ struct bfq_group; |
8235 |
+ * ioprio_class value. |
8236 |
+ * @new_bfqq: shared bfq_queue if queue is cooperating with |
8237 |
+ * one or more other queues. |
8238 |
++ * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). |
8239 |
++ * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). |
8240 |
+ * @sort_list: sorted list of pending requests. |
8241 |
+ * @next_rq: if fifo isn't expired, next request to serve. |
8242 |
+ * @queued: nr of requests queued in @sort_list. |
8243 |
+@@ -304,6 +306,26 @@ struct bfq_ttime { |
8244 |
+ * @ttime: associated @bfq_ttime struct |
8245 |
+ * @ioprio: per (request_queue, blkcg) ioprio. |
8246 |
+ * @blkcg_id: id of the blkcg the related io_cq belongs to. |
8247 |
++ * @wr_time_left: snapshot of the time left before weight raising ends |
8248 |
++ * for the sync queue associated to this process; this |
8249 |
++ * snapshot is taken to remember this value while the weight |
8250 |
++ * raising is suspended because the queue is merged with a |
8251 |
++ * shared queue, and is used to set @raising_cur_max_time |
8252 |
++ * when the queue is split from the shared queue and its |
8253 |
++ * weight is raised again |
8254 |
++ * @saved_idle_window: same purpose as the previous field for the idle |
8255 |
++ * window |
8256 |
++ * @saved_IO_bound: same purpose as the previous two fields for the I/O |
8257 |
++ * bound classification of a queue |
8258 |
++ * @saved_in_large_burst: same purpose as the previous fields for the |
8259 |
++ * value of the field keeping the queue's belonging |
8260 |
++ * to a large burst |
8261 |
++ * @was_in_burst_list: true if the queue belonged to a burst list |
8262 |
++ * before its merge with another cooperating queue |
8263 |
++ * @cooperations: counter of consecutive successful queue merges underwent |
8264 |
++ * by any of the process' @bfq_queues |
8265 |
++ * @failed_cooperations: counter of consecutive failed queue merges of any |
8266 |
++ * of the process' @bfq_queues |
8267 |
+ */ |
8268 |
+ struct bfq_io_cq { |
8269 |
+ struct io_cq icq; /* must be the first member */ |
8270 |
+@@ -314,6 +336,16 @@ struct bfq_io_cq { |
8271 |
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED |
8272 |
+ uint64_t blkcg_id; /* the current blkcg ID */ |
8273 |
+ #endif |
8274 |
++ |
8275 |
++ unsigned int wr_time_left; |
8276 |
++ bool saved_idle_window; |
8277 |
++ bool saved_IO_bound; |
8278 |
++ |
8279 |
++ bool saved_in_large_burst; |
8280 |
++ bool was_in_burst_list; |
8281 |
++ |
8282 |
++ unsigned int cooperations; |
8283 |
++ unsigned int failed_cooperations; |
8284 |
+ }; |
8285 |
+ |
8286 |
+ enum bfq_device_speed { |
8287 |
+@@ -557,6 +589,9 @@ enum bfqq_state_flags { |
8288 |
+ * may need softrt-next-start |
8289 |
+ * update |
8290 |
+ */ |
8291 |
++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ |
8292 |
++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ |
8293 |
++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ |
8294 |
+ }; |
8295 |
+ |
8296 |
+ #define BFQ_BFQQ_FNS(name) \ |
8297 |
+@@ -583,6 +618,9 @@ BFQ_BFQQ_FNS(budget_new); |
8298 |
+ BFQ_BFQQ_FNS(IO_bound); |
8299 |
+ BFQ_BFQQ_FNS(in_large_burst); |
8300 |
+ BFQ_BFQQ_FNS(constantly_seeky); |
8301 |
++BFQ_BFQQ_FNS(coop); |
8302 |
++BFQ_BFQQ_FNS(split_coop); |
8303 |
++BFQ_BFQQ_FNS(just_split); |
8304 |
+ BFQ_BFQQ_FNS(softrt_update); |
8305 |
+ #undef BFQ_BFQQ_FNS |
8306 |
+ |
8307 |
+@@ -675,6 +713,9 @@ struct bfq_group_data { |
8308 |
+ * are groups with more than one active @bfq_entity |
8309 |
+ * (see the comments to the function |
8310 |
+ * bfq_bfqq_must_not_expire()). |
8311 |
++ * @rq_pos_tree: rbtree sorted by next_request position, used when |
8312 |
++ * determining if two or more queues have interleaving |
8313 |
++ * requests (see bfq_find_close_cooperator()). |
8314 |
+ * |
8315 |
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup |
8316 |
+ * there is a set of bfq_groups, each one collecting the lower-level |
8317 |
+@@ -701,6 +742,8 @@ struct bfq_group { |
8318 |
+ |
8319 |
+ int active_entities; |
8320 |
+ |
8321 |
++ struct rb_root rq_pos_tree; |
8322 |
++ |
8323 |
+ struct bfqg_stats stats; |
8324 |
+ struct bfqg_stats dead_stats; /* stats pushed from dead children */ |
8325 |
+ }; |
8326 |
+@@ -711,6 +754,8 @@ struct bfq_group { |
8327 |
+ |
8328 |
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
8329 |
+ struct bfq_queue *async_idle_bfqq; |
8330 |
++ |
8331 |
++ struct rb_root rq_pos_tree; |
8332 |
+ }; |
8333 |
+ #endif |
8334 |
+ |
8335 |
+@@ -787,6 +832,27 @@ static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) |
8336 |
+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
8337 |
+ } |
8338 |
+ |
8339 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
8340 |
++ |
8341 |
++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) |
8342 |
++{ |
8343 |
++ struct bfq_entity *group_entity = bfqq->entity.parent; |
8344 |
++ |
8345 |
++ if (!group_entity) |
8346 |
++ group_entity = &bfqq->bfqd->root_group->entity; |
8347 |
++ |
8348 |
++ return container_of(group_entity, struct bfq_group, entity); |
8349 |
++} |
8350 |
++ |
8351 |
++#else |
8352 |
++ |
8353 |
++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) |
8354 |
++{ |
8355 |
++ return bfqq->bfqd->root_group; |
8356 |
++} |
8357 |
++ |
8358 |
++#endif |
8359 |
++ |
8360 |
+ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); |
8361 |
+ static void bfq_put_queue(struct bfq_queue *bfqq); |
8362 |
+ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); |
8363 |
+-- |
8364 |
+1.9.1 |
8365 |
+ |
8366 |
|
8367 |
diff --git a/5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1 b/5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1 |
8368 |
new file mode 100644 |
8369 |
index 0000000..372f093 |
8370 |
--- /dev/null |
8371 |
+++ b/5004_blkck-bfq-turn-BFQ-v7r11-for-4.7.0-into-BFQ-v8-for-4.patch1 |
8372 |
@@ -0,0 +1,6361 @@ |
8373 |
+From 21d90fdc7488cd7c28f47b5ba759e62c697c0382 Mon Sep 17 00:00:00 2001 |
8374 |
+From: Paolo Valente <paolo.valente@××××××.org> |
8375 |
+Date: Tue, 17 May 2016 08:28:04 +0200 |
8376 |
+Subject: [PATCH 4/4] block, bfq: turn BFQ-v7r11 for 4.7.0 into BFQ-v8 for |
8377 |
+ 4.7.0 |
8378 |
+ |
8379 |
+--- |
8380 |
+ block/Kconfig.iosched | 2 +- |
8381 |
+ block/bfq-cgroup.c | 448 +++++---- |
8382 |
+ block/bfq-iosched.c | 2581 +++++++++++++++++++++++++++++-------------------- |
8383 |
+ block/bfq-sched.c | 432 +++++++-- |
8384 |
+ block/bfq.h | 697 +++++++------ |
8385 |
+ 5 files changed, 2433 insertions(+), 1727 deletions(-) |
8386 |
+ |
8387 |
+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched |
8388 |
+index f78cd1a..6d92579 100644 |
8389 |
+--- a/block/Kconfig.iosched |
8390 |
++++ b/block/Kconfig.iosched |
8391 |
+@@ -53,7 +53,7 @@ config IOSCHED_BFQ |
8392 |
+ |
8393 |
+ config BFQ_GROUP_IOSCHED |
8394 |
+ bool "BFQ hierarchical scheduling support" |
8395 |
+- depends on CGROUPS && IOSCHED_BFQ=y |
8396 |
++ depends on IOSCHED_BFQ && BLK_CGROUP |
8397 |
+ default n |
8398 |
+ ---help--- |
8399 |
+ Enable hierarchical scheduling in BFQ, using the blkio controller. |
8400 |
+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c |
8401 |
+index 5ee99ec..bc01663 100644 |
8402 |
+--- a/block/bfq-cgroup.c |
8403 |
++++ b/block/bfq-cgroup.c |
8404 |
+@@ -162,7 +162,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) |
8405 |
+ static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) |
8406 |
+ { |
8407 |
+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); |
8408 |
+- BUG_ON(!pd); |
8409 |
+ return pd_to_bfqg(pd); |
8410 |
+ } |
8411 |
+ |
8412 |
+@@ -224,14 +223,6 @@ static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) |
8413 |
+ blkg_rwstat_add(&bfqg->stats.merged, rw, 1); |
8414 |
+ } |
8415 |
+ |
8416 |
+-static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, |
8417 |
+- uint64_t bytes, int rw) |
8418 |
+-{ |
8419 |
+- blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); |
8420 |
+- blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); |
8421 |
+- blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); |
8422 |
+-} |
8423 |
+- |
8424 |
+ static void bfqg_stats_update_completion(struct bfq_group *bfqg, |
8425 |
+ uint64_t start_time, uint64_t io_start_time, int rw) |
8426 |
+ { |
8427 |
+@@ -248,17 +239,11 @@ static void bfqg_stats_update_completion(struct bfq_group *bfqg, |
8428 |
+ /* @stats = 0 */ |
8429 |
+ static void bfqg_stats_reset(struct bfqg_stats *stats) |
8430 |
+ { |
8431 |
+- if (!stats) |
8432 |
+- return; |
8433 |
+- |
8434 |
+ /* queued stats shouldn't be cleared */ |
8435 |
+- blkg_rwstat_reset(&stats->service_bytes); |
8436 |
+- blkg_rwstat_reset(&stats->serviced); |
8437 |
+ blkg_rwstat_reset(&stats->merged); |
8438 |
+ blkg_rwstat_reset(&stats->service_time); |
8439 |
+ blkg_rwstat_reset(&stats->wait_time); |
8440 |
+ blkg_stat_reset(&stats->time); |
8441 |
+- blkg_stat_reset(&stats->unaccounted_time); |
8442 |
+ blkg_stat_reset(&stats->avg_queue_size_sum); |
8443 |
+ blkg_stat_reset(&stats->avg_queue_size_samples); |
8444 |
+ blkg_stat_reset(&stats->dequeue); |
8445 |
+@@ -268,21 +253,19 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) |
8446 |
+ } |
8447 |
+ |
8448 |
+ /* @to += @from */ |
8449 |
+-static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) |
8450 |
++static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) |
8451 |
+ { |
8452 |
+ if (!to || !from) |
8453 |
+ return; |
8454 |
+ |
8455 |
+ /* queued stats shouldn't be cleared */ |
8456 |
+- blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); |
8457 |
+- blkg_rwstat_add_aux(&to->serviced, &from->serviced); |
8458 |
+ blkg_rwstat_add_aux(&to->merged, &from->merged); |
8459 |
+ blkg_rwstat_add_aux(&to->service_time, &from->service_time); |
8460 |
+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); |
8461 |
+ blkg_stat_add_aux(&from->time, &from->time); |
8462 |
+- blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); |
8463 |
+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); |
8464 |
+- blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); |
8465 |
++ blkg_stat_add_aux(&to->avg_queue_size_samples, |
8466 |
++ &from->avg_queue_size_samples); |
8467 |
+ blkg_stat_add_aux(&to->dequeue, &from->dequeue); |
8468 |
+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); |
8469 |
+ blkg_stat_add_aux(&to->idle_time, &from->idle_time); |
8470 |
+@@ -308,10 +291,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) |
8471 |
+ if (unlikely(!parent)) |
8472 |
+ return; |
8473 |
+ |
8474 |
+- bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); |
8475 |
+- bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); |
8476 |
++ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); |
8477 |
+ bfqg_stats_reset(&bfqg->stats); |
8478 |
+- bfqg_stats_reset(&bfqg->dead_stats); |
8479 |
+ } |
8480 |
+ |
8481 |
+ static void bfq_init_entity(struct bfq_entity *entity, |
8482 |
+@@ -332,15 +313,11 @@ static void bfq_init_entity(struct bfq_entity *entity, |
8483 |
+ |
8484 |
+ static void bfqg_stats_exit(struct bfqg_stats *stats) |
8485 |
+ { |
8486 |
+- blkg_rwstat_exit(&stats->service_bytes); |
8487 |
+- blkg_rwstat_exit(&stats->serviced); |
8488 |
+ blkg_rwstat_exit(&stats->merged); |
8489 |
+ blkg_rwstat_exit(&stats->service_time); |
8490 |
+ blkg_rwstat_exit(&stats->wait_time); |
8491 |
+ blkg_rwstat_exit(&stats->queued); |
8492 |
+- blkg_stat_exit(&stats->sectors); |
8493 |
+ blkg_stat_exit(&stats->time); |
8494 |
+- blkg_stat_exit(&stats->unaccounted_time); |
8495 |
+ blkg_stat_exit(&stats->avg_queue_size_sum); |
8496 |
+ blkg_stat_exit(&stats->avg_queue_size_samples); |
8497 |
+ blkg_stat_exit(&stats->dequeue); |
8498 |
+@@ -351,15 +328,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) |
8499 |
+ |
8500 |
+ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) |
8501 |
+ { |
8502 |
+- if (blkg_rwstat_init(&stats->service_bytes, gfp) || |
8503 |
+- blkg_rwstat_init(&stats->serviced, gfp) || |
8504 |
+- blkg_rwstat_init(&stats->merged, gfp) || |
8505 |
++ if (blkg_rwstat_init(&stats->merged, gfp) || |
8506 |
+ blkg_rwstat_init(&stats->service_time, gfp) || |
8507 |
+ blkg_rwstat_init(&stats->wait_time, gfp) || |
8508 |
+ blkg_rwstat_init(&stats->queued, gfp) || |
8509 |
+- blkg_stat_init(&stats->sectors, gfp) || |
8510 |
+ blkg_stat_init(&stats->time, gfp) || |
8511 |
+- blkg_stat_init(&stats->unaccounted_time, gfp) || |
8512 |
+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || |
8513 |
+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || |
8514 |
+ blkg_stat_init(&stats->dequeue, gfp) || |
8515 |
+@@ -374,20 +347,36 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) |
8516 |
+ } |
8517 |
+ |
8518 |
+ static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) |
8519 |
+- { |
8520 |
++{ |
8521 |
+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; |
8522 |
+- } |
8523 |
++} |
8524 |
+ |
8525 |
+ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) |
8526 |
+ { |
8527 |
+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); |
8528 |
+ } |
8529 |
+ |
8530 |
++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) |
8531 |
++{ |
8532 |
++ struct bfq_group_data *bgd; |
8533 |
++ |
8534 |
++ bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); |
8535 |
++ if (!bgd) |
8536 |
++ return NULL; |
8537 |
++ return &bgd->pd; |
8538 |
++} |
8539 |
++ |
8540 |
+ static void bfq_cpd_init(struct blkcg_policy_data *cpd) |
8541 |
+ { |
8542 |
+ struct bfq_group_data *d = cpd_to_bfqgd(cpd); |
8543 |
+ |
8544 |
+- d->weight = BFQ_DEFAULT_GRP_WEIGHT; |
8545 |
++ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? |
8546 |
++ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; |
8547 |
++} |
8548 |
++ |
8549 |
++static void bfq_cpd_free(struct blkcg_policy_data *cpd) |
8550 |
++{ |
8551 |
++ kfree(cpd_to_bfqgd(cpd)); |
8552 |
+ } |
8553 |
+ |
8554 |
+ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) |
8555 |
+@@ -398,8 +387,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) |
8556 |
+ if (!bfqg) |
8557 |
+ return NULL; |
8558 |
+ |
8559 |
+- if (bfqg_stats_init(&bfqg->stats, gfp) || |
8560 |
+- bfqg_stats_init(&bfqg->dead_stats, gfp)) { |
8561 |
++ if (bfqg_stats_init(&bfqg->stats, gfp)) { |
8562 |
+ kfree(bfqg); |
8563 |
+ return NULL; |
8564 |
+ } |
8565 |
+@@ -407,27 +395,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) |
8566 |
+ return &bfqg->pd; |
8567 |
+ } |
8568 |
+ |
8569 |
+-static void bfq_group_set_parent(struct bfq_group *bfqg, |
8570 |
+- struct bfq_group *parent) |
8571 |
++static void bfq_pd_init(struct blkg_policy_data *pd) |
8572 |
+ { |
8573 |
++ struct blkcg_gq *blkg; |
8574 |
++ struct bfq_group *bfqg; |
8575 |
++ struct bfq_data *bfqd; |
8576 |
+ struct bfq_entity *entity; |
8577 |
++ struct bfq_group_data *d; |
8578 |
+ |
8579 |
+- BUG_ON(!parent); |
8580 |
+- BUG_ON(!bfqg); |
8581 |
+- BUG_ON(bfqg == parent); |
8582 |
+- |
8583 |
++ blkg = pd_to_blkg(pd); |
8584 |
++ BUG_ON(!blkg); |
8585 |
++ bfqg = blkg_to_bfqg(blkg); |
8586 |
++ bfqd = blkg->q->elevator->elevator_data; |
8587 |
+ entity = &bfqg->entity; |
8588 |
+- entity->parent = parent->my_entity; |
8589 |
+- entity->sched_data = &parent->sched_data; |
8590 |
+-} |
8591 |
+- |
8592 |
+-static void bfq_pd_init(struct blkg_policy_data *pd) |
8593 |
+-{ |
8594 |
+- struct blkcg_gq *blkg = pd_to_blkg(pd); |
8595 |
+- struct bfq_group *bfqg = blkg_to_bfqg(blkg); |
8596 |
+- struct bfq_data *bfqd = blkg->q->elevator->elevator_data; |
8597 |
+- struct bfq_entity *entity = &bfqg->entity; |
8598 |
+- struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); |
8599 |
++ d = blkcg_to_bfqgd(blkg->blkcg); |
8600 |
+ |
8601 |
+ entity->orig_weight = entity->weight = entity->new_weight = d->weight; |
8602 |
+ entity->my_sched_data = &bfqg->sched_data; |
8603 |
+@@ -445,45 +426,28 @@ static void bfq_pd_free(struct blkg_policy_data *pd) |
8604 |
+ struct bfq_group *bfqg = pd_to_bfqg(pd); |
8605 |
+ |
8606 |
+ bfqg_stats_exit(&bfqg->stats); |
8607 |
+- bfqg_stats_exit(&bfqg->dead_stats); |
8608 |
+- |
8609 |
+ return kfree(bfqg); |
8610 |
+ } |
8611 |
+ |
8612 |
+-/* offset delta from bfqg->stats to bfqg->dead_stats */ |
8613 |
+-static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - |
8614 |
+- offsetof(struct bfq_group, stats); |
8615 |
+- |
8616 |
+-/* to be used by recursive prfill, sums live and dead stats recursively */ |
8617 |
+-static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) |
8618 |
++static void bfq_pd_reset_stats(struct blkg_policy_data *pd) |
8619 |
+ { |
8620 |
+- u64 sum = 0; |
8621 |
++ struct bfq_group *bfqg = pd_to_bfqg(pd); |
8622 |
+ |
8623 |
+- sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); |
8624 |
+- sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, |
8625 |
+- off + dead_stats_off_delta); |
8626 |
+- return sum; |
8627 |
++ bfqg_stats_reset(&bfqg->stats); |
8628 |
+ } |
8629 |
+ |
8630 |
+-/* to be used by recursive prfill, sums live and dead rwstats recursively */ |
8631 |
+-static struct blkg_rwstat bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, |
8632 |
+- int off) |
8633 |
++static void bfq_group_set_parent(struct bfq_group *bfqg, |
8634 |
++ struct bfq_group *parent) |
8635 |
+ { |
8636 |
+- struct blkg_rwstat a, b; |
8637 |
+- |
8638 |
+- a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); |
8639 |
+- b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, |
8640 |
+- off + dead_stats_off_delta); |
8641 |
+- blkg_rwstat_add_aux(&a, &b); |
8642 |
+- return a; |
8643 |
+-} |
8644 |
++ struct bfq_entity *entity; |
8645 |
+ |
8646 |
+-static void bfq_pd_reset_stats(struct blkg_policy_data *pd) |
8647 |
+-{ |
8648 |
+- struct bfq_group *bfqg = pd_to_bfqg(pd); |
8649 |
++ BUG_ON(!parent); |
8650 |
++ BUG_ON(!bfqg); |
8651 |
++ BUG_ON(bfqg == parent); |
8652 |
+ |
8653 |
+- bfqg_stats_reset(&bfqg->stats); |
8654 |
+- bfqg_stats_reset(&bfqg->dead_stats); |
8655 |
++ entity = &bfqg->entity; |
8656 |
++ entity->parent = parent->my_entity; |
8657 |
++ entity->sched_data = &parent->sched_data; |
8658 |
+ } |
8659 |
+ |
8660 |
+ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
8661 |
+@@ -531,13 +495,18 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
8662 |
+ return bfqg; |
8663 |
+ } |
8664 |
+ |
8665 |
+-static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
8666 |
++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, |
8667 |
++ struct bfq_queue *bfqq); |
8668 |
++ |
8669 |
++static void bfq_bfqq_expire(struct bfq_data *bfqd, |
8670 |
++ struct bfq_queue *bfqq, |
8671 |
++ bool compensate, |
8672 |
++ enum bfqq_expiration reason); |
8673 |
+ |
8674 |
+ /** |
8675 |
+ * bfq_bfqq_move - migrate @bfqq to @bfqg. |
8676 |
+ * @bfqd: queue descriptor. |
8677 |
+ * @bfqq: the queue to move. |
8678 |
+- * @entity: @bfqq's entity. |
8679 |
+ * @bfqg: the group to move to. |
8680 |
+ * |
8681 |
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating |
8682 |
+@@ -548,26 +517,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
8683 |
+ * rcu_read_lock()). |
8684 |
+ */ |
8685 |
+ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
8686 |
+- struct bfq_entity *entity, struct bfq_group *bfqg) |
8687 |
++ struct bfq_group *bfqg) |
8688 |
+ { |
8689 |
+- int busy, resume; |
8690 |
++ struct bfq_entity *entity = &bfqq->entity; |
8691 |
+ |
8692 |
+- busy = bfq_bfqq_busy(bfqq); |
8693 |
+- resume = !RB_EMPTY_ROOT(&bfqq->sort_list); |
8694 |
+- |
8695 |
+- BUG_ON(resume && !entity->on_st); |
8696 |
+- BUG_ON(busy && !resume && entity->on_st && |
8697 |
++ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); |
8698 |
++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); |
8699 |
++ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) |
8700 |
++ && entity->on_st && |
8701 |
+ bfqq != bfqd->in_service_queue); |
8702 |
++ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); |
8703 |
++ |
8704 |
++ /* If bfqq is empty, then bfq_bfqq_expire also invokes |
8705 |
++ * bfq_del_bfqq_busy, thereby removing bfqq and its entity |
8706 |
++ * from data structures related to current group. Otherwise we |
8707 |
++ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as |
8708 |
++ * we do below. |
8709 |
++ */ |
8710 |
++ if (bfqq == bfqd->in_service_queue) |
8711 |
++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, |
8712 |
++ false, BFQ_BFQQ_PREEMPTED); |
8713 |
++ |
8714 |
++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) |
8715 |
++ && &bfq_entity_service_tree(entity)->idle != |
8716 |
++ entity->tree); |
8717 |
+ |
8718 |
+- if (busy) { |
8719 |
+- BUG_ON(atomic_read(&bfqq->ref) < 2); |
8720 |
++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); |
8721 |
+ |
8722 |
+- if (!resume) |
8723 |
+- bfq_del_bfqq_busy(bfqd, bfqq, 0); |
8724 |
+- else |
8725 |
+- bfq_deactivate_bfqq(bfqd, bfqq, 0); |
8726 |
+- } else if (entity->on_st) |
8727 |
++ if (bfq_bfqq_busy(bfqq)) |
8728 |
++ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
8729 |
++ else if (entity->on_st) { |
8730 |
++ BUG_ON(&bfq_entity_service_tree(entity)->idle != |
8731 |
++ entity->tree); |
8732 |
+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); |
8733 |
++ } |
8734 |
+ bfqg_put(bfqq_group(bfqq)); |
8735 |
+ |
8736 |
+ /* |
8737 |
+@@ -579,14 +562,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
8738 |
+ entity->sched_data = &bfqg->sched_data; |
8739 |
+ bfqg_get(bfqg); |
8740 |
+ |
8741 |
+- if (busy) { |
8742 |
++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); |
8743 |
++ if (bfq_bfqq_busy(bfqq)) { |
8744 |
+ bfq_pos_tree_add_move(bfqd, bfqq); |
8745 |
+- if (resume) |
8746 |
+- bfq_activate_bfqq(bfqd, bfqq); |
8747 |
++ bfq_activate_bfqq(bfqd, bfqq); |
8748 |
+ } |
8749 |
+ |
8750 |
+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) |
8751 |
+ bfq_schedule_dispatch(bfqd); |
8752 |
++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) |
8753 |
++ && &bfq_entity_service_tree(entity)->idle != |
8754 |
++ entity->tree); |
8755 |
+ } |
8756 |
+ |
8757 |
+ /** |
8758 |
+@@ -621,7 +607,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, |
8759 |
+ bic_set_bfqq(bic, NULL, 0); |
8760 |
+ bfq_log_bfqq(bfqd, async_bfqq, |
8761 |
+ "bic_change_group: %p %d", |
8762 |
+- async_bfqq, atomic_read(&async_bfqq->ref)); |
8763 |
++ async_bfqq, |
8764 |
++ async_bfqq->ref); |
8765 |
+ bfq_put_queue(async_bfqq); |
8766 |
+ } |
8767 |
+ } |
8768 |
+@@ -629,7 +616,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, |
8769 |
+ if (sync_bfqq) { |
8770 |
+ entity = &sync_bfqq->entity; |
8771 |
+ if (entity->sched_data != &bfqg->sched_data) |
8772 |
+- bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); |
8773 |
++ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); |
8774 |
+ } |
8775 |
+ |
8776 |
+ return bfqg; |
8777 |
+@@ -638,25 +625,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, |
8778 |
+ static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) |
8779 |
+ { |
8780 |
+ struct bfq_data *bfqd = bic_to_bfqd(bic); |
8781 |
+- struct blkcg *blkcg; |
8782 |
+ struct bfq_group *bfqg = NULL; |
8783 |
+- uint64_t id; |
8784 |
++ uint64_t serial_nr; |
8785 |
+ |
8786 |
+ rcu_read_lock(); |
8787 |
+- blkcg = bio_blkcg(bio); |
8788 |
+- id = blkcg->css.serial_nr; |
8789 |
+- rcu_read_unlock(); |
8790 |
++ serial_nr = bio_blkcg(bio)->css.serial_nr; |
8791 |
+ |
8792 |
+ /* |
8793 |
+ * Check whether blkcg has changed. The condition may trigger |
8794 |
+ * spuriously on a newly created cic but there's no harm. |
8795 |
+ */ |
8796 |
+- if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) |
8797 |
+- return; |
8798 |
++ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) |
8799 |
++ goto out; |
8800 |
+ |
8801 |
+- bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); |
8802 |
+- BUG_ON(!bfqg); |
8803 |
+- bic->blkcg_id = id; |
8804 |
++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); |
8805 |
++ bic->blkcg_serial_nr = serial_nr; |
8806 |
++out: |
8807 |
++ rcu_read_unlock(); |
8808 |
+ } |
8809 |
+ |
8810 |
+ /** |
8811 |
+@@ -682,8 +667,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, |
8812 |
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
8813 |
+ |
8814 |
+ BUG_ON(!bfqq); |
8815 |
+- bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); |
8816 |
+- return; |
8817 |
++ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); |
8818 |
+ } |
8819 |
+ |
8820 |
+ /** |
8821 |
+@@ -711,16 +695,15 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, |
8822 |
+ if (bfqg->sched_data.in_service_entity) |
8823 |
+ bfq_reparent_leaf_entity(bfqd, |
8824 |
+ bfqg->sched_data.in_service_entity); |
8825 |
+- |
8826 |
+- return; |
8827 |
+ } |
8828 |
+ |
8829 |
+ /** |
8830 |
+- * bfq_destroy_group - destroy @bfqg. |
8831 |
+- * @bfqg: the group being destroyed. |
8832 |
++ * bfq_pd_offline - deactivate the entity associated with @pd, |
8833 |
++ * and reparent its children entities. |
8834 |
++ * @pd: descriptor of the policy going offline. |
8835 |
+ * |
8836 |
+- * Destroy @bfqg, making sure that it is not referenced from its parent. |
8837 |
+- * blkio already grabs the queue_lock for us, so no need to use RCU-based magic |
8838 |
++ * blkio already grabs the queue_lock for us, so no need to use |
8839 |
++ * RCU-based magic |
8840 |
+ */ |
8841 |
+ static void bfq_pd_offline(struct blkg_policy_data *pd) |
8842 |
+ { |
8843 |
+@@ -779,6 +762,12 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) |
8844 |
+ bfq_put_async_queues(bfqd, bfqg); |
8845 |
+ BUG_ON(entity->tree); |
8846 |
+ |
8847 |
++ /* |
8848 |
++ * @blkg is going offline and will be ignored by |
8849 |
++ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so |
8850 |
++ * that they don't get lost. If IOs complete after this point, the |
8851 |
++ * stats for them will be lost. Oh well... |
8852 |
++ */ |
8853 |
+ bfqg_stats_xfer_dead(bfqg); |
8854 |
+ } |
8855 |
+ |
8856 |
+@@ -788,46 +777,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) |
8857 |
+ |
8858 |
+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { |
8859 |
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); |
8860 |
++ BUG_ON(!bfqg); |
8861 |
+ |
8862 |
+ bfq_end_wr_async_queues(bfqd, bfqg); |
8863 |
+ } |
8864 |
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); |
8865 |
+ } |
8866 |
+ |
8867 |
+-static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, |
8868 |
+- struct cftype *cftype) |
8869 |
+-{ |
8870 |
+- struct blkcg *blkcg = css_to_blkcg(css); |
8871 |
+- struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); |
8872 |
+- int ret = -EINVAL; |
8873 |
+- |
8874 |
+- spin_lock_irq(&blkcg->lock); |
8875 |
+- ret = bfqgd->weight; |
8876 |
+- spin_unlock_irq(&blkcg->lock); |
8877 |
+- |
8878 |
+- return ret; |
8879 |
+-} |
8880 |
+- |
8881 |
+-static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) |
8882 |
++static int bfq_io_show_weight(struct seq_file *sf, void *v) |
8883 |
+ { |
8884 |
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); |
8885 |
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); |
8886 |
++ unsigned int val = 0; |
8887 |
+ |
8888 |
+- spin_lock_irq(&blkcg->lock); |
8889 |
+- seq_printf(sf, "%u\n", bfqgd->weight); |
8890 |
+- spin_unlock_irq(&blkcg->lock); |
8891 |
++ if (bfqgd) |
8892 |
++ val = bfqgd->weight; |
8893 |
++ |
8894 |
++ seq_printf(sf, "%u\n", val); |
8895 |
+ |
8896 |
+ return 0; |
8897 |
+ } |
8898 |
+ |
8899 |
+-static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, |
8900 |
+- struct cftype *cftype, |
8901 |
+- u64 val) |
8902 |
++static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, |
8903 |
++ struct cftype *cftype, |
8904 |
++ u64 val) |
8905 |
+ { |
8906 |
+ struct blkcg *blkcg = css_to_blkcg(css); |
8907 |
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); |
8908 |
+ struct blkcg_gq *blkg; |
8909 |
+- int ret = -EINVAL; |
8910 |
++ int ret = -ERANGE; |
8911 |
+ |
8912 |
+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) |
8913 |
+ return ret; |
8914 |
+@@ -837,6 +815,7 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, |
8915 |
+ bfqgd->weight = (unsigned short)val; |
8916 |
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { |
8917 |
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); |
8918 |
++ |
8919 |
+ if (!bfqg) |
8920 |
+ continue; |
8921 |
+ /* |
8922 |
+@@ -871,13 +850,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, |
8923 |
+ return ret; |
8924 |
+ } |
8925 |
+ |
8926 |
+-static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, |
8927 |
+- char *buf, size_t nbytes, |
8928 |
+- loff_t off) |
8929 |
++static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, |
8930 |
++ char *buf, size_t nbytes, |
8931 |
++ loff_t off) |
8932 |
+ { |
8933 |
++ u64 weight; |
8934 |
+ /* First unsigned long found in the file is used */ |
8935 |
+- return bfqio_cgroup_weight_write(of_css(of), NULL, |
8936 |
+- simple_strtoull(strim(buf), NULL, 0)); |
8937 |
++ int ret = kstrtoull(strim(buf), 0, &weight); |
8938 |
++ |
8939 |
++ if (ret) |
8940 |
++ return ret; |
8941 |
++ |
8942 |
++ return bfq_io_set_weight_legacy(of_css(of), NULL, weight); |
8943 |
+ } |
8944 |
+ |
8945 |
+ static int bfqg_print_stat(struct seq_file *sf, void *v) |
8946 |
+@@ -897,16 +881,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) |
8947 |
+ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, |
8948 |
+ struct blkg_policy_data *pd, int off) |
8949 |
+ { |
8950 |
+- u64 sum = bfqg_stat_pd_recursive_sum(pd, off); |
8951 |
+- |
8952 |
++ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), |
8953 |
++ &blkcg_policy_bfq, off); |
8954 |
+ return __blkg_prfill_u64(sf, pd, sum); |
8955 |
+ } |
8956 |
+ |
8957 |
+ static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, |
8958 |
+ struct blkg_policy_data *pd, int off) |
8959 |
+ { |
8960 |
+- struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); |
8961 |
+- |
8962 |
++ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), |
8963 |
++ &blkcg_policy_bfq, |
8964 |
++ off); |
8965 |
+ return __blkg_prfill_rwstat(sf, pd, &sum); |
8966 |
+ } |
8967 |
+ |
8968 |
+@@ -926,6 +911,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) |
8969 |
+ return 0; |
8970 |
+ } |
8971 |
+ |
8972 |
++static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, |
8973 |
++ int off) |
8974 |
++{ |
8975 |
++ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); |
8976 |
++ |
8977 |
++ return __blkg_prfill_u64(sf, pd, sum >> 9); |
8978 |
++} |
8979 |
++ |
8980 |
++static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) |
8981 |
++{ |
8982 |
++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), |
8983 |
++ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); |
8984 |
++ return 0; |
8985 |
++} |
8986 |
++ |
8987 |
++static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, |
8988 |
++ struct blkg_policy_data *pd, int off) |
8989 |
++{ |
8990 |
++ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, |
8991 |
++ offsetof(struct blkcg_gq, stat_bytes)); |
8992 |
++ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + |
8993 |
++ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); |
8994 |
++ |
8995 |
++ return __blkg_prfill_u64(sf, pd, sum >> 9); |
8996 |
++} |
8997 |
++ |
8998 |
++static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) |
8999 |
++{ |
9000 |
++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), |
9001 |
++ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, |
9002 |
++ false); |
9003 |
++ return 0; |
9004 |
++} |
9005 |
++ |
9006 |
++ |
9007 |
+ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, |
9008 |
+ struct blkg_policy_data *pd, int off) |
9009 |
+ { |
9010 |
+@@ -950,7 +970,8 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) |
9011 |
+ return 0; |
9012 |
+ } |
9013 |
+ |
9014 |
+-static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) |
9015 |
++static struct bfq_group * |
9016 |
++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) |
9017 |
+ { |
9018 |
+ int ret; |
9019 |
+ |
9020 |
+@@ -958,41 +979,18 @@ static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int n |
9021 |
+ if (ret) |
9022 |
+ return NULL; |
9023 |
+ |
9024 |
+- return blkg_to_bfqg(bfqd->queue->root_blkg); |
9025 |
+-} |
9026 |
+- |
9027 |
+-static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) |
9028 |
+-{ |
9029 |
+- struct bfq_group_data *bgd; |
9030 |
+- |
9031 |
+- bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); |
9032 |
+- if (!bgd) |
9033 |
+- return NULL; |
9034 |
+- return &bgd->pd; |
9035 |
++ return blkg_to_bfqg(bfqd->queue->root_blkg); |
9036 |
+ } |
9037 |
+ |
9038 |
+-static void bfq_cpd_free(struct blkcg_policy_data *cpd) |
9039 |
+-{ |
9040 |
+- kfree(cpd_to_bfqgd(cpd)); |
9041 |
+-} |
9042 |
+- |
9043 |
+-static struct cftype bfqio_files_dfl[] = { |
9044 |
++static struct cftype bfq_blkcg_legacy_files[] = { |
9045 |
+ { |
9046 |
+- .name = "weight", |
9047 |
++ .name = "bfq.weight", |
9048 |
+ .flags = CFTYPE_NOT_ON_ROOT, |
9049 |
+- .seq_show = bfqio_cgroup_weight_read_dfl, |
9050 |
+- .write = bfqio_cgroup_weight_write_dfl, |
9051 |
++ .seq_show = bfq_io_show_weight, |
9052 |
++ .write_u64 = bfq_io_set_weight_legacy, |
9053 |
+ }, |
9054 |
+- {} /* terminate */ |
9055 |
+-}; |
9056 |
+ |
9057 |
+-static struct cftype bfqio_files[] = { |
9058 |
+- { |
9059 |
+- .name = "bfq.weight", |
9060 |
+- .read_u64 = bfqio_cgroup_weight_read, |
9061 |
+- .write_u64 = bfqio_cgroup_weight_write, |
9062 |
+- }, |
9063 |
+- /* statistics, cover only the tasks in the bfqg */ |
9064 |
++ /* statistics, covers only the tasks in the bfqg */ |
9065 |
+ { |
9066 |
+ .name = "bfq.time", |
9067 |
+ .private = offsetof(struct bfq_group, stats.time), |
9068 |
+@@ -1000,18 +998,17 @@ static struct cftype bfqio_files[] = { |
9069 |
+ }, |
9070 |
+ { |
9071 |
+ .name = "bfq.sectors", |
9072 |
+- .private = offsetof(struct bfq_group, stats.sectors), |
9073 |
+- .seq_show = bfqg_print_stat, |
9074 |
++ .seq_show = bfqg_print_stat_sectors, |
9075 |
+ }, |
9076 |
+ { |
9077 |
+ .name = "bfq.io_service_bytes", |
9078 |
+- .private = offsetof(struct bfq_group, stats.service_bytes), |
9079 |
+- .seq_show = bfqg_print_rwstat, |
9080 |
++ .private = (unsigned long)&blkcg_policy_bfq, |
9081 |
++ .seq_show = blkg_print_stat_bytes, |
9082 |
+ }, |
9083 |
+ { |
9084 |
+ .name = "bfq.io_serviced", |
9085 |
+- .private = offsetof(struct bfq_group, stats.serviced), |
9086 |
+- .seq_show = bfqg_print_rwstat, |
9087 |
++ .private = (unsigned long)&blkcg_policy_bfq, |
9088 |
++ .seq_show = blkg_print_stat_ios, |
9089 |
+ }, |
9090 |
+ { |
9091 |
+ .name = "bfq.io_service_time", |
9092 |
+@@ -1042,18 +1039,17 @@ static struct cftype bfqio_files[] = { |
9093 |
+ }, |
9094 |
+ { |
9095 |
+ .name = "bfq.sectors_recursive", |
9096 |
+- .private = offsetof(struct bfq_group, stats.sectors), |
9097 |
+- .seq_show = bfqg_print_stat_recursive, |
9098 |
++ .seq_show = bfqg_print_stat_sectors_recursive, |
9099 |
+ }, |
9100 |
+ { |
9101 |
+ .name = "bfq.io_service_bytes_recursive", |
9102 |
+- .private = offsetof(struct bfq_group, stats.service_bytes), |
9103 |
+- .seq_show = bfqg_print_rwstat_recursive, |
9104 |
++ .private = (unsigned long)&blkcg_policy_bfq, |
9105 |
++ .seq_show = blkg_print_stat_bytes_recursive, |
9106 |
+ }, |
9107 |
+ { |
9108 |
+ .name = "bfq.io_serviced_recursive", |
9109 |
+- .private = offsetof(struct bfq_group, stats.serviced), |
9110 |
+- .seq_show = bfqg_print_rwstat_recursive, |
9111 |
++ .private = (unsigned long)&blkcg_policy_bfq, |
9112 |
++ .seq_show = blkg_print_stat_ios_recursive, |
9113 |
+ }, |
9114 |
+ { |
9115 |
+ .name = "bfq.io_service_time_recursive", |
9116 |
+@@ -1099,32 +1095,35 @@ static struct cftype bfqio_files[] = { |
9117 |
+ .private = offsetof(struct bfq_group, stats.dequeue), |
9118 |
+ .seq_show = bfqg_print_stat, |
9119 |
+ }, |
9120 |
+- { |
9121 |
+- .name = "bfq.unaccounted_time", |
9122 |
+- .private = offsetof(struct bfq_group, stats.unaccounted_time), |
9123 |
+- .seq_show = bfqg_print_stat, |
9124 |
+- }, |
9125 |
+ { } /* terminate */ |
9126 |
+ }; |
9127 |
+ |
9128 |
+-static struct blkcg_policy blkcg_policy_bfq = { |
9129 |
+- .dfl_cftypes = bfqio_files_dfl, |
9130 |
+- .legacy_cftypes = bfqio_files, |
9131 |
+- |
9132 |
+- .pd_alloc_fn = bfq_pd_alloc, |
9133 |
+- .pd_init_fn = bfq_pd_init, |
9134 |
+- .pd_offline_fn = bfq_pd_offline, |
9135 |
+- .pd_free_fn = bfq_pd_free, |
9136 |
+- .pd_reset_stats_fn = bfq_pd_reset_stats, |
9137 |
+- |
9138 |
+- .cpd_alloc_fn = bfq_cpd_alloc, |
9139 |
+- .cpd_init_fn = bfq_cpd_init, |
9140 |
+- .cpd_bind_fn = bfq_cpd_init, |
9141 |
+- .cpd_free_fn = bfq_cpd_free, |
9142 |
+- |
9143 |
++static struct cftype bfq_blkg_files[] = { |
9144 |
++ { |
9145 |
++ .name = "bfq.weight", |
9146 |
++ .flags = CFTYPE_NOT_ON_ROOT, |
9147 |
++ .seq_show = bfq_io_show_weight, |
9148 |
++ .write = bfq_io_set_weight, |
9149 |
++ }, |
9150 |
++ {} /* terminate */ |
9151 |
+ }; |
9152 |
+ |
9153 |
+-#else |
9154 |
++#else /* CONFIG_BFQ_GROUP_IOSCHED */ |
9155 |
++ |
9156 |
++static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, |
9157 |
++ struct bfq_queue *bfqq, int rw) { } |
9158 |
++static inline void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) { } |
9159 |
++static inline void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) { } |
9160 |
++static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, |
9161 |
++ uint64_t start_time, uint64_t io_start_time, int rw) { } |
9162 |
++static inline void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, |
9163 |
++struct bfq_group *curr_bfqg) { } |
9164 |
++static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } |
9165 |
++static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } |
9166 |
++static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } |
9167 |
++static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } |
9168 |
++static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } |
9169 |
++static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } |
9170 |
+ |
9171 |
+ static void bfq_init_entity(struct bfq_entity *entity, |
9172 |
+ struct bfq_group *bfqg) |
9173 |
+@@ -1146,29 +1145,22 @@ bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) |
9174 |
+ return bfqd->root_group; |
9175 |
+ } |
9176 |
+ |
9177 |
+-static void bfq_bfqq_move(struct bfq_data *bfqd, |
9178 |
+- struct bfq_queue *bfqq, |
9179 |
+- struct bfq_entity *entity, |
9180 |
+- struct bfq_group *bfqg) |
9181 |
+-{ |
9182 |
+-} |
9183 |
+- |
9184 |
+ static void bfq_end_wr_async(struct bfq_data *bfqd) |
9185 |
+ { |
9186 |
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); |
9187 |
+ } |
9188 |
+ |
9189 |
+-static void bfq_disconnect_groups(struct bfq_data *bfqd) |
9190 |
+-{ |
9191 |
+- bfq_put_async_queues(bfqd, bfqd->root_group); |
9192 |
+-} |
9193 |
+- |
9194 |
+ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
9195 |
+ struct blkcg *blkcg) |
9196 |
+ { |
9197 |
+ return bfqd->root_group; |
9198 |
+ } |
9199 |
+ |
9200 |
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) |
9201 |
++{ |
9202 |
++ return bfqq->bfqd->root_group; |
9203 |
++} |
9204 |
++ |
9205 |
+ static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) |
9206 |
+ { |
9207 |
+ struct bfq_group *bfqg; |
9208 |
+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
9209 |
+index d1f648d..5469442 100644 |
9210 |
+--- a/block/bfq-iosched.c |
9211 |
++++ b/block/bfq-iosched.c |
9212 |
+@@ -7,25 +7,26 @@ |
9213 |
+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
9214 |
+ * Paolo Valente <paolo.valente@×××××××.it> |
9215 |
+ * |
9216 |
+- * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
9217 |
++ * Copyright (C) 2016 Paolo Valente <paolo.valente@×××××××.it> |
9218 |
+ * |
9219 |
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ |
9220 |
+ * file. |
9221 |
+ * |
9222 |
+- * BFQ is a proportional-share storage-I/O scheduling algorithm based on |
9223 |
+- * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, |
9224 |
+- * measured in number of sectors, to processes instead of time slices. The |
9225 |
+- * device is not granted to the in-service process for a given time slice, |
9226 |
+- * but until it has exhausted its assigned budget. This change from the time |
9227 |
+- * to the service domain allows BFQ to distribute the device throughput |
9228 |
+- * among processes as desired, without any distortion due to ZBR, workload |
9229 |
+- * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, |
9230 |
+- * called B-WF2Q+, to schedule processes according to their budgets. More |
9231 |
+- * precisely, BFQ schedules queues associated to processes. Thanks to the |
9232 |
+- * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to |
9233 |
+- * I/O-bound processes issuing sequential requests (to boost the |
9234 |
+- * throughput), and yet guarantee a low latency to interactive and soft |
9235 |
+- * real-time applications. |
9236 |
++ * BFQ is a proportional-share storage-I/O scheduling algorithm based |
9237 |
++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns |
9238 |
++ * budgets, measured in number of sectors, to processes instead of |
9239 |
++ * time slices. The device is not granted to the in-service process |
9240 |
++ * for a given time slice, but until it has exhausted its assigned |
9241 |
++ * budget. This change from the time to the service domain enables BFQ |
9242 |
++ * to distribute the device throughput among processes as desired, |
9243 |
++ * without any distortion due to throughput fluctuations, or to device |
9244 |
++ * internal queueing. BFQ uses an ad hoc internal scheduler, called |
9245 |
++ * B-WF2Q+, to schedule processes according to their budgets. More |
9246 |
++ * precisely, BFQ schedules queues associated with processes. Thanks to |
9247 |
++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high |
9248 |
++ * budgets to I/O-bound processes issuing sequential requests (to |
9249 |
++ * boost the throughput), and yet guarantee a low latency to |
9250 |
++ * interactive and soft real-time applications. |
9251 |
+ * |
9252 |
+ * BFQ is described in [1], where also a reference to the initial, more |
9253 |
+ * theoretical paper on BFQ can be found. The interested reader can find |
9254 |
+@@ -87,7 +88,6 @@ static const int bfq_stats_min_budgets = 194; |
9255 |
+ |
9256 |
+ /* Default maximum budget values, in sectors and number of requests. */ |
9257 |
+ static const int bfq_default_max_budget = 16 * 1024; |
9258 |
+-static const int bfq_max_budget_async_rq = 4; |
9259 |
+ |
9260 |
+ /* |
9261 |
+ * Async to sync throughput distribution is controlled as follows: |
9262 |
+@@ -97,8 +97,7 @@ static const int bfq_max_budget_async_rq = 4; |
9263 |
+ static const int bfq_async_charge_factor = 10; |
9264 |
+ |
9265 |
+ /* Default timeout values, in jiffies, approximating CFQ defaults. */ |
9266 |
+-static const int bfq_timeout_sync = HZ / 8; |
9267 |
+-static int bfq_timeout_async = HZ / 25; |
9268 |
++static const int bfq_timeout = HZ / 8; |
9269 |
+ |
9270 |
+ struct kmem_cache *bfq_pool; |
9271 |
+ |
9272 |
+@@ -109,8 +108,9 @@ struct kmem_cache *bfq_pool; |
9273 |
+ #define BFQ_HW_QUEUE_THRESHOLD 4 |
9274 |
+ #define BFQ_HW_QUEUE_SAMPLES 32 |
9275 |
+ |
9276 |
+-#define BFQQ_SEEK_THR (sector_t)(8 * 1024) |
9277 |
+-#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) |
9278 |
++#define BFQQ_SEEK_THR (sector_t)(8 * 100) |
9279 |
++#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) |
9280 |
++#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) |
9281 |
+ |
9282 |
+ /* Min samples used for peak rate estimation (for autotuning). */ |
9283 |
+ #define BFQ_PEAK_RATE_SAMPLES 32 |
9284 |
+@@ -141,16 +141,24 @@ struct kmem_cache *bfq_pool; |
9285 |
+ * The device's speed class is dynamically (re)detected in |
9286 |
+ * bfq_update_peak_rate() every time the estimated peak rate is updated. |
9287 |
+ * |
9288 |
+- * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] |
9289 |
+- * are the reference values for a slow/fast rotational device, whereas |
9290 |
+- * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for |
9291 |
+- * a slow/fast non-rotational device. Finally, device_speed_thresh are the |
9292 |
+- * thresholds used to switch between speed classes. |
9293 |
++ * In the following definitions, R_slow[0]/R_fast[0] and |
9294 |
++ * T_slow[0]/T_fast[0] are the reference values for a slow/fast |
9295 |
++ * rotational device, whereas R_slow[1]/R_fast[1] and |
9296 |
++ * T_slow[1]/T_fast[1] are the reference values for a slow/fast |
9297 |
++ * non-rotational device. Finally, device_speed_thresh are the |
9298 |
++ * thresholds used to switch between speed classes. The reference |
9299 |
++ * rates are not the actual peak rates of the devices used as a |
9300 |
++ * reference, but slightly lower values. The reason for using these |
9301 |
++ * slightly lower values is that the peak-rate estimator tends to |
9302 |
++ * yield slightly lower values than the actual peak rate (it can yield |
9303 |
++ * the actual peak rate only if there is only one process doing I/O, |
9304 |
++ * and the process does sequential I/O). |
9305 |
++ * |
9306 |
+ * Both the reference peak rates and the thresholds are measured in |
9307 |
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. |
9308 |
+ */ |
9309 |
+-static int R_slow[2] = {1536, 10752}; |
9310 |
+-static int R_fast[2] = {17415, 34791}; |
9311 |
++static int R_slow[2] = {1000, 10700}; |
9312 |
++static int R_fast[2] = {14000, 33000}; |
9313 |
+ /* |
9314 |
+ * To improve readability, a conversion function is used to initialize the |
9315 |
+ * following arrays, which entails that they can be initialized only in a |
9316 |
+@@ -410,11 +418,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) |
9317 |
+ */ |
9318 |
+ static bool bfq_symmetric_scenario(struct bfq_data *bfqd) |
9319 |
+ { |
9320 |
+- return |
9321 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
9322 |
+- !bfqd->active_numerous_groups && |
9323 |
+-#endif |
9324 |
+- !bfq_differentiated_weights(bfqd); |
9325 |
++ return !bfq_differentiated_weights(bfqd); |
9326 |
+ } |
9327 |
+ |
9328 |
+ /* |
9329 |
+@@ -534,9 +538,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, |
9330 |
+ static unsigned long bfq_serv_to_charge(struct request *rq, |
9331 |
+ struct bfq_queue *bfqq) |
9332 |
+ { |
9333 |
+- return blk_rq_sectors(rq) * |
9334 |
+- (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * |
9335 |
+- bfq_async_charge_factor)); |
9336 |
++ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) |
9337 |
++ return blk_rq_sectors(rq); |
9338 |
++ |
9339 |
++ /* |
9340 |
++ * If there are no weight-raised queues, then amplify service |
9341 |
++ * by just the async charge factor; otherwise amplify service |
9342 |
++ * by twice the async charge factor, to further reduce latency |
9343 |
++ * for weight-raised queues. |
9344 |
++ */ |
9345 |
++ if (bfqq->bfqd->wr_busy_queues == 0) |
9346 |
++ return blk_rq_sectors(rq) * bfq_async_charge_factor; |
9347 |
++ |
9348 |
++ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; |
9349 |
+ } |
9350 |
+ |
9351 |
+ /** |
9352 |
+@@ -591,12 +605,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) |
9353 |
+ dur = bfqd->RT_prod; |
9354 |
+ do_div(dur, bfqd->peak_rate); |
9355 |
+ |
9356 |
+- return dur; |
9357 |
+-} |
9358 |
++ /* |
9359 |
++ * Limit duration between 3 and 13 seconds. Tests show that |
9360 |
++ * higher values than 13 seconds often yield the opposite of |
9361 |
++ * the desired result, i.e., worsen responsiveness by letting |
9362 |
++ * non-interactive and non-soft-real-time applications |
9363 |
++ * preserve weight raising for too long a time interval. |
9364 |
++ * |
9365 |
++ * On the other end, lower values than 3 seconds make it |
9366 |
++ * difficult for most interactive tasks to complete their jobs |
9367 |
++ * before weight-raising finishes. |
9368 |
++ */ |
9369 |
++ if (dur > msecs_to_jiffies(13000)) |
9370 |
++ dur = msecs_to_jiffies(13000); |
9371 |
++ else if (dur < msecs_to_jiffies(3000)) |
9372 |
++ dur = msecs_to_jiffies(3000); |
9373 |
+ |
9374 |
+-static unsigned bfq_bfqq_cooperations(struct bfq_queue *bfqq) |
9375 |
+-{ |
9376 |
+- return bfqq->bic ? bfqq->bic->cooperations : 0; |
9377 |
++ return dur; |
9378 |
+ } |
9379 |
+ |
9380 |
+ static void |
9381 |
+@@ -606,31 +631,11 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
9382 |
+ bfq_mark_bfqq_idle_window(bfqq); |
9383 |
+ else |
9384 |
+ bfq_clear_bfqq_idle_window(bfqq); |
9385 |
++ |
9386 |
+ if (bic->saved_IO_bound) |
9387 |
+ bfq_mark_bfqq_IO_bound(bfqq); |
9388 |
+ else |
9389 |
+ bfq_clear_bfqq_IO_bound(bfqq); |
9390 |
+- /* Assuming that the flag in_large_burst is already correctly set */ |
9391 |
+- if (bic->wr_time_left && bfqq->bfqd->low_latency && |
9392 |
+- !bfq_bfqq_in_large_burst(bfqq) && |
9393 |
+- bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { |
9394 |
+- /* |
9395 |
+- * Start a weight raising period with the duration given by |
9396 |
+- * the raising_time_left snapshot. |
9397 |
+- */ |
9398 |
+- if (bfq_bfqq_busy(bfqq)) |
9399 |
+- bfqq->bfqd->wr_busy_queues++; |
9400 |
+- bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; |
9401 |
+- bfqq->wr_cur_max_time = bic->wr_time_left; |
9402 |
+- bfqq->last_wr_start_finish = jiffies; |
9403 |
+- bfqq->entity.prio_changed = 1; |
9404 |
+- } |
9405 |
+- /* |
9406 |
+- * Clear wr_time_left to prevent bfq_bfqq_save_state() from |
9407 |
+- * getting confused about the queue's need of a weight-raising |
9408 |
+- * period. |
9409 |
+- */ |
9410 |
+- bic->wr_time_left = 0; |
9411 |
+ } |
9412 |
+ |
9413 |
+ static int bfqq_process_refs(struct bfq_queue *bfqq) |
9414 |
+@@ -640,7 +645,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) |
9415 |
+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); |
9416 |
+ |
9417 |
+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
9418 |
+- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
9419 |
++ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; |
9420 |
+ BUG_ON(process_refs < 0); |
9421 |
+ return process_refs; |
9422 |
+ } |
9423 |
+@@ -655,6 +660,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
9424 |
+ hlist_del_init(&item->burst_list_node); |
9425 |
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); |
9426 |
+ bfqd->burst_size = 1; |
9427 |
++ bfqd->burst_parent_entity = bfqq->entity.parent; |
9428 |
+ } |
9429 |
+ |
9430 |
+ /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ |
9431 |
+@@ -663,6 +669,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
9432 |
+ /* Increment burst size to take into account also bfqq */ |
9433 |
+ bfqd->burst_size++; |
9434 |
+ |
9435 |
++ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); |
9436 |
++ |
9437 |
++ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); |
9438 |
++ |
9439 |
+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { |
9440 |
+ struct bfq_queue *pos, *bfqq_item; |
9441 |
+ struct hlist_node *n; |
9442 |
+@@ -672,15 +682,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
9443 |
+ * other to consider this burst as large. |
9444 |
+ */ |
9445 |
+ bfqd->large_burst = true; |
9446 |
++ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); |
9447 |
+ |
9448 |
+ /* |
9449 |
+ * We can now mark all queues in the burst list as |
9450 |
+ * belonging to a large burst. |
9451 |
+ */ |
9452 |
+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, |
9453 |
+- burst_list_node) |
9454 |
++ burst_list_node) { |
9455 |
+ bfq_mark_bfqq_in_large_burst(bfqq_item); |
9456 |
++ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); |
9457 |
++ } |
9458 |
+ bfq_mark_bfqq_in_large_burst(bfqq); |
9459 |
++ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); |
9460 |
+ |
9461 |
+ /* |
9462 |
+ * From now on, and until the current burst finishes, any |
9463 |
+@@ -692,67 +706,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
9464 |
+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, |
9465 |
+ burst_list_node) |
9466 |
+ hlist_del_init(&pos->burst_list_node); |
9467 |
+- } else /* burst not yet large: add bfqq to the burst list */ |
9468 |
++ } else /* |
9469 |
++ * Burst not yet large: add bfqq to the burst list. Do |
9470 |
++ * not increment the ref counter for bfqq, because bfqq |
9471 |
++ * is removed from the burst list before freeing bfqq |
9472 |
++ * in put_queue. |
9473 |
++ */ |
9474 |
+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); |
9475 |
+ } |
9476 |
+ |
9477 |
+ /* |
9478 |
+- * If many queues happen to become active shortly after each other, then, |
9479 |
+- * to help the processes associated to these queues get their job done as |
9480 |
+- * soon as possible, it is usually better to not grant either weight-raising |
9481 |
+- * or device idling to these queues. In this comment we describe, firstly, |
9482 |
+- * the reasons why this fact holds, and, secondly, the next function, which |
9483 |
+- * implements the main steps needed to properly mark these queues so that |
9484 |
+- * they can then be treated in a different way. |
9485 |
++ * If many queues belonging to the same group happen to be created |
9486 |
++ * shortly after each other, then the processes associated with these |
9487 |
++ * queues have typically a common goal. In particular, bursts of queue |
9488 |
++ * creations are usually caused by services or applications that spawn |
9489 |
++ * many parallel threads/processes. Examples are systemd during boot, |
9490 |
++ * or git grep. To help these processes get their job done as soon as |
9491 |
++ * possible, it is usually better to not grant either weight-raising |
9492 |
++ * or device idling to their queues. |
9493 |
+ * |
9494 |
+- * As for the terminology, we say that a queue becomes active, i.e., |
9495 |
+- * switches from idle to backlogged, either when it is created (as a |
9496 |
+- * consequence of the arrival of an I/O request), or, if already existing, |
9497 |
+- * when a new request for the queue arrives while the queue is idle. |
9498 |
+- * Bursts of activations, i.e., activations of different queues occurring |
9499 |
+- * shortly after each other, are typically caused by services or applications |
9500 |
+- * that spawn or reactivate many parallel threads/processes. Examples are |
9501 |
+- * systemd during boot or git grep. |
9502 |
++ * In this comment we describe, firstly, the reasons why this fact |
9503 |
++ * holds, and, secondly, the next function, which implements the main |
9504 |
++ * steps needed to properly mark these queues so that they can then be |
9505 |
++ * treated in a different way. |
9506 |
+ * |
9507 |
+- * These services or applications benefit mostly from a high throughput: |
9508 |
+- * the quicker the requests of the activated queues are cumulatively served, |
9509 |
+- * the sooner the target job of these queues gets completed. As a consequence, |
9510 |
+- * weight-raising any of these queues, which also implies idling the device |
9511 |
+- * for it, is almost always counterproductive: in most cases it just lowers |
9512 |
+- * throughput. |
9513 |
++ * The above services or applications benefit mostly from a high |
9514 |
++ * throughput: the quicker the requests of the activated queues are |
9515 |
++ * cumulatively served, the sooner the target job of these queues gets |
9516 |
++ * completed. As a consequence, weight-raising any of these queues, |
9517 |
++ * which also implies idling the device for it, is almost always |
9518 |
++ * counterproductive. In most cases it just lowers throughput. |
9519 |
+ * |
9520 |
+- * On the other hand, a burst of activations may be also caused by the start |
9521 |
+- * of an application that does not consist in a lot of parallel I/O-bound |
9522 |
+- * threads. In fact, with a complex application, the burst may be just a |
9523 |
+- * consequence of the fact that several processes need to be executed to |
9524 |
+- * start-up the application. To start an application as quickly as possible, |
9525 |
+- * the best thing to do is to privilege the I/O related to the application |
9526 |
+- * with respect to all other I/O. Therefore, the best strategy to start as |
9527 |
+- * quickly as possible an application that causes a burst of activations is |
9528 |
+- * to weight-raise all the queues activated during the burst. This is the |
9529 |
++ * On the other hand, a burst of queue creations may be caused also by |
9530 |
++ * the start of an application that does not consist of a lot of |
9531 |
++ * parallel I/O-bound threads. In fact, with a complex application, |
9532 |
++ * several short processes may need to be executed to start-up the |
9533 |
++ * application. In this respect, to start an application as quickly as |
9534 |
++ * possible, the best thing to do is in any case to privilege the I/O |
9535 |
++ * related to the application with respect to all other |
9536 |
++ * I/O. Therefore, the best strategy to start as quickly as possible |
9537 |
++ * an application that causes a burst of queue creations is to |
9538 |
++ * weight-raise all the queues created during the burst. This is the |
9539 |
+ * exact opposite of the best strategy for the other type of bursts. |
9540 |
+ * |
9541 |
+- * In the end, to take the best action for each of the two cases, the two |
9542 |
+- * types of bursts need to be distinguished. Fortunately, this seems |
9543 |
+- * relatively easy to do, by looking at the sizes of the bursts. In |
9544 |
+- * particular, we found a threshold such that bursts with a larger size |
9545 |
+- * than that threshold are apparently caused only by services or commands |
9546 |
+- * such as systemd or git grep. For brevity, hereafter we call just 'large' |
9547 |
+- * these bursts. BFQ *does not* weight-raise queues whose activations occur |
9548 |
+- * in a large burst. In addition, for each of these queues BFQ performs or |
9549 |
+- * does not perform idling depending on which choice boosts the throughput |
9550 |
+- * most. The exact choice depends on the device and request pattern at |
9551 |
++ * In the end, to take the best action for each of the two cases, the |
9552 |
++ * two types of bursts need to be distinguished. Fortunately, this |
9553 |
++ * seems relatively easy, by looking at the sizes of the bursts. In |
9554 |
++ * particular, we found a threshold such that only bursts with a |
9555 |
++ * larger size than that threshold are apparently caused by |
9556 |
++ * services or commands such as systemd or git grep. For brevity, |
9557 |
++ * hereafter we simply call these bursts 'large'. BFQ *does not* |
9558 |
++ * weight-raise queues whose creation occurs in a large burst. In |
9559 |
++ * addition, for each of these queues BFQ performs or does not perform |
9560 |
++ * idling depending on which choice boosts the throughput more. The |
9561 |
++ * exact choice depends on the device and request pattern at |
9562 |
+ * hand. |
9563 |
+ * |
9564 |
+- * Turning back to the next function, it implements all the steps needed |
9565 |
+- * to detect the occurrence of a large burst and to properly mark all the |
9566 |
+- * queues belonging to it (so that they can then be treated in a different |
9567 |
+- * way). This goal is achieved by maintaining a special "burst list" that |
9568 |
+- * holds, temporarily, the queues that belong to the burst in progress. The |
9569 |
+- * list is then used to mark these queues as belonging to a large burst if |
9570 |
+- * the burst does become large. The main steps are the following. |
9571 |
++ * Unfortunately, false positives may occur while an interactive task |
9572 |
++ * is starting (e.g., an application is being started). The |
9573 |
++ * consequence is that the queues associated with the task do not |
9574 |
++ * enjoy weight raising as expected. Fortunately these false positives |
9575 |
++ * are very rare. They typically occur if some service happens to |
9576 |
++ * start doing I/O exactly when the interactive task starts. |
9577 |
++ * |
9578 |
++ * Turning back to the next function, it implements all the steps |
9579 |
++ * needed to detect the occurrence of a large burst and to properly |
9580 |
++ * mark all the queues belonging to it (so that they can then be |
9581 |
++ * treated in a different way). This goal is achieved by maintaining a |
9582 |
++ * "burst list" that holds, temporarily, the queues that belong to the |
9583 |
++ * burst in progress. The list is then used to mark these queues as |
9584 |
++ * belonging to a large burst if the burst does become large. The main |
9585 |
++ * steps are the following. |
9586 |
+ * |
9587 |
+- * . when the very first queue is activated, the queue is inserted into the |
9588 |
++ * . when the very first queue is created, the queue is inserted into the |
9589 |
+ * list (as it could be the first queue in a possible burst) |
9590 |
+ * |
9591 |
+ * . if the current burst has not yet become large, and a queue Q that does |
9592 |
+@@ -773,13 +799,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
9593 |
+ * |
9594 |
+ * . the device enters a large-burst mode |
9595 |
+ * |
9596 |
+- * . if a queue Q that does not belong to the burst is activated while |
9597 |
++ * . if a queue Q that does not belong to the burst is created while |
9598 |
+ * the device is in large-burst mode and shortly after the last time |
9599 |
+ * at which a queue either entered the burst list or was marked as |
9600 |
+ * belonging to the current large burst, then Q is immediately marked |
9601 |
+ * as belonging to a large burst. |
9602 |
+ * |
9603 |
+- * . if a queue Q that does not belong to the burst is activated a while |
9604 |
++ * . if a queue Q that does not belong to the burst is created a while |
9605 |
+ * later, i.e., not shortly after, than the last time at which a queue |
9606 |
+ * either entered the burst list or was marked as belonging to the |
9607 |
+ * current large burst, then the current burst is deemed as finished and: |
9608 |
+@@ -792,52 +818,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
9609 |
+ * in a possible new burst (then the burst list contains just Q |
9610 |
+ * after this step). |
9611 |
+ */ |
9612 |
+-static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
9613 |
+- bool idle_for_long_time) |
9614 |
++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
9615 |
+ { |
9616 |
+ /* |
9617 |
+- * If bfqq happened to be activated in a burst, but has been idle |
9618 |
+- * for at least as long as an interactive queue, then we assume |
9619 |
+- * that, in the overall I/O initiated in the burst, the I/O |
9620 |
+- * associated to bfqq is finished. So bfqq does not need to be |
9621 |
+- * treated as a queue belonging to a burst anymore. Accordingly, |
9622 |
+- * we reset bfqq's in_large_burst flag if set, and remove bfqq |
9623 |
+- * from the burst list if it's there. We do not decrement instead |
9624 |
+- * burst_size, because the fact that bfqq does not need to belong |
9625 |
+- * to the burst list any more does not invalidate the fact that |
9626 |
+- * bfqq may have been activated during the current burst. |
9627 |
+- */ |
9628 |
+- if (idle_for_long_time) { |
9629 |
+- hlist_del_init(&bfqq->burst_list_node); |
9630 |
+- bfq_clear_bfqq_in_large_burst(bfqq); |
9631 |
+- } |
9632 |
+- |
9633 |
+- /* |
9634 |
+ * If bfqq is already in the burst list or is part of a large |
9635 |
+- * burst, then there is nothing else to do. |
9636 |
++ * burst, or finally has just been split, then there is |
9637 |
++ * nothing else to do. |
9638 |
+ */ |
9639 |
+ if (!hlist_unhashed(&bfqq->burst_list_node) || |
9640 |
+- bfq_bfqq_in_large_burst(bfqq)) |
9641 |
++ bfq_bfqq_in_large_burst(bfqq) || |
9642 |
++ time_is_after_eq_jiffies(bfqq->split_time + |
9643 |
++ msecs_to_jiffies(10))) |
9644 |
+ return; |
9645 |
+ |
9646 |
+ /* |
9647 |
+- * If bfqq's activation happens late enough, then the current |
9648 |
+- * burst is finished, and related data structures must be reset. |
9649 |
++ * If bfqq's creation happens late enough, or bfqq belongs to |
9650 |
++ * a different group than the burst group, then the current |
9651 |
++ * burst is finished, and related data structures must be |
9652 |
++ * reset. |
9653 |
+ * |
9654 |
+- * In this respect, consider the special case where bfqq is the very |
9655 |
+- * first queue being activated. In this case, last_ins_in_burst is |
9656 |
+- * not yet significant when we get here. But it is easy to verify |
9657 |
+- * that, whether or not the following condition is true, bfqq will |
9658 |
+- * end up being inserted into the burst list. In particular the |
9659 |
+- * list will happen to contain only bfqq. And this is exactly what |
9660 |
+- * has to happen, as bfqq may be the first queue in a possible |
9661 |
++ * In this respect, consider the special case where bfqq is |
9662 |
++ * the very first queue created after BFQ is selected for this |
9663 |
++ * device. In this case, last_ins_in_burst and |
9664 |
++ * burst_parent_entity are not yet significant when we get |
9665 |
++ * here. But it is easy to verify that, whether or not the |
9666 |
++ * following condition is true, bfqq will end up being |
9667 |
++ * inserted into the burst list. In particular the list will |
9668 |
++ * happen to contain only bfqq. And this is exactly what has |
9669 |
++ * to happen, as bfqq may be the first queue of the first |
9670 |
+ * burst. |
9671 |
+ */ |
9672 |
+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + |
9673 |
+- bfqd->bfq_burst_interval)) { |
9674 |
++ bfqd->bfq_burst_interval) || |
9675 |
++ bfqq->entity.parent != bfqd->burst_parent_entity) { |
9676 |
+ bfqd->large_burst = false; |
9677 |
+ bfq_reset_burst_list(bfqd, bfqq); |
9678 |
+- return; |
9679 |
++ bfq_log_bfqq(bfqd, bfqq, |
9680 |
++ "handle_burst: late activation or different group"); |
9681 |
++ goto end; |
9682 |
+ } |
9683 |
+ |
9684 |
+ /* |
9685 |
+@@ -846,8 +864,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
9686 |
+ * bfqq as belonging to this large burst immediately. |
9687 |
+ */ |
9688 |
+ if (bfqd->large_burst) { |
9689 |
++ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); |
9690 |
+ bfq_mark_bfqq_in_large_burst(bfqq); |
9691 |
+- return; |
9692 |
++ goto end; |
9693 |
+ } |
9694 |
+ |
9695 |
+ /* |
9696 |
+@@ -856,25 +875,492 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
9697 |
+ * queue. Then we add bfqq to the burst. |
9698 |
+ */ |
9699 |
+ bfq_add_to_burst(bfqd, bfqq); |
9700 |
++end: |
9701 |
++ /* |
9702 |
++ * At this point, bfqq either has been added to the current |
9703 |
++ * burst or has caused the current burst to terminate and a |
9704 |
++ * possible new burst to start. In particular, in the second |
9705 |
++ * case, bfqq has become the first queue in the possible new |
9706 |
++ * burst. In both cases last_ins_in_burst needs to be moved |
9707 |
++ * forward. |
9708 |
++ */ |
9709 |
++ bfqd->last_ins_in_burst = jiffies; |
9710 |
++ |
9711 |
++} |
9712 |
++ |
9713 |
++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
9714 |
++{ |
9715 |
++ struct bfq_entity *entity = &bfqq->entity; |
9716 |
++ return entity->budget - entity->service; |
9717 |
++} |
9718 |
++ |
9719 |
++/* |
9720 |
++ * If enough samples have been computed, return the current max budget |
9721 |
++ * stored in bfqd, which is dynamically updated according to the |
9722 |
++ * estimated disk peak rate; otherwise return the default max budget |
9723 |
++ */ |
9724 |
++static int bfq_max_budget(struct bfq_data *bfqd) |
9725 |
++{ |
9726 |
++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) |
9727 |
++ return bfq_default_max_budget; |
9728 |
++ else |
9729 |
++ return bfqd->bfq_max_budget; |
9730 |
++} |
9731 |
++ |
9732 |
++/* |
9733 |
++ * Return min budget, which is a fraction of the current or default |
9734 |
++ * max budget (trying with 1/32) |
9735 |
++ */ |
9736 |
++static int bfq_min_budget(struct bfq_data *bfqd) |
9737 |
++{ |
9738 |
++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) |
9739 |
++ return bfq_default_max_budget / 32; |
9740 |
++ else |
9741 |
++ return bfqd->bfq_max_budget / 32; |
9742 |
++} |
9743 |
++ |
9744 |
++static void bfq_bfqq_expire(struct bfq_data *bfqd, |
9745 |
++ struct bfq_queue *bfqq, |
9746 |
++ bool compensate, |
9747 |
++ enum bfqq_expiration reason); |
9748 |
++ |
9749 |
++/* |
9750 |
++ * The next function, invoked after the input queue bfqq switches from |
9751 |
++ * idle to busy, updates the budget of bfqq. The function also tells |
9752 |
++ * whether the in-service queue should be expired, by returning |
9753 |
++ * true. The purpose of expiring the in-service queue is to give bfqq |
9754 |
++ * the chance to possibly preempt the in-service queue, and the reason |
9755 |
++ * for preempting the in-service queue is to achieve one of the two |
9756 |
++ * goals below. |
9757 |
++ * |
9758 |
++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has |
9759 |
++ * expired because it has remained idle. In particular, bfqq may have |
9760 |
++ * expired for one of the following two reasons: |
9761 |
++ * |
9762 |
++ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and |
9763 |
++ * did not manage to issue a new request before its last request |
9764 |
++ * was served; |
9765 |
++ * |
9766 |
++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue |
9767 |
++ * a new request before the expiration of the idling-time. |
9768 |
++ * |
9769 |
++ * Even if bfqq has expired for one of the above reasons, the process |
9770 |
++ * associated with the queue may be however issuing requests greedily, |
9771 |
++ * and thus be sensitive to the bandwidth it receives (bfqq may have |
9772 |
++ * remained idle for other reasons: CPU high load, bfqq not enjoying |
9773 |
++ * idling, I/O throttling somewhere in the path from the process to |
9774 |
++ * the I/O scheduler, ...). But if, after every expiration for one of |
9775 |
++ * the above two reasons, bfqq has to wait for the service of at least |
9776 |
++ * one full budget of another queue before being served again, then |
9777 |
++ * bfqq is likely to get a much lower bandwidth or resource time than |
9778 |
++ * its reserved ones. To address this issue, two countermeasures need |
9779 |
++ * to be taken. |
9780 |
++ * |
9781 |
++ * First, the budget and the timestamps of bfqq need to be updated in |
9782 |
++ * a special way on bfqq reactivation: they need to be updated as if |
9783 |
++ * bfqq did not remain idle and did not expire. In fact, if they are |
9784 |
++ * computed as if bfqq expired and remained idle until reactivation, |
9785 |
++ * then the process associated with bfqq is treated as if, instead of |
9786 |
++ * being greedy, it stopped issuing requests when bfqq remained idle, |
9787 |
++ * and restarts issuing requests only on this reactivation. In other |
9788 |
++ * words, the scheduler does not help the process recover the "service |
9789 |
++ * hole" between bfqq expiration and reactivation. As a consequence, |
9790 |
++ * the process receives a lower bandwidth than its reserved one. In |
9791 |
++ * contrast, to recover this hole, the budget must be updated as if |
9792 |
++ * bfqq was not expired at all before this reactivation, i.e., it must |
9793 |
++ * be set to the value of the remaining budget when bfqq was |
9794 |
++ * expired. Along the same line, timestamps need to be assigned the |
9795 |
++ * value they had the last time bfqq was selected for service, i.e., |
9796 |
++ * before last expiration. Thus timestamps need to be back-shifted |
9797 |
++ * with respect to their normal computation (see [1] for more details |
9798 |
++ * on this tricky aspect). |
9799 |
++ * |
9800 |
++ * Secondly, to allow the process to recover the hole, the in-service |
9801 |
++ * queue must be expired too, to give bfqq the chance to preempt it |
9802 |
++ * immediately. In fact, if bfqq has to wait for a full budget of the |
9803 |
++ * in-service queue to be completed, then it may become impossible to |
9804 |
++ * let the process recover the hole, even if the back-shifted |
9805 |
++ * timestamps of bfqq are lower than those of the in-service queue. If |
9806 |
++ * this happens for most or all of the holes, then the process may not |
9807 |
++ * receive its reserved bandwidth. In this respect, it is worth noting |
9808 |
++ * that, since the service of outstanding requests is unpreemptible, a |
9809 |
++ * little fraction of the holes may however be unrecoverable, thereby |
9810 |
++ * causing a little loss of bandwidth. |
9811 |
++ * |
9812 |
++ * The last important point is detecting whether bfqq does need this |
9813 |
++ * bandwidth recovery. In this respect, the next function deems the |
9814 |
++ * process associated with bfqq greedy, and thus allows it to recover |
9815 |
++ * the hole, if: 1) the process is waiting for the arrival of a new |
9816 |
++ * request (which implies that bfqq expired for one of the above two |
9817 |
++ * reasons), and 2) such a request has arrived soon. The first |
9818 |
++ * condition is controlled through the flag non_blocking_wait_rq, |
9819 |
++ * while the second through the flag arrived_in_time. If both |
9820 |
++ * conditions hold, then the function computes the budget in the |
9821 |
++ * above-described special way, and signals that the in-service queue |
9822 |
++ * should be expired. Timestamp back-shifting is done later in |
9823 |
++ * __bfq_activate_entity. |
9824 |
++ * |
9825 |
++ * 2. Reduce latency. Even if timestamps are not backshifted to let |
9826 |
++ * the process associated with bfqq recover a service hole, bfqq may |
9827 |
++ * however happen to have, after being (re)activated, a lower finish |
9828 |
++ * timestamp than the in-service queue. That is, the next budget of |
9829 |
++ * bfqq may have to be completed before the one of the in-service |
9830 |
++ * queue. If this is the case, then preempting the in-service queue |
9831 |
++ * allows this goal to be achieved, apart from the unpreemptible, |
9832 |
++ * outstanding requests mentioned above. |
9833 |
++ * |
9834 |
++ * Unfortunately, regardless of which of the above two goals one wants |
9835 |
++ * to achieve, service trees need first to be updated to know whether |
9836 |
++ * the in-service queue must be preempted. To have service trees |
9837 |
++ * correctly updated, the in-service queue must be expired and |
9838 |
++ * rescheduled, and bfqq must be scheduled too. This is one of the |
9839 |
++ * most costly operations (in future versions, the scheduling |
9840 |
++ * mechanism may be re-designed in such a way as to make it possible to |
9841 |
++ * know whether preemption is needed without needing to update service |
9842 |
++ * trees). In addition, queue preemptions almost always cause random |
9843 |
++ * I/O, and thus loss of throughput. Because of these facts, the next |
9844 |
++ * function adopts the following simple scheme to avoid both costly |
9845 |
++ * operations and too frequent preemptions: it requests the expiration |
9846 |
++ * of the in-service queue (unconditionally) only for queues that need |
9847 |
++ * to recover a hole, or that either are weight-raised or deserve to |
9848 |
++ * be weight-raised. |
9849 |
++ */ |
9850 |
++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, |
9851 |
++ struct bfq_queue *bfqq, |
9852 |
++ bool arrived_in_time, |
9853 |
++ bool wr_or_deserves_wr) |
9854 |
++{ |
9855 |
++ struct bfq_entity *entity = &bfqq->entity; |
9856 |
++ |
9857 |
++ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { |
9858 |
++ /* |
9859 |
++ * We do not clear the flag non_blocking_wait_rq here, as |
9860 |
++ * the latter is used in bfq_activate_bfqq to signal |
9861 |
++ * that timestamps need to be back-shifted (and is |
9862 |
++ * cleared right after). |
9863 |
++ */ |
9864 |
++ |
9865 |
++ /* |
9866 |
++ * In the next assignment we rely on the fact that neither |
9867 |
++ * entity->service nor entity->budget is updated |
9868 |
++ * on expiration if bfqq is empty (see |
9869 |
++ * __bfq_bfqq_recalc_budget). Thus both quantities |
9870 |
++ * remain unchanged after such an expiration, and the |
9871 |
++ * following statement therefore assigns to |
9872 |
++ * entity->budget the remaining budget on such an |
9873 |
++ * expiration. For clarity, entity->service is not |
9874 |
++ * updated on expiration in any case, and, in normal |
9875 |
++ * operation, is reset only when bfqq is selected for |
9876 |
++ * service (see bfq_get_next_queue). |
9877 |
++ */ |
9878 |
++ entity->budget = min_t(unsigned long, |
9879 |
++ bfq_bfqq_budget_left(bfqq), |
9880 |
++ bfqq->max_budget); |
9881 |
++ |
9882 |
++ BUG_ON(entity->budget < 0); |
9883 |
++ return true; |
9884 |
++ } |
9885 |
++ |
9886 |
++ entity->budget = max_t(unsigned long, bfqq->max_budget, |
9887 |
++ bfq_serv_to_charge(bfqq->next_rq,bfqq)); |
9888 |
++ BUG_ON(entity->budget < 0); |
9889 |
++ |
9890 |
++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); |
9891 |
++ return wr_or_deserves_wr; |
9892 |
++} |
9893 |
++ |
9894 |
++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, |
9895 |
++ struct bfq_queue *bfqq, |
9896 |
++ unsigned int old_wr_coeff, |
9897 |
++ bool wr_or_deserves_wr, |
9898 |
++ bool interactive, |
9899 |
++ bool in_burst, |
9900 |
++ bool soft_rt) |
9901 |
++{ |
9902 |
++ if (old_wr_coeff == 1 && wr_or_deserves_wr) { |
9903 |
++ /* start a weight-raising period */ |
9904 |
++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; |
9905 |
++ if (interactive) /* update wr duration */ |
9906 |
++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
9907 |
++ else |
9908 |
++ bfqq->wr_cur_max_time = |
9909 |
++ bfqd->bfq_wr_rt_max_time; |
9910 |
++ /* |
9911 |
++ * If needed, further reduce budget to make sure it is |
9912 |
++ * close to bfqq's backlog, so as to reduce the |
9913 |
++ * scheduling-error component due to a too large |
9914 |
++ * budget. Do not care about throughput consequences, |
9915 |
++ * but only about latency. Finally, do not assign a |
9916 |
++ * too small budget either, to avoid increasing |
9917 |
++ * latency by causing too frequent expirations. |
9918 |
++ */ |
9919 |
++ bfqq->entity.budget = min_t(unsigned long, |
9920 |
++ bfqq->entity.budget, |
9921 |
++ 2 * bfq_min_budget(bfqd)); |
9922 |
++ |
9923 |
++ bfq_log_bfqq(bfqd, bfqq, |
9924 |
++ "wrais starting at %lu, rais_max_time %u", |
9925 |
++ jiffies, |
9926 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time)); |
9927 |
++ } else if (old_wr_coeff > 1) { |
9928 |
++ if (interactive) /* update wr duration */ |
9929 |
++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
9930 |
++ else if (in_burst) { |
9931 |
++ bfqq->wr_coeff = 1; |
9932 |
++ bfq_log_bfqq(bfqd, bfqq, |
9933 |
++ "wrais ending at %lu, rais_max_time %u", |
9934 |
++ jiffies, |
9935 |
++ jiffies_to_msecs(bfqq-> |
9936 |
++ wr_cur_max_time)); |
9937 |
++ } else if (time_before( |
9938 |
++ bfqq->last_wr_start_finish + |
9939 |
++ bfqq->wr_cur_max_time, |
9940 |
++ jiffies + |
9941 |
++ bfqd->bfq_wr_rt_max_time) && |
9942 |
++ soft_rt) { |
9943 |
++ /* |
9944 |
++ * The remaining weight-raising time is lower |
9945 |
++ * than bfqd->bfq_wr_rt_max_time, which means |
9946 |
++ * that the application is enjoying weight |
9947 |
++ * raising either because deemed soft-rt in |
9948 |
++ * the near past, or because deemed interactive |
9949 |
++ * long ago. |
9950 |
++ * In both cases, resetting now the current |
9951 |
++ * remaining weight-raising time for the |
9952 |
++ * application to the weight-raising duration |
9953 |
++ * for soft rt applications would not cause any |
9954 |
++ * latency increase for the application (as the |
9955 |
++ * new duration would be higher than the |
9956 |
++ * remaining time). |
9957 |
++ * |
9958 |
++ * In addition, the application is now meeting |
9959 |
++ * the requirements for being deemed soft rt. |
9960 |
++ * In the end we can correctly and safely |
9961 |
++ * (re)charge the weight-raising duration for |
9962 |
++ * the application with the weight-raising |
9963 |
++ * duration for soft rt applications. |
9964 |
++ * |
9965 |
++ * In particular, doing this recharge now, i.e., |
9966 |
++ * before the weight-raising period for the |
9967 |
++ * application finishes, reduces the probability |
9968 |
++ * of the following negative scenario: |
9969 |
++ * 1) the weight of a soft rt application is |
9970 |
++ * raised at startup (as for any newly |
9971 |
++ * created application), |
9972 |
++ * 2) since the application is not interactive, |
9973 |
++ * at a certain time weight-raising is |
9974 |
++ * stopped for the application, |
9975 |
++ * 3) at that time the application happens to |
9976 |
++ * still have pending requests, and hence |
9977 |
++ * is destined to not have a chance to be |
9978 |
++ * deemed soft rt before these requests are |
9979 |
++ * completed (see the comments to the |
9980 |
++ * function bfq_bfqq_softrt_next_start() |
9981 |
++ * for details on soft rt detection), |
9982 |
++ * 4) these pending requests experience a high |
9983 |
++ * latency because the application is not |
9984 |
++ * weight-raised while they are pending. |
9985 |
++ */ |
9986 |
++ bfqq->last_wr_start_finish = jiffies; |
9987 |
++ bfqq->wr_cur_max_time = |
9988 |
++ bfqd->bfq_wr_rt_max_time; |
9989 |
++ bfq_log_bfqq(bfqd, bfqq, |
9990 |
++ "switching to soft_rt wr, or " |
9991 |
++ " just moving forward duration"); |
9992 |
++ } |
9993 |
++ } |
9994 |
++} |
9995 |
++ |
9996 |
++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, |
9997 |
++ struct bfq_queue *bfqq) |
9998 |
++{ |
9999 |
++ return bfqq->dispatched == 0 && |
10000 |
++ time_is_before_jiffies( |
10001 |
++ bfqq->budget_timeout + |
10002 |
++ bfqd->bfq_wr_min_idle_time); |
10003 |
++} |
10004 |
++ |
10005 |
++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, |
10006 |
++ struct bfq_queue *bfqq, |
10007 |
++ int old_wr_coeff, |
10008 |
++ struct request *rq, |
10009 |
++ bool *interactive) |
10010 |
++{ |
10011 |
++ bool soft_rt, in_burst, wr_or_deserves_wr, |
10012 |
++ bfqq_wants_to_preempt, |
10013 |
++ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), |
10014 |
++ /* |
10015 |
++ * See the comments on |
10016 |
++ * bfq_bfqq_update_budg_for_activation for |
10017 |
++ * details on the usage of the next variable. |
10018 |
++ */ |
10019 |
++ arrived_in_time = time_is_after_jiffies( |
10020 |
++ RQ_BIC(rq)->ttime.last_end_request + |
10021 |
++ bfqd->bfq_slice_idle * 3); |
10022 |
++ |
10023 |
++ bfq_log_bfqq(bfqd, bfqq, |
10024 |
++ "bfq_add_request non-busy: " |
10025 |
++ "jiffies %lu, in_time %d, idle_long %d busyw %d " |
10026 |
++ "wr_coeff %u", |
10027 |
++ jiffies, arrived_in_time, |
10028 |
++ idle_for_long_time, |
10029 |
++ bfq_bfqq_non_blocking_wait_rq(bfqq), |
10030 |
++ old_wr_coeff); |
10031 |
++ |
10032 |
++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); |
10033 |
++ |
10034 |
++ BUG_ON(bfqq == bfqd->in_service_queue); |
10035 |
++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, |
10036 |
++ rq->cmd_flags); |
10037 |
++ |
10038 |
++ /* |
10039 |
++ * bfqq deserves to be weight-raised if: |
10040 |
++ * - it is sync, |
10041 |
++ * - it does not belong to a large burst, |
10042 |
++ * - it has been idle for enough time or is soft real-time, |
10043 |
++ * - is linked to a bfq_io_cq (it is not shared in any sense) |
10044 |
++ */ |
10045 |
++ in_burst = bfq_bfqq_in_large_burst(bfqq); |
10046 |
++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && |
10047 |
++ !in_burst && |
10048 |
++ time_is_before_jiffies(bfqq->soft_rt_next_start); |
10049 |
++ *interactive = |
10050 |
++ !in_burst && |
10051 |
++ idle_for_long_time; |
10052 |
++ wr_or_deserves_wr = bfqd->low_latency && |
10053 |
++ (bfqq->wr_coeff > 1 || |
10054 |
++ (bfq_bfqq_sync(bfqq) && |
10055 |
++ bfqq->bic && (*interactive || soft_rt))); |
10056 |
++ |
10057 |
++ bfq_log_bfqq(bfqd, bfqq, |
10058 |
++ "bfq_add_request: " |
10059 |
++ "in_burst %d, " |
10060 |
++ "soft_rt %d (next %lu), inter %d, bic %p", |
10061 |
++ bfq_bfqq_in_large_burst(bfqq), soft_rt, |
10062 |
++ bfqq->soft_rt_next_start, |
10063 |
++ *interactive, |
10064 |
++ bfqq->bic); |
10065 |
++ |
10066 |
++ /* |
10067 |
++ * Using the last flag, update budget and check whether bfqq |
10068 |
++ * may want to preempt the in-service queue. |
10069 |
++ */ |
10070 |
++ bfqq_wants_to_preempt = |
10071 |
++ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, |
10072 |
++ arrived_in_time, |
10073 |
++ wr_or_deserves_wr); |
10074 |
++ |
10075 |
++ /* |
10076 |
++ * If bfqq happened to be activated in a burst, but has been |
10077 |
++ * idle for much more than an interactive queue, then we |
10078 |
++ * assume that, in the overall I/O initiated in the burst, the |
10079 |
++ * I/O associated with bfqq is finished. So bfqq does not need |
10080 |
++ * to be treated as a queue belonging to a burst |
10081 |
++ * anymore. Accordingly, we reset bfqq's in_large_burst flag |
10082 |
++ * if set, and remove bfqq from the burst list if it's |
10083 |
++ * there. We do not decrement burst_size, because the fact |
10084 |
++ * that bfqq does not need to belong to the burst list any |
10085 |
++ * more does not invalidate the fact that bfqq was created in |
10086 |
++ * a burst. |
10087 |
++ */ |
10088 |
++ if (likely(!bfq_bfqq_just_created(bfqq)) && |
10089 |
++ idle_for_long_time && |
10090 |
++ time_is_before_jiffies( |
10091 |
++ bfqq->budget_timeout + |
10092 |
++ msecs_to_jiffies(10000))) { |
10093 |
++ hlist_del_init(&bfqq->burst_list_node); |
10094 |
++ bfq_clear_bfqq_in_large_burst(bfqq); |
10095 |
++ } |
10096 |
++ |
10097 |
++ bfq_clear_bfqq_just_created(bfqq); |
10098 |
++ |
10099 |
++ if (!bfq_bfqq_IO_bound(bfqq)) { |
10100 |
++ if (arrived_in_time) { |
10101 |
++ bfqq->requests_within_timer++; |
10102 |
++ if (bfqq->requests_within_timer >= |
10103 |
++ bfqd->bfq_requests_within_timer) |
10104 |
++ bfq_mark_bfqq_IO_bound(bfqq); |
10105 |
++ } else |
10106 |
++ bfqq->requests_within_timer = 0; |
10107 |
++ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", |
10108 |
++ bfqq->requests_within_timer); |
10109 |
++ } |
10110 |
++ |
10111 |
++ if (bfqd->low_latency) { |
10112 |
++ if (unlikely(time_is_after_jiffies(bfqq->split_time))) |
10113 |
++ /* wraparound */ |
10114 |
++ bfqq->split_time = |
10115 |
++ jiffies - bfqd->bfq_wr_min_idle_time - 1; |
10116 |
++ |
10117 |
++ if (time_is_before_jiffies(bfqq->split_time + |
10118 |
++ bfqd->bfq_wr_min_idle_time)) { |
10119 |
++ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, |
10120 |
++ old_wr_coeff, |
10121 |
++ wr_or_deserves_wr, |
10122 |
++ *interactive, |
10123 |
++ in_burst, |
10124 |
++ soft_rt); |
10125 |
++ |
10126 |
++ if (old_wr_coeff != bfqq->wr_coeff) |
10127 |
++ bfqq->entity.prio_changed = 1; |
10128 |
++ } |
10129 |
++ } |
10130 |
++ |
10131 |
++ bfqq->last_idle_bklogged = jiffies; |
10132 |
++ bfqq->service_from_backlogged = 0; |
10133 |
++ bfq_clear_bfqq_softrt_update(bfqq); |
10134 |
++ |
10135 |
++ bfq_add_bfqq_busy(bfqd, bfqq); |
10136 |
++ |
10137 |
++ /* |
10138 |
++ * Expire in-service queue only if preemption may be needed |
10139 |
++ * for guarantees. In this respect, the function |
10140 |
++ * next_queue_may_preempt just checks a simple, necessary |
10141 |
++ * condition, and not a sufficient condition based on |
10142 |
++ * timestamps. In fact, for the latter condition to be |
10143 |
++ * evaluated, timestamps would need first to be updated, and |
10144 |
++ * this operation is quite costly (see the comments on the |
10145 |
++ * function bfq_bfqq_update_budg_for_activation). |
10146 |
++ */ |
10147 |
++ if (bfqd->in_service_queue && bfqq_wants_to_preempt && |
10148 |
++ bfqd->in_service_queue->wr_coeff == 1 && |
10149 |
++ next_queue_may_preempt(bfqd)) { |
10150 |
++ struct bfq_queue *in_serv = |
10151 |
++ bfqd->in_service_queue; |
10152 |
++ BUG_ON(in_serv == bfqq); |
10153 |
++ |
10154 |
++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, |
10155 |
++ false, BFQ_BFQQ_PREEMPTED); |
10156 |
++ BUG_ON(in_serv->entity.budget < 0); |
10157 |
++ } |
10158 |
+ } |
10159 |
+ |
10160 |
+ static void bfq_add_request(struct request *rq) |
10161 |
+ { |
10162 |
+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
10163 |
+- struct bfq_entity *entity = &bfqq->entity; |
10164 |
+ struct bfq_data *bfqd = bfqq->bfqd; |
10165 |
+ struct request *next_rq, *prev; |
10166 |
+- unsigned long old_wr_coeff = bfqq->wr_coeff; |
10167 |
++ unsigned int old_wr_coeff = bfqq->wr_coeff; |
10168 |
+ bool interactive = false; |
10169 |
+ |
10170 |
+- bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); |
10171 |
++ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", |
10172 |
++ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); |
10173 |
++ |
10174 |
++ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ |
10175 |
++ bfq_log_bfqq(bfqd, bfqq, |
10176 |
++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", |
10177 |
++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), |
10178 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time), |
10179 |
++ bfqq->wr_coeff, |
10180 |
++ bfqq->entity.weight, bfqq->entity.orig_weight); |
10181 |
++ |
10182 |
+ bfqq->queued[rq_is_sync(rq)]++; |
10183 |
+ bfqd->queued++; |
10184 |
+ |
10185 |
+ elv_rb_add(&bfqq->sort_list, rq); |
10186 |
+ |
10187 |
+ /* |
10188 |
+- * Check if this request is a better next-serve candidate. |
10189 |
++ * Check if this request is a better next-to-serve candidate. |
10190 |
+ */ |
10191 |
+ prev = bfqq->next_rq; |
10192 |
+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); |
10193 |
+@@ -887,160 +1373,10 @@ static void bfq_add_request(struct request *rq) |
10194 |
+ if (prev != bfqq->next_rq) |
10195 |
+ bfq_pos_tree_add_move(bfqd, bfqq); |
10196 |
+ |
10197 |
+- if (!bfq_bfqq_busy(bfqq)) { |
10198 |
+- bool soft_rt, coop_or_in_burst, |
10199 |
+- idle_for_long_time = time_is_before_jiffies( |
10200 |
+- bfqq->budget_timeout + |
10201 |
+- bfqd->bfq_wr_min_idle_time); |
10202 |
+- |
10203 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
10204 |
+- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, |
10205 |
+- rq->cmd_flags); |
10206 |
+-#endif |
10207 |
+- if (bfq_bfqq_sync(bfqq)) { |
10208 |
+- bool already_in_burst = |
10209 |
+- !hlist_unhashed(&bfqq->burst_list_node) || |
10210 |
+- bfq_bfqq_in_large_burst(bfqq); |
10211 |
+- bfq_handle_burst(bfqd, bfqq, idle_for_long_time); |
10212 |
+- /* |
10213 |
+- * If bfqq was not already in the current burst, |
10214 |
+- * then, at this point, bfqq either has been |
10215 |
+- * added to the current burst or has caused the |
10216 |
+- * current burst to terminate. In particular, in |
10217 |
+- * the second case, bfqq has become the first |
10218 |
+- * queue in a possible new burst. |
10219 |
+- * In both cases last_ins_in_burst needs to be |
10220 |
+- * moved forward. |
10221 |
+- */ |
10222 |
+- if (!already_in_burst) |
10223 |
+- bfqd->last_ins_in_burst = jiffies; |
10224 |
+- } |
10225 |
+- |
10226 |
+- coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || |
10227 |
+- bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; |
10228 |
+- soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && |
10229 |
+- !coop_or_in_burst && |
10230 |
+- time_is_before_jiffies(bfqq->soft_rt_next_start); |
10231 |
+- interactive = !coop_or_in_burst && idle_for_long_time; |
10232 |
+- entity->budget = max_t(unsigned long, bfqq->max_budget, |
10233 |
+- bfq_serv_to_charge(next_rq, bfqq)); |
10234 |
+- |
10235 |
+- if (!bfq_bfqq_IO_bound(bfqq)) { |
10236 |
+- if (time_before(jiffies, |
10237 |
+- RQ_BIC(rq)->ttime.last_end_request + |
10238 |
+- bfqd->bfq_slice_idle)) { |
10239 |
+- bfqq->requests_within_timer++; |
10240 |
+- if (bfqq->requests_within_timer >= |
10241 |
+- bfqd->bfq_requests_within_timer) |
10242 |
+- bfq_mark_bfqq_IO_bound(bfqq); |
10243 |
+- } else |
10244 |
+- bfqq->requests_within_timer = 0; |
10245 |
+- } |
10246 |
+- |
10247 |
+- if (!bfqd->low_latency) |
10248 |
+- goto add_bfqq_busy; |
10249 |
+- |
10250 |
+- if (bfq_bfqq_just_split(bfqq)) |
10251 |
+- goto set_prio_changed; |
10252 |
+- |
10253 |
+- /* |
10254 |
+- * If the queue: |
10255 |
+- * - is not being boosted, |
10256 |
+- * - has been idle for enough time, |
10257 |
+- * - is not a sync queue or is linked to a bfq_io_cq (it is |
10258 |
+- * shared "for its nature" or it is not shared and its |
10259 |
+- * requests have not been redirected to a shared queue) |
10260 |
+- * start a weight-raising period. |
10261 |
+- */ |
10262 |
+- if (old_wr_coeff == 1 && (interactive || soft_rt) && |
10263 |
+- (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { |
10264 |
+- bfqq->wr_coeff = bfqd->bfq_wr_coeff; |
10265 |
+- if (interactive) |
10266 |
+- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
10267 |
+- else |
10268 |
+- bfqq->wr_cur_max_time = |
10269 |
+- bfqd->bfq_wr_rt_max_time; |
10270 |
+- bfq_log_bfqq(bfqd, bfqq, |
10271 |
+- "wrais starting at %lu, rais_max_time %u", |
10272 |
+- jiffies, |
10273 |
+- jiffies_to_msecs(bfqq->wr_cur_max_time)); |
10274 |
+- } else if (old_wr_coeff > 1) { |
10275 |
+- if (interactive) |
10276 |
+- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
10277 |
+- else if (coop_or_in_burst || |
10278 |
+- (bfqq->wr_cur_max_time == |
10279 |
+- bfqd->bfq_wr_rt_max_time && |
10280 |
+- !soft_rt)) { |
10281 |
+- bfqq->wr_coeff = 1; |
10282 |
+- bfq_log_bfqq(bfqd, bfqq, |
10283 |
+- "wrais ending at %lu, rais_max_time %u", |
10284 |
+- jiffies, |
10285 |
+- jiffies_to_msecs(bfqq-> |
10286 |
+- wr_cur_max_time)); |
10287 |
+- } else if (time_before( |
10288 |
+- bfqq->last_wr_start_finish + |
10289 |
+- bfqq->wr_cur_max_time, |
10290 |
+- jiffies + |
10291 |
+- bfqd->bfq_wr_rt_max_time) && |
10292 |
+- soft_rt) { |
10293 |
+- /* |
10294 |
+- * |
10295 |
+- * The remaining weight-raising time is lower |
10296 |
+- * than bfqd->bfq_wr_rt_max_time, which means |
10297 |
+- * that the application is enjoying weight |
10298 |
+- * raising either because deemed soft-rt in |
10299 |
+- * the near past, or because deemed interactive |
10300 |
+- * a long ago. |
10301 |
+- * In both cases, resetting now the current |
10302 |
+- * remaining weight-raising time for the |
10303 |
+- * application to the weight-raising duration |
10304 |
+- * for soft rt applications would not cause any |
10305 |
+- * latency increase for the application (as the |
10306 |
+- * new duration would be higher than the |
10307 |
+- * remaining time). |
10308 |
+- * |
10309 |
+- * In addition, the application is now meeting |
10310 |
+- * the requirements for being deemed soft rt. |
10311 |
+- * In the end we can correctly and safely |
10312 |
+- * (re)charge the weight-raising duration for |
10313 |
+- * the application with the weight-raising |
10314 |
+- * duration for soft rt applications. |
10315 |
+- * |
10316 |
+- * In particular, doing this recharge now, i.e., |
10317 |
+- * before the weight-raising period for the |
10318 |
+- * application finishes, reduces the probability |
10319 |
+- * of the following negative scenario: |
10320 |
+- * 1) the weight of a soft rt application is |
10321 |
+- * raised at startup (as for any newly |
10322 |
+- * created application), |
10323 |
+- * 2) since the application is not interactive, |
10324 |
+- * at a certain time weight-raising is |
10325 |
+- * stopped for the application, |
10326 |
+- * 3) at that time the application happens to |
10327 |
+- * still have pending requests, and hence |
10328 |
+- * is destined to not have a chance to be |
10329 |
+- * deemed soft rt before these requests are |
10330 |
+- * completed (see the comments to the |
10331 |
+- * function bfq_bfqq_softrt_next_start() |
10332 |
+- * for details on soft rt detection), |
10333 |
+- * 4) these pending requests experience a high |
10334 |
+- * latency because the application is not |
10335 |
+- * weight-raised while they are pending. |
10336 |
+- */ |
10337 |
+- bfqq->last_wr_start_finish = jiffies; |
10338 |
+- bfqq->wr_cur_max_time = |
10339 |
+- bfqd->bfq_wr_rt_max_time; |
10340 |
+- } |
10341 |
+- } |
10342 |
+-set_prio_changed: |
10343 |
+- if (old_wr_coeff != bfqq->wr_coeff) |
10344 |
+- entity->prio_changed = 1; |
10345 |
+-add_bfqq_busy: |
10346 |
+- bfqq->last_idle_bklogged = jiffies; |
10347 |
+- bfqq->service_from_backlogged = 0; |
10348 |
+- bfq_clear_bfqq_softrt_update(bfqq); |
10349 |
+- bfq_add_bfqq_busy(bfqd, bfqq); |
10350 |
+- } else { |
10351 |
++ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ |
10352 |
++ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, |
10353 |
++ rq, &interactive); |
10354 |
++ else { |
10355 |
+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && |
10356 |
+ time_is_before_jiffies( |
10357 |
+ bfqq->last_wr_start_finish + |
10358 |
+@@ -1049,16 +1385,43 @@ add_bfqq_busy: |
10359 |
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); |
10360 |
+ |
10361 |
+ bfqd->wr_busy_queues++; |
10362 |
+- entity->prio_changed = 1; |
10363 |
++ bfqq->entity.prio_changed = 1; |
10364 |
+ bfq_log_bfqq(bfqd, bfqq, |
10365 |
+- "non-idle wrais starting at %lu, rais_max_time %u", |
10366 |
+- jiffies, |
10367 |
+- jiffies_to_msecs(bfqq->wr_cur_max_time)); |
10368 |
++ "non-idle wrais starting, " |
10369 |
++ "wr_max_time %u wr_busy %d", |
10370 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time), |
10371 |
++ bfqd->wr_busy_queues); |
10372 |
+ } |
10373 |
+ if (prev != bfqq->next_rq) |
10374 |
+ bfq_updated_next_req(bfqd, bfqq); |
10375 |
+ } |
10376 |
+ |
10377 |
++ /* |
10378 |
++ * Assign jiffies to last_wr_start_finish in the following |
10379 |
++ * cases: |
10380 |
++ * |
10381 |
++ * . if bfqq is not going to be weight-raised, because, for |
10382 |
++ * non weight-raised queues, last_wr_start_finish stores the |
10383 |
++ * arrival time of the last request; as of now, this piece |
10384 |
++ * of information is used only for deciding whether to |
10385 |
++ * weight-raise async queues |
10386 |
++ * |
10387 |
++ * . if bfqq is not weight-raised, because, if bfqq is now |
10388 |
++ * switching to weight-raised, then last_wr_start_finish |
10389 |
++ * stores the time when weight-raising starts |
10390 |
++ * |
10391 |
++ * . if bfqq is interactive, because, regardless of whether |
10392 |
++ * bfqq is currently weight-raised, the weight-raising |
10393 |
++ * period must start or restart (this case is considered |
10394 |
++ * separately because it is not detected by the above |
10395 |
++ * conditions, if bfqq is already weight-raised) |
10396 |
++ * |
10397 |
++ * last_wr_start_finish has to be updated also if bfqq is soft |
10398 |
++ * real-time, because the weight-raising period is constantly |
10399 |
++ * restarted on idle-to-busy transitions for these queues, but |
10400 |
++ * this is already done in bfq_bfqq_handle_idle_busy_switch if |
10401 |
++ * needed. |
10402 |
++ */ |
10403 |
+ if (bfqd->low_latency && |
10404 |
+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) |
10405 |
+ bfqq->last_wr_start_finish = jiffies; |
10406 |
+@@ -1106,6 +1469,9 @@ static void bfq_remove_request(struct request *rq) |
10407 |
+ struct bfq_data *bfqd = bfqq->bfqd; |
10408 |
+ const int sync = rq_is_sync(rq); |
10409 |
+ |
10410 |
++ BUG_ON(bfqq->entity.service > bfqq->entity.budget && |
10411 |
++ bfqq == bfqd->in_service_queue); |
10412 |
++ |
10413 |
+ if (bfqq->next_rq == rq) { |
10414 |
+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); |
10415 |
+ bfq_updated_next_req(bfqd, bfqq); |
10416 |
+@@ -1119,8 +1485,25 @@ static void bfq_remove_request(struct request *rq) |
10417 |
+ elv_rb_del(&bfqq->sort_list, rq); |
10418 |
+ |
10419 |
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
10420 |
+- if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) |
10421 |
++ BUG_ON(bfqq->entity.budget < 0); |
10422 |
++ |
10423 |
++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { |
10424 |
+ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
10425 |
++ |
10426 |
++ /* bfqq emptied. In normal operation, when |
10427 |
++ * bfqq is empty, bfqq->entity.service and |
10428 |
++ * bfqq->entity.budget must contain, |
10429 |
++ * respectively, the service received and the |
10430 |
++ * budget used last time bfqq emptied. These |
10431 |
++ * facts do not hold in this case, as at least |
10432 |
++ * this last removal occurred while bfqq is |
10433 |
++ * not in service. To avoid inconsistencies, |
10434 |
++ * reset both bfqq->entity.service and |
10435 |
++ * bfqq->entity.budget. |
10436 |
++ */ |
10437 |
++ bfqq->entity.budget = bfqq->entity.service = 0; |
10438 |
++ } |
10439 |
++ |
10440 |
+ /* |
10441 |
+ * Remove queue from request-position tree as it is empty. |
10442 |
+ */ |
10443 |
+@@ -1134,9 +1517,7 @@ static void bfq_remove_request(struct request *rq) |
10444 |
+ BUG_ON(bfqq->meta_pending == 0); |
10445 |
+ bfqq->meta_pending--; |
10446 |
+ } |
10447 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
10448 |
+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); |
10449 |
+-#endif |
10450 |
+ } |
10451 |
+ |
10452 |
+ static int bfq_merge(struct request_queue *q, struct request **req, |
10453 |
+@@ -1221,21 +1602,25 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, |
10454 |
+ bfqq->next_rq = rq; |
10455 |
+ |
10456 |
+ bfq_remove_request(next); |
10457 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
10458 |
+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); |
10459 |
+-#endif |
10460 |
+ } |
10461 |
+ |
10462 |
+ /* Must be called with bfqq != NULL */ |
10463 |
+ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) |
10464 |
+ { |
10465 |
+ BUG_ON(!bfqq); |
10466 |
++ |
10467 |
+ if (bfq_bfqq_busy(bfqq)) |
10468 |
+ bfqq->bfqd->wr_busy_queues--; |
10469 |
+ bfqq->wr_coeff = 1; |
10470 |
+ bfqq->wr_cur_max_time = 0; |
10471 |
+- /* Trigger a weight change on the next activation of the queue */ |
10472 |
++ /* |
10473 |
++ * Trigger a weight change on the next invocation of |
10474 |
++ * __bfq_entity_update_weight_prio. |
10475 |
++ */ |
10476 |
+ bfqq->entity.prio_changed = 1; |
10477 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", |
10478 |
++ bfqq->bfqd->wr_busy_queues); |
10479 |
+ } |
10480 |
+ |
10481 |
+ static void bfq_end_wr_async_queues(struct bfq_data *bfqd, |
10482 |
+@@ -1278,7 +1663,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request, |
10483 |
+ sector_t sector) |
10484 |
+ { |
10485 |
+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= |
10486 |
+- BFQQ_SEEK_THR; |
10487 |
++ BFQQ_CLOSE_THR; |
10488 |
+ } |
10489 |
+ |
10490 |
+ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, |
10491 |
+@@ -1400,7 +1785,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
10492 |
+ * throughput. |
10493 |
+ */ |
10494 |
+ bfqq->new_bfqq = new_bfqq; |
10495 |
+- atomic_add(process_refs, &new_bfqq->ref); |
10496 |
++ new_bfqq->ref += process_refs; |
10497 |
+ return new_bfqq; |
10498 |
+ } |
10499 |
+ |
10500 |
+@@ -1431,9 +1816,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, |
10501 |
+ } |
10502 |
+ |
10503 |
+ /* |
10504 |
+- * Attempt to schedule a merge of bfqq with the currently in-service queue |
10505 |
+- * or with a close queue among the scheduled queues. |
10506 |
+- * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue |
10507 |
++ * If this function returns true, then bfqq cannot be merged. The idea |
10508 |
++ * is that true cooperation happens very early after processes start |
10509 |
++ * to do I/O. Usually, late cooperations are just accidental false |
10510 |
++ * positives. In case bfqq is weight-raised, such false positives |
10511 |
++ * would evidently degrade latency guarantees for bfqq. |
10512 |
++ */ |
10513 |
++bool wr_from_too_long(struct bfq_queue *bfqq) |
10514 |
++{ |
10515 |
++ return bfqq->wr_coeff > 1 && |
10516 |
++ time_is_before_jiffies(bfqq->last_wr_start_finish + |
10517 |
++ msecs_to_jiffies(100)); |
10518 |
++} |
10519 |
++ |
10520 |
++/* |
10521 |
++ * Attempt to schedule a merge of bfqq with the currently in-service |
10522 |
++ * queue or with a close queue among the scheduled queues. Return |
10523 |
++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue |
10524 |
+ * structure otherwise. |
10525 |
+ * |
10526 |
+ * The OOM queue is not allowed to participate to cooperation: in fact, since |
10527 |
+@@ -1442,6 +1841,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, |
10528 |
+ * handle merging with the OOM queue would be quite complex and expensive |
10529 |
+ * to maintain. Besides, in such a critical condition as an out of memory, |
10530 |
+ * the benefits of queue merging may be little relevant, or even negligible. |
10531 |
++ * |
10532 |
++ * Weight-raised queues can be merged only if their weight-raising |
10533 |
++ * period has just started. In fact cooperating processes are usually |
10534 |
++ * started together. Thus, with this filter we avoid false positives |
10535 |
++ * that would jeopardize low-latency guarantees. |
10536 |
++ * |
10537 |
++ * WARNING: queue merging may impair fairness among non-weight raised |
10538 |
++ * queues, for at least two reasons: 1) the original weight of a |
10539 |
++ * merged queue may change during the merged state, 2) even if the |
10539 |
++ * weight stays the same, a merged queue may be bloated with many more |
10541 |
++ * requests than the ones produced by its originally-associated |
10542 |
++ * process. |
10543 |
+ */ |
10544 |
+ static struct bfq_queue * |
10545 |
+ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
10546 |
+@@ -1451,16 +1862,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
10547 |
+ |
10548 |
+ if (bfqq->new_bfqq) |
10549 |
+ return bfqq->new_bfqq; |
10550 |
+- if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) |
10551 |
++ |
10552 |
++ if (io_struct && wr_from_too_long(bfqq) && |
10553 |
++ likely(bfqq != &bfqd->oom_bfqq)) |
10554 |
++ bfq_log_bfqq(bfqd, bfqq, |
10555 |
++ "would have looked for coop, but bfq%d wr", |
10556 |
++ bfqq->pid); |
10557 |
++ |
10558 |
++ if (!io_struct || |
10559 |
++ wr_from_too_long(bfqq) || |
10560 |
++ unlikely(bfqq == &bfqd->oom_bfqq)) |
10561 |
+ return NULL; |
10562 |
+- /* If device has only one backlogged bfq_queue, don't search. */ |
10563 |
++ |
10564 |
++ /* If there is only one backlogged queue, don't search. */ |
10565 |
+ if (bfqd->busy_queues == 1) |
10566 |
+ return NULL; |
10567 |
+ |
10568 |
+ in_service_bfqq = bfqd->in_service_queue; |
10569 |
+ |
10570 |
++ if (in_service_bfqq && in_service_bfqq != bfqq && |
10571 |
++ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) |
10572 |
++ && likely(in_service_bfqq == &bfqd->oom_bfqq)) |
10573 |
++ bfq_log_bfqq(bfqd, bfqq, |
10574 |
++ "would have tried merge with in-service-queue, but wr"); |
10575 |
++ |
10576 |
+ if (!in_service_bfqq || in_service_bfqq == bfqq || |
10577 |
+- !bfqd->in_service_bic || |
10578 |
++ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || |
10579 |
+ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) |
10580 |
+ goto check_scheduled; |
10581 |
+ |
10582 |
+@@ -1482,7 +1909,15 @@ check_scheduled: |
10583 |
+ |
10584 |
+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); |
10585 |
+ |
10586 |
+- if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && |
10587 |
++ if (new_bfqq && wr_from_too_long(new_bfqq) && |
10588 |
++ likely(new_bfqq != &bfqd->oom_bfqq) && |
10589 |
++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) |
10590 |
++ bfq_log_bfqq(bfqd, bfqq, |
10591 |
++ "would have merged with bfq%d, but wr", |
10592 |
++ new_bfqq->pid); |
10593 |
++ |
10594 |
++ if (new_bfqq && !wr_from_too_long(new_bfqq) && |
10595 |
++ likely(new_bfqq != &bfqd->oom_bfqq) && |
10596 |
+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) |
10597 |
+ return bfq_setup_merge(bfqq, new_bfqq); |
10598 |
+ |
10599 |
+@@ -1498,46 +1933,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) |
10600 |
+ */ |
10601 |
+ if (!bfqq->bic) |
10602 |
+ return; |
10603 |
+- if (bfqq->bic->wr_time_left) |
10604 |
+- /* |
10605 |
+- * This is the queue of a just-started process, and would |
10606 |
+- * deserve weight raising: we set wr_time_left to the full |
10607 |
+- * weight-raising duration to trigger weight-raising when |
10608 |
+- * and if the queue is split and the first request of the |
10609 |
+- * queue is enqueued. |
10610 |
+- */ |
10611 |
+- bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); |
10612 |
+- else if (bfqq->wr_coeff > 1) { |
10613 |
+- unsigned long wr_duration = |
10614 |
+- jiffies - bfqq->last_wr_start_finish; |
10615 |
+- /* |
10616 |
+- * It may happen that a queue's weight raising period lasts |
10617 |
+- * longer than its wr_cur_max_time, as weight raising is |
10618 |
+- * handled only when a request is enqueued or dispatched (it |
10619 |
+- * does not use any timer). If the weight raising period is |
10620 |
+- * about to end, don't save it. |
10621 |
+- */ |
10622 |
+- if (bfqq->wr_cur_max_time <= wr_duration) |
10623 |
+- bfqq->bic->wr_time_left = 0; |
10624 |
+- else |
10625 |
+- bfqq->bic->wr_time_left = |
10626 |
+- bfqq->wr_cur_max_time - wr_duration; |
10627 |
+- /* |
10628 |
+- * The bfq_queue is becoming shared or the requests of the |
10629 |
+- * process owning the queue are being redirected to a shared |
10630 |
+- * queue. Stop the weight raising period of the queue, as in |
10631 |
+- * both cases it should not be owned by an interactive or |
10632 |
+- * soft real-time application. |
10633 |
+- */ |
10634 |
+- bfq_bfqq_end_wr(bfqq); |
10635 |
+- } else |
10636 |
+- bfqq->bic->wr_time_left = 0; |
10637 |
++ |
10638 |
+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); |
10639 |
+ bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); |
10640 |
+ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); |
10641 |
+ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); |
10642 |
+- bfqq->bic->cooperations++; |
10643 |
+- bfqq->bic->failed_cooperations = 0; |
10644 |
+ } |
10645 |
+ |
10646 |
+ static void bfq_get_bic_reference(struct bfq_queue *bfqq) |
10647 |
+@@ -1562,6 +1962,40 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
10648 |
+ if (bfq_bfqq_IO_bound(bfqq)) |
10649 |
+ bfq_mark_bfqq_IO_bound(new_bfqq); |
10650 |
+ bfq_clear_bfqq_IO_bound(bfqq); |
10651 |
++ |
10652 |
++ /* |
10653 |
++ * If bfqq is weight-raised, then let new_bfqq inherit |
10654 |
++ * weight-raising. To reduce false positives, neglect the case |
10655 |
++ * where bfqq has just been created, but has not yet made it |
10656 |
++ * to be weight-raised (which may happen because EQM may merge |
10657 |
++ * bfqq even before bfq_add_request is executed for the first |
10658 |
++ * time for bfqq). Handling this case would however be very |
10659 |
++ * easy, thanks to the flag just_created. |
10660 |
++ */ |
10661 |
++ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { |
10662 |
++ new_bfqq->wr_coeff = bfqq->wr_coeff; |
10663 |
++ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; |
10664 |
++ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; |
10665 |
++ if (bfq_bfqq_busy(new_bfqq)) |
10666 |
++ bfqd->wr_busy_queues++; |
10667 |
++ new_bfqq->entity.prio_changed = 1; |
10668 |
++ bfq_log_bfqq(bfqd, new_bfqq, |
10669 |
++ "wr starting after merge with %d, " |
10670 |
++ "rais_max_time %u", |
10671 |
++ bfqq->pid, |
10672 |
++ jiffies_to_msecs(bfqq->wr_cur_max_time)); |
10673 |
++ } |
10674 |
++ |
10675 |
++ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ |
10676 |
++ bfqq->wr_coeff = 1; |
10677 |
++ bfqq->entity.prio_changed = 1; |
10678 |
++ if (bfq_bfqq_busy(bfqq)) |
10679 |
++ bfqd->wr_busy_queues--; |
10680 |
++ } |
10681 |
++ |
10682 |
++ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", |
10683 |
++ bfqd->wr_busy_queues); |
10684 |
++ |
10685 |
+ /* |
10686 |
+ * Grab a reference to the bic, to prevent it from being destroyed |
10687 |
+ * before being possibly touched by a bfq_split_bfqq(). |
10688 |
+@@ -1588,18 +2022,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
10689 |
+ bfq_put_queue(bfqq); |
10690 |
+ } |
10691 |
+ |
10692 |
+-static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) |
10693 |
+-{ |
10694 |
+- struct bfq_io_cq *bic = bfqq->bic; |
10695 |
+- struct bfq_data *bfqd = bfqq->bfqd; |
10696 |
+- |
10697 |
+- if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { |
10698 |
+- bic->failed_cooperations++; |
10699 |
+- if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) |
10700 |
+- bic->cooperations = 0; |
10701 |
+- } |
10702 |
+-} |
10703 |
+- |
10704 |
+ static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
10705 |
+ struct bio *bio) |
10706 |
+ { |
10707 |
+@@ -1637,30 +2059,86 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
10708 |
+ * to decide whether bio and rq can be merged. |
10709 |
+ */ |
10710 |
+ bfqq = new_bfqq; |
10711 |
+- } else |
10712 |
+- bfq_bfqq_increase_failed_cooperations(bfqq); |
10713 |
++ } |
10714 |
+ } |
10715 |
+ |
10716 |
+ return bfqq == RQ_BFQQ(rq); |
10717 |
+ } |
10718 |
+ |
10719 |
++/* |
10720 |
++ * Set the maximum time for the in-service queue to consume its |
10721 |
++ * budget. This prevents seeky processes from lowering the throughput. |
10722 |
++ * In practice, a time-slice service scheme is used with seeky |
10723 |
++ * processes. |
10724 |
++ */ |
10725 |
++static void bfq_set_budget_timeout(struct bfq_data *bfqd, |
10726 |
++ struct bfq_queue *bfqq) |
10727 |
++{ |
10728 |
++ unsigned int timeout_coeff; |
10729 |
++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) |
10730 |
++ timeout_coeff = 1; |
10731 |
++ else |
10732 |
++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; |
10733 |
++ |
10734 |
++ bfqd->last_budget_start = ktime_get(); |
10735 |
++ |
10736 |
++ bfqq->budget_timeout = jiffies + |
10737 |
++ bfqd->bfq_timeout * timeout_coeff; |
10738 |
++ |
10739 |
++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", |
10740 |
++ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); |
10741 |
++} |
10742 |
++ |
10743 |
+ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
10744 |
+ struct bfq_queue *bfqq) |
10745 |
+ { |
10746 |
+ if (bfqq) { |
10747 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
10748 |
+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); |
10749 |
+-#endif |
10750 |
+ bfq_mark_bfqq_must_alloc(bfqq); |
10751 |
+- bfq_mark_bfqq_budget_new(bfqq); |
10752 |
+ bfq_clear_bfqq_fifo_expire(bfqq); |
10753 |
+ |
10754 |
+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
10755 |
+ |
10756 |
++ BUG_ON(bfqq == bfqd->in_service_queue); |
10757 |
++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); |
10758 |
++ |
10759 |
++ if (bfqq->wr_coeff > 1 && |
10760 |
++ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && |
10761 |
++ time_is_before_jiffies(bfqq->budget_timeout)) { |
10762 |
++ /* |
10763 |
++ * For soft real-time queues, move the start |
10764 |
++ * of the weight-raising period forward by the |
10765 |
++ * time the queue has not received any |
10766 |
++ * service. Otherwise, a relatively long |
10767 |
++ * service delay is likely to cause the |
10768 |
++ * weight-raising period of the queue to end, |
10769 |
++ * because of the short duration of the |
10770 |
++ * weight-raising period of a soft real-time |
10771 |
++ * queue. It is worth noting that this move |
10772 |
++ * is not so dangerous for the other queues, |
10773 |
++ * because soft real-time queues are not |
10774 |
++ * greedy. |
10775 |
++ * |
10776 |
++ * To not add a further variable, we use the |
10777 |
++ * overloaded field budget_timeout to |
10778 |
++ * determine for how long the queue has not |
10779 |
++ * received service, i.e., how much time has |
10780 |
++ * elapsed since the queue expired. However, |
10781 |
++ * this is a little imprecise, because |
10782 |
++ * budget_timeout is set to jiffies if bfqq |
10783 |
++ * not only expires, but also remains with no |
10784 |
++ * request. |
10785 |
++ */ |
10786 |
++ bfqq->last_wr_start_finish += jiffies - |
10787 |
++ bfqq->budget_timeout; |
10788 |
++ } |
10789 |
++ |
10790 |
++ bfq_set_budget_timeout(bfqd, bfqq); |
10791 |
+ bfq_log_bfqq(bfqd, bfqq, |
10792 |
+ "set_in_service_queue, cur-budget = %d", |
10793 |
+ bfqq->entity.budget); |
10794 |
+- } |
10795 |
++ } else |
10796 |
++ bfq_log(bfqd, "set_in_service_queue: NULL"); |
10797 |
+ |
10798 |
+ bfqd->in_service_queue = bfqq; |
10799 |
+ } |
10800 |
+@@ -1676,31 +2154,6 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) |
10801 |
+ return bfqq; |
10802 |
+ } |
10803 |
+ |
10804 |
+-/* |
10805 |
+- * If enough samples have been computed, return the current max budget |
10806 |
+- * stored in bfqd, which is dynamically updated according to the |
10807 |
+- * estimated disk peak rate; otherwise return the default max budget |
10808 |
+- */ |
10809 |
+-static int bfq_max_budget(struct bfq_data *bfqd) |
10810 |
+-{ |
10811 |
+- if (bfqd->budgets_assigned < bfq_stats_min_budgets) |
10812 |
+- return bfq_default_max_budget; |
10813 |
+- else |
10814 |
+- return bfqd->bfq_max_budget; |
10815 |
+-} |
10816 |
+- |
10817 |
+-/* |
10818 |
+- * Return min budget, which is a fraction of the current or default |
10819 |
+- * max budget (trying with 1/32) |
10820 |
+- */ |
10821 |
+-static int bfq_min_budget(struct bfq_data *bfqd) |
10822 |
+-{ |
10823 |
+- if (bfqd->budgets_assigned < bfq_stats_min_budgets) |
10824 |
+- return bfq_default_max_budget / 32; |
10825 |
+- else |
10826 |
+- return bfqd->bfq_max_budget / 32; |
10827 |
+-} |
10828 |
+- |
10829 |
+ static void bfq_arm_slice_timer(struct bfq_data *bfqd) |
10830 |
+ { |
10831 |
+ struct bfq_queue *bfqq = bfqd->in_service_queue; |
10832 |
+@@ -1723,64 +2176,36 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) |
10833 |
+ * |
10834 |
+ * To prevent processes with (partly) seeky workloads from |
10835 |
+ * being too ill-treated, grant them a small fraction of the |
10836 |
+- * assigned budget before reducing the waiting time to |
10837 |
+- * BFQ_MIN_TT. This happened to help reduce latency. |
10838 |
+- */ |
10839 |
+- sl = bfqd->bfq_slice_idle; |
10840 |
+- /* |
10841 |
+- * Unless the queue is being weight-raised or the scenario is |
10842 |
+- * asymmetric, grant only minimum idle time if the queue either |
10843 |
+- * has been seeky for long enough or has already proved to be |
10844 |
+- * constantly seeky. |
10845 |
+- */ |
10846 |
+- if (bfq_sample_valid(bfqq->seek_samples) && |
10847 |
+- ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > |
10848 |
+- bfq_max_budget(bfqq->bfqd) / 8) || |
10849 |
+- bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && |
10850 |
+- bfq_symmetric_scenario(bfqd)) |
10851 |
+- sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); |
10852 |
+- else if (bfqq->wr_coeff > 1) |
10853 |
+- sl = sl * 3; |
10854 |
+- bfqd->last_idling_start = ktime_get(); |
10855 |
+- mod_timer(&bfqd->idle_slice_timer, jiffies + sl); |
10856 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
10857 |
+- bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); |
10858 |
+-#endif |
10859 |
+- bfq_log(bfqd, "arm idle: %u/%u ms", |
10860 |
+- jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); |
10861 |
+-} |
10862 |
+- |
10863 |
+-/* |
10864 |
+- * Set the maximum time for the in-service queue to consume its |
10865 |
+- * budget. This prevents seeky processes from lowering the disk |
10866 |
+- * throughput (always guaranteed with a time slice scheme as in CFQ). |
10867 |
+- */ |
10868 |
+-static void bfq_set_budget_timeout(struct bfq_data *bfqd) |
10869 |
+-{ |
10870 |
+- struct bfq_queue *bfqq = bfqd->in_service_queue; |
10871 |
+- unsigned int timeout_coeff; |
10872 |
+- if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) |
10873 |
+- timeout_coeff = 1; |
10874 |
+- else |
10875 |
+- timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; |
10876 |
+- |
10877 |
+- bfqd->last_budget_start = ktime_get(); |
10878 |
+- |
10879 |
+- bfq_clear_bfqq_budget_new(bfqq); |
10880 |
+- bfqq->budget_timeout = jiffies + |
10881 |
+- bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; |
10882 |
++ * assigned budget before reducing the waiting time to |
10883 |
++ * BFQ_MIN_TT. This happened to help reduce latency. |
10884 |
++ */ |
10885 |
++ sl = bfqd->bfq_slice_idle; |
10886 |
++ /* |
10887 |
++ * Unless the queue is being weight-raised or the scenario is |
10888 |
++ * asymmetric, grant only minimum idle time if the queue |
10889 |
++ * is seeky. A long idling is preserved for a weight-raised |
10890 |
++ * queue, or, more in general, in an asymmetric scenario, |
10891 |
++ * because a long idling is needed for guaranteeing to a queue |
10892 |
++ * its reserved share of the throughput (in particular, it is |
10893 |
++ * needed if the queue has a higher weight than some other |
10894 |
++ * queue). |
10895 |
++ */ |
10896 |
++ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && |
10897 |
++ bfq_symmetric_scenario(bfqd)) |
10898 |
++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); |
10899 |
+ |
10900 |
+- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", |
10901 |
+- jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * |
10902 |
+- timeout_coeff)); |
10903 |
++ bfqd->last_idling_start = ktime_get(); |
10904 |
++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); |
10905 |
++ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); |
10906 |
++ bfq_log(bfqd, "arm idle: %u/%u ms", |
10907 |
++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); |
10908 |
+ } |
10909 |
+ |
10910 |
+ /* |
10911 |
+- * Move request from internal lists to the request queue dispatch list. |
10912 |
++ * Move request from internal lists to the dispatch list of the request queue |
10913 |
+ */ |
10914 |
+ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) |
10915 |
+ { |
10916 |
+- struct bfq_data *bfqd = q->elevator->elevator_data; |
10917 |
+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
10918 |
+ |
10919 |
+ /* |
10920 |
+@@ -1794,15 +2219,9 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) |
10921 |
+ * incrementing bfqq->dispatched. |
10922 |
+ */ |
10923 |
+ bfqq->dispatched++; |
10924 |
++ |
10925 |
+ bfq_remove_request(rq); |
10926 |
+ elv_dispatch_sort(q, rq); |
10927 |
+- |
10928 |
+- if (bfq_bfqq_sync(bfqq)) |
10929 |
+- bfqd->sync_flight++; |
10930 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
10931 |
+- bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), |
10932 |
+- rq->cmd_flags); |
10933 |
+-#endif |
10934 |
+ } |
10935 |
+ |
10936 |
+ /* |
10937 |
+@@ -1822,18 +2241,12 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) |
10938 |
+ |
10939 |
+ rq = rq_entry_fifo(bfqq->fifo.next); |
10940 |
+ |
10941 |
+- if (time_before(jiffies, rq->fifo_time)) |
10942 |
++ if (time_is_after_jiffies(rq->fifo_time)) |
10943 |
+ return NULL; |
10944 |
+ |
10945 |
+ return rq; |
10946 |
+ } |
10947 |
+ |
10948 |
+-static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
10949 |
+-{ |
10950 |
+- struct bfq_entity *entity = &bfqq->entity; |
10951 |
+- return entity->budget - entity->service; |
10952 |
+-} |
10953 |
+- |
10954 |
+ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
10955 |
+ { |
10956 |
+ BUG_ON(bfqq != bfqd->in_service_queue); |
10957 |
+@@ -1850,12 +2263,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
10958 |
+ bfq_mark_bfqq_split_coop(bfqq); |
10959 |
+ |
10960 |
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
10961 |
+- /* |
10962 |
+- * Overloading budget_timeout field to store the time |
10963 |
+- * at which the queue remains with no backlog; used by |
10964 |
+- * the weight-raising mechanism. |
10965 |
+- */ |
10966 |
+- bfqq->budget_timeout = jiffies; |
10967 |
++ if (bfqq->dispatched == 0) |
10968 |
++ /* |
10969 |
++ * Overloading budget_timeout field to store |
10970 |
++ * the time at which the queue remains with no |
10971 |
++ * backlog and no outstanding request; used by |
10972 |
++ * the weight-raising mechanism. |
10973 |
++ */ |
10974 |
++ bfqq->budget_timeout = jiffies; |
10975 |
++ |
10976 |
+ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
10977 |
+ } else { |
10978 |
+ bfq_activate_bfqq(bfqd, bfqq); |
10979 |
+@@ -1882,10 +2298,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
10980 |
+ struct request *next_rq; |
10981 |
+ int budget, min_budget; |
10982 |
+ |
10983 |
+- budget = bfqq->max_budget; |
10984 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
10985 |
++ |
10986 |
+ min_budget = bfq_min_budget(bfqd); |
10987 |
+ |
10988 |
+- BUG_ON(bfqq != bfqd->in_service_queue); |
10989 |
++ if (bfqq->wr_coeff == 1) |
10990 |
++ budget = bfqq->max_budget; |
10991 |
++ else /* |
10992 |
++ * Use a constant, low budget for weight-raised queues, |
10993 |
++ * to help achieve a low latency. Keep it slightly higher |
10994 |
++ * than the minimum possible budget, to cause a little |
10995 |
++ * bit fewer expirations. |
10996 |
++ */ |
10997 |
++ budget = 2 * min_budget; |
10998 |
+ |
10999 |
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", |
11000 |
+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); |
11001 |
+@@ -1894,7 +2319,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
11002 |
+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", |
11003 |
+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); |
11004 |
+ |
11005 |
+- if (bfq_bfqq_sync(bfqq)) { |
11006 |
++ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { |
11007 |
+ switch (reason) { |
11008 |
+ /* |
11009 |
+ * Caveat: in all the following cases we trade latency |
11010 |
+@@ -1936,14 +2361,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
11011 |
+ break; |
11012 |
+ case BFQ_BFQQ_BUDGET_TIMEOUT: |
11013 |
+ /* |
11014 |
+- * We double the budget here because: 1) it |
11015 |
+- * gives the chance to boost the throughput if |
11016 |
+- * this is not a seeky process (which may have |
11017 |
+- * bumped into this timeout because of, e.g., |
11018 |
+- * ZBR), 2) together with charge_full_budget |
11019 |
+- * it helps give seeky processes higher |
11020 |
+- * timestamps, and hence be served less |
11021 |
+- * frequently. |
11022 |
++ * We double the budget here because it gives |
11023 |
++ * the chance to boost the throughput if this |
11024 |
++ * is not a seeky process (and has bumped into |
11025 |
++ * this timeout because of, e.g., ZBR). |
11026 |
+ */ |
11027 |
+ budget = min(budget * 2, bfqd->bfq_max_budget); |
11028 |
+ break; |
11029 |
+@@ -1960,17 +2381,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
11030 |
+ budget = min(budget * 4, bfqd->bfq_max_budget); |
11031 |
+ break; |
11032 |
+ case BFQ_BFQQ_NO_MORE_REQUESTS: |
11033 |
+- /* |
11034 |
+- * Leave the budget unchanged. |
11035 |
+- */ |
11036 |
++ /* |
11037 |
++ * For queues that expire for this reason, it |
11038 |
++ * is particularly important to keep the |
11039 |
++ * budget close to the actual service they |
11040 |
++ * need. Doing so reduces the timestamp |
11041 |
++ * misalignment problem described in the |
11042 |
++ * comments in the body of |
11043 |
++ * __bfq_activate_entity. In fact, suppose |
11044 |
++ * that a queue systematically expires for |
11045 |
++ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a |
11046 |
++ * new request in time to enjoy timestamp |
11047 |
++ * back-shifting. The larger the budget of the |
11048 |
++ * queue is with respect to the service the |
11049 |
++ * queue actually requests in each service |
11050 |
++ * slot, the more times the queue can be |
11051 |
++ * reactivated with the same virtual finish |
11052 |
++ * time. It follows that, even if this finish |
11053 |
++ * time is pushed to the system virtual time |
11054 |
++ * to reduce the consequent timestamp |
11055 |
++ * misalignment, the queue unjustly enjoys for |
11056 |
++ * many re-activations a lower finish time |
11057 |
++ * than all newly activated queues. |
11058 |
++ * |
11059 |
++ * The service needed by bfqq is measured |
11060 |
++ * quite precisely by bfqq->entity.service. |
11061 |
++ * Since bfqq does not enjoy device idling, |
11062 |
++ * bfqq->entity.service is equal to the number |
11063 |
++ * of sectors that the process associated with |
11064 |
++ * bfqq requested to read/write before waiting |
11065 |
++ * for request completions, or blocking for |
11066 |
++ * other reasons. |
11067 |
++ */ |
11068 |
++ budget = max_t(int, bfqq->entity.service, min_budget); |
11069 |
++ break; |
11070 |
+ default: |
11071 |
+ return; |
11072 |
+ } |
11073 |
+- } else |
11074 |
++ } else if (!bfq_bfqq_sync(bfqq)) |
11075 |
+ /* |
11076 |
+- * Async queues get always the maximum possible budget |
11077 |
+- * (their ability to dispatch is limited by |
11078 |
+- * @bfqd->bfq_max_budget_async_rq). |
11079 |
++ * Async queues get always the maximum possible |
11080 |
++ * budget, as for them we do not care about latency |
11081 |
++ * (in addition, their ability to dispatch is limited |
11082 |
++ * by the charging factor). |
11083 |
+ */ |
11084 |
+ budget = bfqd->bfq_max_budget; |
11085 |
+ |
11086 |
+@@ -1981,65 +2434,105 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
11087 |
+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); |
11088 |
+ |
11089 |
+ /* |
11090 |
+- * Make sure that we have enough budget for the next request. |
11091 |
+- * Since the finish time of the bfqq must be kept in sync with |
11092 |
+- * the budget, be sure to call __bfq_bfqq_expire() after the |
11093 |
++ * If there is still backlog, then assign a new budget, making |
11094 |
++ * sure that it is large enough for the next request. Since |
11095 |
++ * the finish time of bfqq must be kept in sync with the |
11096 |
++ * budget, be sure to call __bfq_bfqq_expire() *after* this |
11097 |
+ * update. |
11098 |
++ * |
11099 |
++ * If there is no backlog, then no need to update the budget; |
11100 |
++ * it will be updated on the arrival of a new request. |
11101 |
+ */ |
11102 |
+ next_rq = bfqq->next_rq; |
11103 |
+- if (next_rq) |
11104 |
++ if (next_rq) { |
11105 |
++ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || |
11106 |
++ reason == BFQ_BFQQ_NO_MORE_REQUESTS); |
11107 |
+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, |
11108 |
+ bfq_serv_to_charge(next_rq, bfqq)); |
11109 |
+- else |
11110 |
+- bfqq->entity.budget = bfqq->max_budget; |
11111 |
++ BUG_ON(!bfq_bfqq_busy(bfqq)); |
11112 |
++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); |
11113 |
++ } |
11114 |
+ |
11115 |
+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", |
11116 |
+ next_rq ? blk_rq_sectors(next_rq) : 0, |
11117 |
+ bfqq->entity.budget); |
11118 |
+ } |
11119 |
+ |
11120 |
+-static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) |
11121 |
++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) |
11122 |
+ { |
11123 |
+- unsigned long max_budget; |
11124 |
+- |
11125 |
+ /* |
11126 |
+ * The max_budget calculated when autotuning is equal to the |
11127 |
+- * amount of sectors transfered in timeout_sync at the |
11128 |
++ * amount of sectors transferred in timeout at the |
11129 |
+ * estimated peak rate. |
11130 |
+ */ |
11131 |
+- max_budget = (unsigned long)(peak_rate * 1000 * |
11132 |
+- timeout >> BFQ_RATE_SHIFT); |
11133 |
+- |
11134 |
+- return max_budget; |
11135 |
++ return bfqd->peak_rate * 1000 * jiffies_to_msecs(bfqd->bfq_timeout) >> |
11136 |
++ BFQ_RATE_SHIFT; |
11137 |
+ } |
11138 |
+ |
11139 |
+ /* |
11140 |
+- * In addition to updating the peak rate, checks whether the process |
11141 |
+- * is "slow", and returns 1 if so. This slow flag is used, in addition |
11142 |
+- * to the budget timeout, to reduce the amount of service provided to |
11143 |
+- * seeky processes, and hence reduce their chances to lower the |
11144 |
+- * throughput. See the code for more details. |
11145 |
++ * Update the read peak rate (quantity used for auto-tuning) as a |
11146 |
++ * function of the rate at which bfqq has been served, and check |
11147 |
++ * whether the process associated with bfqq is "slow". Return true if |
11148 |
++ * the process is slow. The slow flag is used, in addition to the |
11149 |
++ * budget timeout, to reduce the amount of service provided to seeky |
11150 |
++ * processes, and hence reduce their chances to lower the |
11151 |
++ * throughput. More details in the body of the function. |
11152 |
++ * |
11153 |
++ * An important observation is in order: with devices with internal |
11154 |
++ * queues, it is hard if ever possible to know when and for how long |
11155 |
++ * an I/O request is processed by the device (apart from the trivial |
11156 |
++ * I/O pattern where a new request is dispatched only after the |
11157 |
++ * previous one has been completed). This makes it hard to evaluate |
11158 |
++ * the real rate at which the I/O requests of each bfq_queue are |
11159 |
++ * served. In fact, for an I/O scheduler like BFQ, serving a |
11160 |
++ * bfq_queue means just dispatching its requests during its service |
11161 |
++ * slot, i.e., until the budget of the queue is exhausted, or the |
11162 |
++ * queue remains idle, or, finally, a timeout fires. But, during the |
11163 |
++ * service slot of a bfq_queue, the device may be still processing |
11164 |
++ * requests of bfq_queues served in previous service slots. On the |
11165 |
++ * opposite end, the requests of the in-service bfq_queue may be |
11166 |
++ * completed after the service slot of the queue finishes. Anyway, |
11167 |
++ * unless more sophisticated solutions are used (where possible), the |
11168 |
++ * sum of the sizes of the requests dispatched during the service slot |
11169 |
++ * of a bfq_queue is probably the only approximation available for |
11170 |
++ * the service received by the bfq_queue during its service slot. And, |
11171 |
++ * as written above, this sum is the quantity used in this function to |
11172 |
++ * evaluate the peak rate. |
11173 |
+ */ |
11174 |
+ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
11175 |
+- bool compensate, enum bfqq_expiration reason) |
11176 |
++ bool compensate, enum bfqq_expiration reason, |
11177 |
++ unsigned long *delta_ms) |
11178 |
+ { |
11179 |
+- u64 bw, usecs, expected, timeout; |
11180 |
+- ktime_t delta; |
11181 |
++ u64 bw, bwdiv10, delta_usecs, delta_ms_tmp; |
11182 |
++ ktime_t delta_ktime; |
11183 |
+ int update = 0; |
11184 |
++ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ |
11185 |
+ |
11186 |
+- if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) |
11187 |
++ if (!bfq_bfqq_sync(bfqq)) |
11188 |
+ return false; |
11189 |
+ |
11190 |
+ if (compensate) |
11191 |
+- delta = bfqd->last_idling_start; |
11192 |
++ delta_ktime = bfqd->last_idling_start; |
11193 |
+ else |
11194 |
+- delta = ktime_get(); |
11195 |
+- delta = ktime_sub(delta, bfqd->last_budget_start); |
11196 |
+- usecs = ktime_to_us(delta); |
11197 |
++ delta_ktime = ktime_get(); |
11198 |
++ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); |
11199 |
++ delta_usecs = ktime_to_us(delta_ktime); |
11200 |
+ |
11201 |
+ /* Don't trust short/unrealistic values. */ |
11202 |
+- if (usecs < 100 || usecs >= LONG_MAX) |
11203 |
+- return false; |
11204 |
++ if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { |
11205 |
++ if (blk_queue_nonrot(bfqd->queue)) |
11206 |
++ *delta_ms = BFQ_MIN_TT; /* give same worst-case |
11207 |
++ guarantees as |
11208 |
++ idling for seeky |
11209 |
++ */ |
11210 |
++ else /* Charge at least one seek */ |
11211 |
++ *delta_ms = jiffies_to_msecs(bfq_slice_idle); |
11212 |
++ return slow; |
11213 |
++ } |
11214 |
++ |
11215 |
++ delta_ms_tmp = delta_usecs; |
11216 |
++ do_div(delta_ms_tmp, 1000); |
11217 |
++ *delta_ms = delta_ms_tmp; |
11218 |
+ |
11219 |
+ /* |
11220 |
+ * Calculate the bandwidth for the last slice. We use a 64 bit |
11221 |
+@@ -2048,32 +2541,51 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
11222 |
+ * and to avoid overflows. |
11223 |
+ */ |
11224 |
+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; |
11225 |
+- do_div(bw, (unsigned long)usecs); |
11226 |
+- |
11227 |
+- timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
11228 |
++ do_div(bw, (unsigned long)delta_usecs); |
11229 |
+ |
11230 |
++ bfq_log(bfqd, "measured bw = %llu sects/sec", |
11231 |
++ (1000000*bw)>>BFQ_RATE_SHIFT); |
11232 |
+ /* |
11233 |
+ * Use only long (> 20ms) intervals to filter out spikes for |
11234 |
+ * the peak rate estimation. |
11235 |
+ */ |
11236 |
+- if (usecs > 20000) { |
11237 |
++ if (delta_usecs > 20000) { |
11238 |
++ bool fully_sequential = bfqq->seek_history == 0; |
11239 |
++ /* |
11240 |
++ * Soft real-time queues are not good candidates for |
11241 |
++ * evaluating bw, as they are likely to be slow even |
11242 |
++ * if sequential. |
11243 |
++ */ |
11244 |
++ bool non_soft_rt = bfqq->wr_coeff == 1 || |
11245 |
++ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time; |
11246 |
++ bool consumed_large_budget = |
11247 |
++ reason == BFQ_BFQQ_BUDGET_EXHAUSTED && |
11248 |
++ bfqq->entity.budget >= bfqd->bfq_max_budget * 2 / 3; |
11249 |
++ bool served_for_long_time = |
11250 |
++ reason == BFQ_BFQQ_BUDGET_TIMEOUT || |
11251 |
++ consumed_large_budget; |
11252 |
++ |
11253 |
++ BUG_ON(bfqq->seek_history == 0 && |
11254 |
++ hweight32(bfqq->seek_history) != 0); |
11255 |
++ |
11256 |
+ if (bw > bfqd->peak_rate || |
11257 |
+- (!BFQQ_SEEKY(bfqq) && |
11258 |
+- reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { |
11259 |
+- bfq_log(bfqd, "measured bw =%llu", bw); |
11260 |
++ (bfq_bfqq_sync(bfqq) && fully_sequential && non_soft_rt && |
11261 |
++ served_for_long_time)) { |
11262 |
+ /* |
11263 |
+ * To smooth oscillations use a low-pass filter with |
11264 |
+- * alpha=7/8, i.e., |
11265 |
+- * new_rate = (7/8) * old_rate + (1/8) * bw |
11266 |
++ * alpha=9/10, i.e., |
11267 |
++ * new_rate = (9/10) * old_rate + (1/10) * bw |
11268 |
+ */ |
11269 |
+- do_div(bw, 8); |
11270 |
+- if (bw == 0) |
11271 |
+- return 0; |
11272 |
+- bfqd->peak_rate *= 7; |
11273 |
+- do_div(bfqd->peak_rate, 8); |
11274 |
+- bfqd->peak_rate += bw; |
11275 |
++ bwdiv10 = bw; |
11276 |
++ do_div(bwdiv10, 10); |
11277 |
++ if (bwdiv10 == 0) |
11278 |
++ return false; /* bw too low to be used */ |
11279 |
++ bfqd->peak_rate *= 9; |
11280 |
++ do_div(bfqd->peak_rate, 10); |
11281 |
++ bfqd->peak_rate += bwdiv10; |
11282 |
+ update = 1; |
11283 |
+- bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); |
11284 |
++ bfq_log(bfqd, "new peak_rate = %llu sects/sec", |
11285 |
++ (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT); |
11286 |
+ } |
11287 |
+ |
11288 |
+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; |
11289 |
+@@ -2086,9 +2598,8 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
11290 |
+ int dev_type = blk_queue_nonrot(bfqd->queue); |
11291 |
+ if (bfqd->bfq_user_max_budget == 0) { |
11292 |
+ bfqd->bfq_max_budget = |
11293 |
+- bfq_calc_max_budget(bfqd->peak_rate, |
11294 |
+- timeout); |
11295 |
+- bfq_log(bfqd, "new max_budget=%d", |
11296 |
++ bfq_calc_max_budget(bfqd); |
11297 |
++ bfq_log(bfqd, "new max_budget = %d", |
11298 |
+ bfqd->bfq_max_budget); |
11299 |
+ } |
11300 |
+ if (bfqd->device_speed == BFQ_BFQD_FAST && |
11301 |
+@@ -2102,38 +2613,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
11302 |
+ bfqd->RT_prod = R_fast[dev_type] * |
11303 |
+ T_fast[dev_type]; |
11304 |
+ } |
11305 |
++ bfq_log(bfqd, "dev_speed_class = %d (%d sects/sec), " |
11306 |
++ "thresh %d setcs/sec", |
11307 |
++ bfqd->device_speed, |
11308 |
++ bfqd->device_speed == BFQ_BFQD_FAST ? |
11309 |
++ (1000000*R_fast[dev_type])>>BFQ_RATE_SHIFT : |
11310 |
++ (1000000*R_slow[dev_type])>>BFQ_RATE_SHIFT, |
11311 |
++ (1000000*device_speed_thresh[dev_type])>> |
11312 |
++ BFQ_RATE_SHIFT); |
11313 |
+ } |
11314 |
++ /* |
11315 |
++ * Caveat: processes doing IO in the slower disk zones |
11316 |
++ * tend to be slow(er) even if not seeky. In this |
11317 |
++ * respect, the estimated peak rate is likely to be an |
11318 |
++ * average over the disk surface. Accordingly, to not |
11319 |
++ * be too harsh with unlucky processes, a process is |
11320 |
++ * deemed slow only if its bw has been lower than half |
11321 |
++ * of the estimated peak rate. |
11322 |
++ */ |
11323 |
++ slow = bw < bfqd->peak_rate / 2; |
11324 |
+ } |
11325 |
+ |
11326 |
+- /* |
11327 |
+- * If the process has been served for a too short time |
11328 |
+- * interval to let its possible sequential accesses prevail on |
11329 |
+- * the initial seek time needed to move the disk head on the |
11330 |
+- * first sector it requested, then give the process a chance |
11331 |
+- * and for the moment return false. |
11332 |
+- */ |
11333 |
+- if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) |
11334 |
+- return false; |
11335 |
+- |
11336 |
+- /* |
11337 |
+- * A process is considered ``slow'' (i.e., seeky, so that we |
11338 |
+- * cannot treat it fairly in the service domain, as it would |
11339 |
+- * slow down too much the other processes) if, when a slice |
11340 |
+- * ends for whatever reason, it has received service at a |
11341 |
+- * rate that would not be high enough to complete the budget |
11342 |
+- * before the budget timeout expiration. |
11343 |
+- */ |
11344 |
+- expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; |
11345 |
++ bfq_log_bfqq(bfqd, bfqq, |
11346 |
++ "update_peak_rate: bw %llu sect/s, peak rate %llu, " |
11347 |
++ "slow %d", |
11348 |
++ (1000000*bw)>>BFQ_RATE_SHIFT, |
11349 |
++ (1000000*bfqd->peak_rate)>>BFQ_RATE_SHIFT, |
11350 |
++ bw < bfqd->peak_rate / 2); |
11351 |
+ |
11352 |
+- /* |
11353 |
+- * Caveat: processes doing IO in the slower disk zones will |
11354 |
+- * tend to be slow(er) even if not seeky. And the estimated |
11355 |
+- * peak rate will actually be an average over the disk |
11356 |
+- * surface. Hence, to not be too harsh with unlucky processes, |
11357 |
+- * we keep a budget/3 margin of safety before declaring a |
11358 |
+- * process slow. |
11359 |
+- */ |
11360 |
+- return expected > (4 * bfqq->entity.budget) / 3; |
11361 |
++ return slow; |
11362 |
+ } |
11363 |
+ |
11364 |
+ /* |
11365 |
+@@ -2191,6 +2699,15 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
11366 |
+ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, |
11367 |
+ struct bfq_queue *bfqq) |
11368 |
+ { |
11369 |
++ bfq_log_bfqq(bfqd, bfqq, |
11370 |
++ "softrt_next_start: service_blkg %lu " |
11371 |
++ "soft_rate %u sects/sec" |
11372 |
++ "interval %u", |
11373 |
++ bfqq->service_from_backlogged, |
11374 |
++ bfqd->bfq_wr_max_softrt_rate, |
11375 |
++ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / |
11376 |
++ bfqd->bfq_wr_max_softrt_rate)); |
11377 |
++ |
11378 |
+ return max(bfqq->last_idle_bklogged + |
11379 |
+ HZ * bfqq->service_from_backlogged / |
11380 |
+ bfqd->bfq_wr_max_softrt_rate, |
11381 |
+@@ -2198,13 +2715,21 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, |
11382 |
+ } |
11383 |
+ |
11384 |
+ /* |
11385 |
+- * Return the largest-possible time instant such that, for as long as possible, |
11386 |
+- * the current time will be lower than this time instant according to the macro |
11387 |
+- * time_is_before_jiffies(). |
11388 |
++ * Return the farthest future time instant according to jiffies |
11389 |
++ * macros. |
11390 |
++ */ |
11391 |
++static unsigned long bfq_greatest_from_now(void) |
11392 |
++{ |
11393 |
++ return jiffies + MAX_JIFFY_OFFSET; |
11394 |
++} |
11395 |
++ |
11396 |
++/* |
11397 |
++ * Return the farthest past time instant according to jiffies |
11398 |
++ * macros. |
11399 |
+ */ |
11400 |
+-static unsigned long bfq_infinity_from_now(unsigned long now) |
11401 |
++static unsigned long bfq_smallest_from_now(void) |
11402 |
+ { |
11403 |
+- return now + ULONG_MAX / 2; |
11404 |
++ return jiffies - MAX_JIFFY_OFFSET; |
11405 |
+ } |
11406 |
+ |
11407 |
+ /** |
11408 |
+@@ -2214,28 +2739,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now) |
11409 |
+ * @compensate: if true, compensate for the time spent idling. |
11410 |
+ * @reason: the reason causing the expiration. |
11411 |
+ * |
11412 |
++ * If the process associated with bfqq does slow I/O (e.g., because it |
11413 |
++ * issues random requests), we charge bfqq with the time it has been |
11414 |
++ * in service instead of the service it has received (see |
11415 |
++ * bfq_bfqq_charge_time for details on how this goal is achieved). As |
11416 |
++ * a consequence, bfqq will typically get higher timestamps upon |
11417 |
++ * reactivation, and hence it will be rescheduled as if it had |
11418 |
++ * received more service than what it has actually received. In the |
11419 |
++ * end, bfqq receives less service in proportion to how slowly its |
11420 |
++ * associated process consumes its budgets (and hence how seriously it |
11421 |
++ * tends to lower the throughput). In addition, this time-charging |
11422 |
++ * strategy guarantees time fairness among slow processes. In |
11423 |
++ * contrast, if the process associated with bfqq is not slow, we |
11424 |
++ * charge bfqq exactly with the service it has received. |
11425 |
+ * |
11426 |
+- * If the process associated to the queue is slow (i.e., seeky), or in |
11427 |
+- * case of budget timeout, or, finally, if it is async, we |
11428 |
+- * artificially charge it an entire budget (independently of the |
11429 |
+- * actual service it received). As a consequence, the queue will get |
11430 |
+- * higher timestamps than the correct ones upon reactivation, and |
11431 |
+- * hence it will be rescheduled as if it had received more service |
11432 |
+- * than what it actually received. In the end, this class of processes |
11433 |
+- * will receive less service in proportion to how slowly they consume |
11434 |
+- * their budgets (and hence how seriously they tend to lower the |
11435 |
+- * throughput). |
11436 |
+- * |
11437 |
+- * In contrast, when a queue expires because it has been idling for |
11438 |
+- * too much or because it exhausted its budget, we do not touch the |
11439 |
+- * amount of service it has received. Hence when the queue will be |
11440 |
+- * reactivated and its timestamps updated, the latter will be in sync |
11441 |
+- * with the actual service received by the queue until expiration. |
11442 |
+- * |
11443 |
+- * Charging a full budget to the first type of queues and the exact |
11444 |
+- * service to the others has the effect of using the WF2Q+ policy to |
11445 |
+- * schedule the former on a timeslice basis, without violating the |
11446 |
+- * service domain guarantees of the latter. |
11447 |
++ * Charging time to the first type of queues and the exact service to |
11448 |
++ * the other has the effect of using the WF2Q+ policy to schedule the |
11449 |
++ * former on a timeslice basis, without violating service domain |
11450 |
++ * guarantees among the latter. |
11451 |
+ */ |
11452 |
+ static void bfq_bfqq_expire(struct bfq_data *bfqd, |
11453 |
+ struct bfq_queue *bfqq, |
11454 |
+@@ -2243,40 +2764,51 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, |
11455 |
+ enum bfqq_expiration reason) |
11456 |
+ { |
11457 |
+ bool slow; |
11458 |
++ unsigned long delta = 0; |
11459 |
++ struct bfq_entity *entity = &bfqq->entity; |
11460 |
++ |
11461 |
+ BUG_ON(bfqq != bfqd->in_service_queue); |
11462 |
+ |
11463 |
+ /* |
11464 |
+- * Update disk peak rate for autotuning and check whether the |
11465 |
++ * Update device peak rate for autotuning and check whether the |
11466 |
+ * process is slow (see bfq_update_peak_rate). |
11467 |
+ */ |
11468 |
+- slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); |
11469 |
++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason, &delta); |
11470 |
+ |
11471 |
+ /* |
11472 |
+- * As above explained, 'punish' slow (i.e., seeky), timed-out |
11473 |
+- * and async queues, to favor sequential sync workloads. |
11474 |
+- * |
11475 |
+- * Processes doing I/O in the slower disk zones will tend to be |
11476 |
+- * slow(er) even if not seeky. Hence, since the estimated peak |
11477 |
+- * rate is actually an average over the disk surface, these |
11478 |
+- * processes may timeout just for bad luck. To avoid punishing |
11479 |
+- * them we do not charge a full budget to a process that |
11480 |
+- * succeeded in consuming at least 2/3 of its budget. |
11481 |
++ * Increase service_from_backlogged before next statement, |
11482 |
++ * because the possible next invocation of |
11483 |
++ * bfq_bfqq_charge_time would likely inflate |
11484 |
++ * entity->service. In contrast, service_from_backlogged must |
11485 |
++ * contain real service, to enable the soft real-time |
11486 |
++ * heuristic to correctly compute the bandwidth consumed by |
11487 |
++ * bfqq. |
11488 |
+ */ |
11489 |
+- if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && |
11490 |
+- bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) |
11491 |
+- bfq_bfqq_charge_full_budget(bfqq); |
11492 |
++ bfqq->service_from_backlogged += entity->service; |
11493 |
+ |
11494 |
+- bfqq->service_from_backlogged += bfqq->entity.service; |
11495 |
+- |
11496 |
+- if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && |
11497 |
+- !bfq_bfqq_constantly_seeky(bfqq)) { |
11498 |
+- bfq_mark_bfqq_constantly_seeky(bfqq); |
11499 |
+- if (!blk_queue_nonrot(bfqd->queue)) |
11500 |
+- bfqd->const_seeky_busy_in_flight_queues++; |
11501 |
+- } |
11502 |
++ /* |
11503 |
++ * As explained above, charge slow (typically seeky) and |
11504 |
++ * timed-out queues with the time and not the service |
11505 |
++ * received, to favor sequential workloads. |
11506 |
++ * |
11507 |
++ * Processes doing I/O in the slower disk zones will tend to |
11508 |
++ * be slow(er) even if not seeky. Therefore, since the |
11509 |
++ * estimated peak rate is actually an average over the disk |
11510 |
++ * surface, these processes may timeout just for bad luck. To |
11511 |
++ * avoid punishing them, do not charge time to processes that |
11512 |
++ * succeeded in consuming at least 2/3 of their budget. This |
11513 |
++ * allows BFQ to preserve enough elasticity to still perform |
11514 |
++ * bandwidth, and not time, distribution with slightly unlucky |
11515 |
++ * or quasi-sequential processes. |
11516 |
++ */ |
11517 |
++ if (bfqq->wr_coeff == 1 && |
11518 |
++ (slow || |
11519 |
++ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && |
11520 |
++ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) |
11521 |
++ bfq_bfqq_charge_time(bfqd, bfqq, delta); |
11522 |
+ |
11523 |
+ if (reason == BFQ_BFQQ_TOO_IDLE && |
11524 |
+- bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) |
11525 |
++ entity->service <= 2 * entity->budget / 10 ) |
11526 |
+ bfq_clear_bfqq_IO_bound(bfqq); |
11527 |
+ |
11528 |
+ if (bfqd->low_latency && bfqq->wr_coeff == 1) |
11529 |
+@@ -2285,19 +2817,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, |
11530 |
+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && |
11531 |
+ RB_EMPTY_ROOT(&bfqq->sort_list)) { |
11532 |
+ /* |
11533 |
+- * If we get here, and there are no outstanding requests, |
11534 |
+- * then the request pattern is isochronous (see the comments |
11535 |
+- * to the function bfq_bfqq_softrt_next_start()). Hence we |
11536 |
+- * can compute soft_rt_next_start. If, instead, the queue |
11537 |
+- * still has outstanding requests, then we have to wait |
11538 |
+- * for the completion of all the outstanding requests to |
11539 |
++ * If we get here, and there are no outstanding |
11540 |
++ * requests, then the request pattern is isochronous |
11541 |
++ * (see the comments on the function |
11542 |
++ * bfq_bfqq_softrt_next_start()). Thus we can compute |
11543 |
++ * soft_rt_next_start. If, instead, the queue still |
11544 |
++ * has outstanding requests, then we have to wait for |
11545 |
++ * the completion of all the outstanding requests to |
11546 |
+ * discover whether the request pattern is actually |
11547 |
+ * isochronous. |
11548 |
+ */ |
11549 |
+- if (bfqq->dispatched == 0) |
11550 |
++ BUG_ON(bfqd->busy_queues < 1); |
11551 |
++ if (bfqq->dispatched == 0) { |
11552 |
+ bfqq->soft_rt_next_start = |
11553 |
+ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
11554 |
+- else { |
11555 |
++ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", |
11556 |
++ bfqq->soft_rt_next_start); |
11557 |
++ } else { |
11558 |
+ /* |
11559 |
+ * The application is still waiting for the |
11560 |
+ * completion of one or more requests: |
11561 |
+@@ -2314,7 +2850,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, |
11562 |
+ * happened to be in the past. |
11563 |
+ */ |
11564 |
+ bfqq->soft_rt_next_start = |
11565 |
+- bfq_infinity_from_now(jiffies); |
11566 |
++ bfq_greatest_from_now(); |
11567 |
+ /* |
11568 |
+ * Schedule an update of soft_rt_next_start to when |
11569 |
+ * the task may be discovered to be isochronous. |
11570 |
+@@ -2324,8 +2860,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, |
11571 |
+ } |
11572 |
+ |
11573 |
+ bfq_log_bfqq(bfqd, bfqq, |
11574 |
+- "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, |
11575 |
+- slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); |
11576 |
++ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", |
11577 |
++ reason, slow, bfqq->dispatched, |
11578 |
++ bfq_bfqq_idle_window(bfqq), entity->weight); |
11579 |
+ |
11580 |
+ /* |
11581 |
+ * Increase, decrease or leave budget unchanged according to |
11582 |
+@@ -2333,6 +2870,14 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, |
11583 |
+ */ |
11584 |
+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); |
11585 |
+ __bfq_bfqq_expire(bfqd, bfqq); |
11586 |
++ |
11587 |
++ BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && |
11588 |
++ !bfq_class_idle(bfqq)); |
11589 |
++ |
11590 |
++ if (!bfq_bfqq_busy(bfqq) && |
11591 |
++ reason != BFQ_BFQQ_BUDGET_TIMEOUT && |
11592 |
++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) |
11593 |
++ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); |
11594 |
+ } |
11595 |
+ |
11596 |
+ /* |
11597 |
+@@ -2342,20 +2887,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, |
11598 |
+ */ |
11599 |
+ static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) |
11600 |
+ { |
11601 |
+- if (bfq_bfqq_budget_new(bfqq) || |
11602 |
+- time_before(jiffies, bfqq->budget_timeout)) |
11603 |
+- return false; |
11604 |
+- return true; |
11605 |
++ return time_is_before_eq_jiffies(bfqq->budget_timeout); |
11606 |
+ } |
11607 |
+ |
11608 |
+ /* |
11609 |
+- * If we expire a queue that is waiting for the arrival of a new |
11610 |
+- * request, we may prevent the fictitious timestamp back-shifting that |
11611 |
+- * allows the guarantees of the queue to be preserved (see [1] for |
11612 |
+- * this tricky aspect). Hence we return true only if this condition |
11613 |
+- * does not hold, or if the queue is slow enough to deserve only to be |
11614 |
+- * kicked off for preserving a high throughput. |
11615 |
+-*/ |
11616 |
++ * If we expire a queue that is actively waiting (i.e., with the |
11617 |
++ * device idled) for the arrival of a new request, then we may incur |
11618 |
++ * the timestamp misalignment problem described in the body of the |
11619 |
++ * function __bfq_activate_entity. Hence we return true only if this |
11620 |
++ * condition does not hold, or if the queue is slow enough to deserve |
11621 |
++ * only to be kicked off for preserving a high throughput. |
11622 |
++ */ |
11623 |
+ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) |
11624 |
+ { |
11625 |
+ bfq_log_bfqq(bfqq->bfqd, bfqq, |
11626 |
+@@ -2397,10 +2939,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
11627 |
+ { |
11628 |
+ struct bfq_data *bfqd = bfqq->bfqd; |
11629 |
+ bool idling_boosts_thr, idling_boosts_thr_without_issues, |
11630 |
+- all_queues_seeky, on_hdd_and_not_all_queues_seeky, |
11631 |
+ idling_needed_for_service_guarantees, |
11632 |
+ asymmetric_scenario; |
11633 |
+ |
11634 |
++ if (bfqd->strict_guarantees) |
11635 |
++ return true; |
11636 |
++ |
11637 |
+ /* |
11638 |
+ * The next variable takes into account the cases where idling |
11639 |
+ * boosts the throughput. |
11640 |
+@@ -2422,7 +2966,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
11641 |
+ */ |
11642 |
+ idling_boosts_thr = !bfqd->hw_tag || |
11643 |
+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && |
11644 |
+- bfq_bfqq_idle_window(bfqq)) ; |
11645 |
++ bfq_bfqq_idle_window(bfqq)); |
11646 |
+ |
11647 |
+ /* |
11648 |
+ * The value of the next variable, |
11649 |
+@@ -2463,74 +3007,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
11650 |
+ bfqd->wr_busy_queues == 0; |
11651 |
+ |
11652 |
+ /* |
11653 |
+- * There are then two cases where idling must be performed not |
11654 |
++ * There is then a case where idling must be performed not |
11655 |
+ * for throughput concerns, but to preserve service |
11656 |
+- * guarantees. In the description of these cases, we say, for |
11657 |
+- * short, that a queue is sequential/random if the process |
11658 |
+- * associated to the queue issues sequential/random requests |
11659 |
+- * (in the second case the queue may be tagged as seeky or |
11660 |
+- * even constantly_seeky). |
11661 |
+- * |
11662 |
+- * To introduce the first case, we note that, since |
11663 |
+- * bfq_bfqq_idle_window(bfqq) is false if the device is |
11664 |
+- * NCQ-capable and bfqq is random (see |
11665 |
+- * bfq_update_idle_window()), then, from the above two |
11666 |
+- * assignments it follows that |
11667 |
+- * idling_boosts_thr_without_issues is false if the device is |
11668 |
+- * NCQ-capable and bfqq is random. Therefore, for this case, |
11669 |
+- * device idling would never be allowed if we used just |
11670 |
+- * idling_boosts_thr_without_issues to decide whether to allow |
11671 |
+- * it. And, beneficially, this would imply that throughput |
11672 |
+- * would always be boosted also with random I/O on NCQ-capable |
11673 |
+- * HDDs. |
11674 |
+- * |
11675 |
+- * But we must be careful on this point, to avoid an unfair |
11676 |
+- * treatment for bfqq. In fact, because of the same above |
11677 |
+- * assignments, idling_boosts_thr_without_issues is, on the |
11678 |
+- * other hand, true if 1) the device is an HDD and bfqq is |
11679 |
+- * sequential, and 2) there are no busy weight-raised |
11680 |
+- * queues. As a consequence, if we used just |
11681 |
+- * idling_boosts_thr_without_issues to decide whether to idle |
11682 |
+- * the device, then with an HDD we might easily bump into a |
11683 |
+- * scenario where queues that are sequential and I/O-bound |
11684 |
+- * would enjoy idling, whereas random queues would not. The |
11685 |
+- * latter might then get a low share of the device throughput, |
11686 |
+- * simply because the former would get many requests served |
11687 |
+- * after being set as in service, while the latter would not. |
11688 |
++ * guarantees. |
11689 |
+ * |
11690 |
+- * To address this issue, we start by setting to true a |
11691 |
+- * sentinel variable, on_hdd_and_not_all_queues_seeky, if the |
11692 |
+- * device is rotational and not all queues with pending or |
11693 |
+- * in-flight requests are constantly seeky (i.e., there are |
11694 |
+- * active sequential queues, and bfqq might then be mistreated |
11695 |
+- * if it does not enjoy idling because it is random). |
11696 |
+- */ |
11697 |
+- all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && |
11698 |
+- bfqd->busy_in_flight_queues == |
11699 |
+- bfqd->const_seeky_busy_in_flight_queues; |
11700 |
+- |
11701 |
+- on_hdd_and_not_all_queues_seeky = |
11702 |
+- !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; |
11703 |
+- |
11704 |
+- /* |
11705 |
+- * To introduce the second case where idling needs to be |
11706 |
+- * performed to preserve service guarantees, we can note that |
11707 |
+- * allowing the drive to enqueue more than one request at a |
11708 |
+- * time, and hence delegating de facto final scheduling |
11709 |
+- * decisions to the drive's internal scheduler, causes loss of |
11710 |
+- * control on the actual request service order. In particular, |
11711 |
+- * the critical situation is when requests from different |
11712 |
+- * processes happens to be present, at the same time, in the |
11713 |
+- * internal queue(s) of the drive. In such a situation, the |
11714 |
+- * drive, by deciding the service order of the |
11715 |
+- * internally-queued requests, does determine also the actual |
11716 |
+- * throughput distribution among these processes. But the |
11717 |
+- * drive typically has no notion or concern about per-process |
11718 |
+- * throughput distribution, and makes its decisions only on a |
11719 |
+- * per-request basis. Therefore, the service distribution |
11720 |
+- * enforced by the drive's internal scheduler is likely to |
11721 |
+- * coincide with the desired device-throughput distribution |
11722 |
+- * only in a completely symmetric scenario where: |
11723 |
++ * To introduce this case, we can note that allowing the drive |
11724 |
++ * to enqueue more than one request at a time, and hence |
11725 |
++ * delegating de facto final scheduling decisions to the |
11726 |
++ * drive's internal scheduler, entails loss of control on the |
11727 |
++ * actual request service order. In particular, the critical |
11728 |
++ * situation is when requests from different processes happen |
11729 |
++ * to be present, at the same time, in the internal queue(s) |
11730 |
++ * of the drive. In such a situation, the drive, by deciding |
11731 |
++ * the service order of the internally-queued requests, does |
11732 |
++ * determine also the actual throughput distribution among |
11733 |
++ * these processes. But the drive typically has no notion or |
11734 |
++ * concern about per-process throughput distribution, and |
11735 |
++ * makes its decisions only on a per-request basis. Therefore, |
11736 |
++ * the service distribution enforced by the drive's internal |
11737 |
++ * scheduler is likely to coincide with the desired |
11738 |
++ * device-throughput distribution only in a completely |
11739 |
++ * symmetric scenario where: |
11740 |
+ * (i) each of these processes must get the same throughput as |
11741 |
+ * the others; |
11742 |
+ * (ii) all these processes have the same I/O pattern |
11743 |
+@@ -2552,26 +3049,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
11744 |
+ * words, only if sub-condition (i) holds, then idling is |
11745 |
+ * allowed, and the device tends to be prevented from queueing |
11746 |
+ * many requests, possibly of several processes. The reason |
11747 |
+- * for not controlling also sub-condition (ii) is that, first, |
11748 |
+- * in the case of an HDD, the asymmetry in terms of types of |
11749 |
+- * I/O patterns is already taken in to account in the above |
11750 |
+- * sentinel variable |
11751 |
+- * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a |
11752 |
+- * flash-based device, we prefer however to privilege |
11753 |
+- * throughput (and idling lowers throughput for this type of |
11754 |
+- * devices), for the following reasons: |
11755 |
+- * 1) differently from HDDs, the service time of random |
11756 |
+- * requests is not orders of magnitudes lower than the service |
11757 |
+- * time of sequential requests; thus, even if processes doing |
11758 |
+- * sequential I/O get a preferential treatment with respect to |
11759 |
+- * others doing random I/O, the consequences are not as |
11760 |
+- * dramatic as with HDDs; |
11761 |
+- * 2) if a process doing random I/O does need strong |
11762 |
+- * throughput guarantees, it is hopefully already being |
11763 |
+- * weight-raised, or the user is likely to have assigned it a |
11764 |
+- * higher weight than the other processes (and thus |
11765 |
+- * sub-condition (i) is likely to be false, which triggers |
11766 |
+- * idling). |
11767 |
++ * for not controlling also sub-condition (ii) is that we |
11768 |
++ * exploit preemption to preserve guarantees in case of |
11769 |
++ * symmetric scenarios, even if (ii) does not hold, as |
11770 |
++ * explained in the next two paragraphs. |
11771 |
++ * |
11772 |
++ * Even if a queue, say Q, is expired when it remains idle, Q |
11773 |
++ * can still preempt the new in-service queue if the next |
11774 |
++ * request of Q arrives soon (see the comments on |
11775 |
++ * bfq_bfqq_update_budg_for_activation). If all queues and |
11776 |
++ * groups have the same weight, this form of preemption, |
11777 |
++ * combined with the hole-recovery heuristic described in the |
11778 |
++ * comments on function bfq_bfqq_update_budg_for_activation, |
11779 |
++ * is enough to preserve a correct bandwidth distribution in |
11780 |
++ * the mid term, even without idling. In fact, even if not |
11781 |
++ * idling allows the internal queues of the device to contain |
11782 |
++ * many requests, and thus to reorder requests, we can rather |
11783 |
++ * safely assume that the internal scheduler still preserves a |
11784 |
++ * minimum of mid-term fairness. The motivation for using |
11785 |
++ * preemption instead of idling is that, by not idling, |
11786 |
++ * service guarantees are preserved without even minimally |
11787 |
++ * sacrificing throughput. In other words, both a high |
11788 |
++ * throughput and its desired distribution are obtained. |
11789 |
++ * |
11790 |
++ * More precisely, this preemption-based, idleless approach |
11791 |
++ * provides fairness in terms of IOPS, and not sectors per |
11792 |
++ * second. This can be seen with a simple example. Suppose |
11793 |
++ * that there are two queues with the same weight, but that |
11794 |
++ * the first queue receives requests of 8 sectors, while the |
11795 |
++ * second queue receives requests of 1024 sectors. In |
11796 |
++ * addition, suppose that each of the two queues contains at |
11797 |
++ * most one request at a time, which implies that each queue |
11798 |
++ * always remains idle after it is served. Finally, after |
11799 |
++ * remaining idle, each queue receives very quickly a new |
11800 |
++ * request. It follows that the two queues are served |
11801 |
++ * alternately, preempting each other if needed. This |
11802 |
++ * implies that, although both queues have the same weight, |
11803 |
++ * the queue with large requests receives a service that is |
11804 |
++ * 1024/8 times as high as the service received by the other |
11805 |
++ * queue. |
11806 |
++ * |
11807 |
++ * On the other hand, device idling is performed, and thus |
11808 |
++ * pure sector-domain guarantees are provided, for the |
11809 |
++ * following queues, which are likely to need stronger |
11810 |
++ * throughput guarantees: weight-raised queues, and queues |
11811 |
++ * with a higher weight than other queues. When such queues |
11812 |
++ * are active, sub-condition (i) is false, which triggers |
11813 |
++ * device idling. |
11814 |
+ * |
11815 |
+ * According to the above considerations, the next variable is |
11816 |
+ * true (only) if sub-condition (i) holds. To compute the |
11817 |
+@@ -2579,7 +3103,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
11818 |
+ * the function bfq_symmetric_scenario(), but also check |
11819 |
+ * whether bfqq is being weight-raised, because |
11820 |
+ * bfq_symmetric_scenario() does not take into account also |
11821 |
+- * weight-raised queues (see comments to |
11822 |
++ * weight-raised queues (see comments on |
11823 |
+ * bfq_weights_tree_add()). |
11824 |
+ * |
11825 |
+ * As a side note, it is worth considering that the above |
11826 |
+@@ -2601,17 +3125,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
11827 |
+ * bfqq. Such a case is when bfqq became active in a burst of |
11828 |
+ * queue activations. Queues that became active during a large |
11829 |
+ * burst benefit only from throughput, as discussed in the |
11830 |
+- * comments to bfq_handle_burst. Thus, if bfqq became active |
11831 |
++ * comments on bfq_handle_burst. Thus, if bfqq became active |
11832 |
+ * in a burst and not idling the device maximizes throughput, |
11833 |
+ * then the device must no be idled, because not idling the |
11834 |
+ * device provides bfqq and all other queues in the burst with |
11835 |
+- * maximum benefit. Combining this and the two cases above, we |
11836 |
+- * can now establish when idling is actually needed to |
11837 |
+- * preserve service guarantees. |
11838 |
++ * maximum benefit. Combining this and the above case, we can |
11839 |
++ * now establish when idling is actually needed to preserve |
11840 |
++ * service guarantees. |
11841 |
+ */ |
11842 |
+ idling_needed_for_service_guarantees = |
11843 |
+- (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && |
11844 |
+- !bfq_bfqq_in_large_burst(bfqq); |
11845 |
++ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); |
11846 |
+ |
11847 |
+ /* |
11848 |
+ * We have now all the components we need to compute the return |
11849 |
+@@ -2621,6 +3144,14 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
11850 |
+ * 2) idling either boosts the throughput (without issues), or |
11851 |
+ * is necessary to preserve service guarantees. |
11852 |
+ */ |
11853 |
++ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d " |
11854 |
++ "wr_busy %d boosts %d IO-bound %d guar %d", |
11855 |
++ bfq_bfqq_sync(bfqq), idling_boosts_thr, |
11856 |
++ bfqd->wr_busy_queues, |
11857 |
++ idling_boosts_thr_without_issues, |
11858 |
++ bfq_bfqq_IO_bound(bfqq), |
11859 |
++ idling_needed_for_service_guarantees); |
11860 |
++ |
11861 |
+ return bfq_bfqq_sync(bfqq) && |
11862 |
+ (idling_boosts_thr_without_issues || |
11863 |
+ idling_needed_for_service_guarantees); |
11864 |
+@@ -2632,7 +3163,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
11865 |
+ * 1) the queue must remain in service and cannot be expired, and |
11866 |
+ * 2) the device must be idled to wait for the possible arrival of a new |
11867 |
+ * request for the queue. |
11868 |
+- * See the comments to the function bfq_bfqq_may_idle for the reasons |
11869 |
++ * See the comments on the function bfq_bfqq_may_idle for the reasons |
11870 |
+ * why performing device idling is the best choice to boost the throughput |
11871 |
+ * and preserve service guarantees when bfq_bfqq_may_idle itself |
11872 |
+ * returns true. |
11873 |
+@@ -2698,9 +3229,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
11874 |
+ */ |
11875 |
+ bfq_clear_bfqq_wait_request(bfqq); |
11876 |
+ del_timer(&bfqd->idle_slice_timer); |
11877 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
11878 |
+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); |
11879 |
+-#endif |
11880 |
+ } |
11881 |
+ goto keep_queue; |
11882 |
+ } |
11883 |
+@@ -2745,14 +3274,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
11884 |
+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); |
11885 |
+ |
11886 |
+ /* |
11887 |
+- * If the queue was activated in a burst, or |
11888 |
+- * too much time has elapsed from the beginning |
11889 |
+- * of this weight-raising period, or the queue has |
11890 |
+- * exceeded the acceptable number of cooperations, |
11891 |
+- * then end weight raising. |
11892 |
++ * If the queue was activated in a burst, or too much |
11893 |
++ * time has elapsed from the beginning of this |
11894 |
++ * weight-raising period, then end weight raising. |
11895 |
+ */ |
11896 |
+ if (bfq_bfqq_in_large_burst(bfqq) || |
11897 |
+- bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || |
11898 |
+ time_is_before_jiffies(bfqq->last_wr_start_finish + |
11899 |
+ bfqq->wr_cur_max_time)) { |
11900 |
+ bfqq->last_wr_start_finish = jiffies; |
11901 |
+@@ -2814,10 +3340,25 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, |
11902 |
+ goto expire; |
11903 |
+ } |
11904 |
+ |
11905 |
++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); |
11906 |
+ /* Finally, insert request into driver dispatch list. */ |
11907 |
+ bfq_bfqq_served(bfqq, service_to_charge); |
11908 |
++ |
11909 |
++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); |
11910 |
++ |
11911 |
+ bfq_dispatch_insert(bfqd->queue, rq); |
11912 |
+ |
11913 |
++ /* |
11914 |
++ * If weight raising has to terminate for bfqq, then next |
11915 |
++ * function causes an immediate update of bfqq's weight, |
11916 |
++ * without waiting for next activation. As a consequence, on |
11917 |
++ * expiration, bfqq will be timestamped as if it has never been |
11918 |
++ * weight-raised during this service slot, even if it has |
11919 |
++ * received part or even most of the service as a |
11920 |
++ * weight-raised queue. This inflates bfqq's timestamps, which |
11921 |
++ * is beneficial, as bfqq is then more willing to leave the |
11922 |
++ * device immediately to possible other weight-raised queues. |
11923 |
++ */ |
11924 |
+ bfq_update_wr_data(bfqd, bfqq); |
11925 |
+ |
11926 |
+ bfq_log_bfqq(bfqd, bfqq, |
11927 |
+@@ -2833,9 +3374,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, |
11928 |
+ bfqd->in_service_bic = RQ_BIC(rq); |
11929 |
+ } |
11930 |
+ |
11931 |
+- if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && |
11932 |
+- dispatched >= bfqd->bfq_max_budget_async_rq) || |
11933 |
+- bfq_class_idle(bfqq))) |
11934 |
++ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) |
11935 |
+ goto expire; |
11936 |
+ |
11937 |
+ return dispatched; |
11938 |
+@@ -2881,8 +3420,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) |
11939 |
+ st = bfq_entity_service_tree(&bfqq->entity); |
11940 |
+ |
11941 |
+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); |
11942 |
+- bfqq->max_budget = bfq_max_budget(bfqd); |
11943 |
+ |
11944 |
++ bfqq->max_budget = bfq_max_budget(bfqd); |
11945 |
+ bfq_forget_idle(st); |
11946 |
+ } |
11947 |
+ |
11948 |
+@@ -2895,9 +3434,9 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) |
11949 |
+ { |
11950 |
+ struct bfq_data *bfqd = q->elevator->elevator_data; |
11951 |
+ struct bfq_queue *bfqq; |
11952 |
+- int max_dispatch; |
11953 |
+ |
11954 |
+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); |
11955 |
++ |
11956 |
+ if (bfqd->busy_queues == 0) |
11957 |
+ return 0; |
11958 |
+ |
11959 |
+@@ -2908,21 +3447,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) |
11960 |
+ if (!bfqq) |
11961 |
+ return 0; |
11962 |
+ |
11963 |
+- if (bfq_class_idle(bfqq)) |
11964 |
+- max_dispatch = 1; |
11965 |
+- |
11966 |
+- if (!bfq_bfqq_sync(bfqq)) |
11967 |
+- max_dispatch = bfqd->bfq_max_budget_async_rq; |
11968 |
+- |
11969 |
+- if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { |
11970 |
+- if (bfqd->busy_queues > 1) |
11971 |
+- return 0; |
11972 |
+- if (bfqq->dispatched >= 4 * max_dispatch) |
11973 |
+- return 0; |
11974 |
+- } |
11975 |
+- |
11976 |
+- if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) |
11977 |
+- return 0; |
11978 |
++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); |
11979 |
+ |
11980 |
+ bfq_clear_bfqq_wait_request(bfqq); |
11981 |
+ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
11982 |
+@@ -2933,6 +3458,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) |
11983 |
+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", |
11984 |
+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); |
11985 |
+ |
11986 |
++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); |
11987 |
+ return 1; |
11988 |
+ } |
11989 |
+ |
11990 |
+@@ -2949,11 +3475,11 @@ static void bfq_put_queue(struct bfq_queue *bfqq) |
11991 |
+ struct bfq_group *bfqg = bfqq_group(bfqq); |
11992 |
+ #endif |
11993 |
+ |
11994 |
+- BUG_ON(atomic_read(&bfqq->ref) <= 0); |
11995 |
++ BUG_ON(bfqq->ref <= 0); |
11996 |
+ |
11997 |
+- bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, |
11998 |
+- atomic_read(&bfqq->ref)); |
11999 |
+- if (!atomic_dec_and_test(&bfqq->ref)) |
12000 |
++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); |
12001 |
++ bfqq->ref--; |
12002 |
++ if (bfqq->ref) |
12003 |
+ return; |
12004 |
+ |
12005 |
+ BUG_ON(rb_first(&bfqq->sort_list)); |
12006 |
+@@ -3007,8 +3533,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
12007 |
+ bfq_schedule_dispatch(bfqd); |
12008 |
+ } |
12009 |
+ |
12010 |
+- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, |
12011 |
+- atomic_read(&bfqq->ref)); |
12012 |
++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); |
12013 |
+ |
12014 |
+ bfq_put_cooperator(bfqq); |
12015 |
+ |
12016 |
+@@ -3019,26 +3544,7 @@ static void bfq_init_icq(struct io_cq *icq) |
12017 |
+ { |
12018 |
+ struct bfq_io_cq *bic = icq_to_bic(icq); |
12019 |
+ |
12020 |
+- bic->ttime.last_end_request = jiffies; |
12021 |
+- /* |
12022 |
+- * A newly created bic indicates that the process has just |
12023 |
+- * started doing I/O, and is probably mapping into memory its |
12024 |
+- * executable and libraries: it definitely needs weight raising. |
12025 |
+- * There is however the possibility that the process performs, |
12026 |
+- * for a while, I/O close to some other process. EQM intercepts |
12027 |
+- * this behavior and may merge the queue corresponding to the |
12028 |
+- * process with some other queue, BEFORE the weight of the queue |
12029 |
+- * is raised. Merged queues are not weight-raised (they are assumed |
12030 |
+- * to belong to processes that benefit only from high throughput). |
12031 |
+- * If the merge is basically the consequence of an accident, then |
12032 |
+- * the queue will be split soon and will get back its old weight. |
12033 |
+- * It is then important to write down somewhere that this queue |
12034 |
+- * does need weight raising, even if it did not make it to get its |
12035 |
+- * weight raised before being merged. To this purpose, we overload |
12036 |
+- * the field raising_time_left and assign 1 to it, to mark the queue |
12037 |
+- * as needing weight raising. |
12038 |
+- */ |
12039 |
+- bic->wr_time_left = 1; |
12040 |
++ bic->ttime.last_end_request = bfq_smallest_from_now(); |
12041 |
+ } |
12042 |
+ |
12043 |
+ static void bfq_exit_icq(struct io_cq *icq) |
12044 |
+@@ -3046,21 +3552,21 @@ static void bfq_exit_icq(struct io_cq *icq) |
12045 |
+ struct bfq_io_cq *bic = icq_to_bic(icq); |
12046 |
+ struct bfq_data *bfqd = bic_to_bfqd(bic); |
12047 |
+ |
12048 |
+- if (bic->bfqq[BLK_RW_ASYNC]) { |
12049 |
+- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); |
12050 |
+- bic->bfqq[BLK_RW_ASYNC] = NULL; |
12051 |
++ if (bic_to_bfqq(bic, false)) { |
12052 |
++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); |
12053 |
++ bic_set_bfqq(bic, NULL, false); |
12054 |
+ } |
12055 |
+ |
12056 |
+- if (bic->bfqq[BLK_RW_SYNC]) { |
12057 |
++ if (bic_to_bfqq(bic, true)) { |
12058 |
+ /* |
12059 |
+ * If the bic is using a shared queue, put the reference |
12060 |
+ * taken on the io_context when the bic started using a |
12061 |
+ * shared bfq_queue. |
12062 |
+ */ |
12063 |
+- if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) |
12064 |
++ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) |
12065 |
+ put_io_context(icq->ioc); |
12066 |
+- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); |
12067 |
+- bic->bfqq[BLK_RW_SYNC] = NULL; |
12068 |
++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); |
12069 |
++ bic_set_bfqq(bic, NULL, true); |
12070 |
+ } |
12071 |
+ } |
12072 |
+ |
12073 |
+@@ -3068,7 +3574,8 @@ static void bfq_exit_icq(struct io_cq *icq) |
12074 |
+ * Update the entity prio values; note that the new values will not |
12075 |
+ * be used until the next (re)activation. |
12076 |
+ */ |
12077 |
+-static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
12078 |
++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, |
12079 |
++ struct bfq_io_cq *bic) |
12080 |
+ { |
12081 |
+ struct task_struct *tsk = current; |
12082 |
+ int ioprio_class; |
12083 |
+@@ -3100,7 +3607,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b |
12084 |
+ break; |
12085 |
+ } |
12086 |
+ |
12087 |
+- if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { |
12088 |
++ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { |
12089 |
+ printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", |
12090 |
+ bfqq->new_ioprio); |
12091 |
+ BUG(); |
12092 |
+@@ -3108,45 +3615,40 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *b |
12093 |
+ |
12094 |
+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); |
12095 |
+ bfqq->entity.prio_changed = 1; |
12096 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
12097 |
++ "set_next_ioprio_data: bic_class %d prio %d class %d", |
12098 |
++ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); |
12099 |
+ } |
12100 |
+ |
12101 |
+ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) |
12102 |
+ { |
12103 |
+- struct bfq_data *bfqd; |
12104 |
+- struct bfq_queue *bfqq, *new_bfqq; |
12105 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
12106 |
++ struct bfq_queue *bfqq; |
12107 |
+ unsigned long uninitialized_var(flags); |
12108 |
+ int ioprio = bic->icq.ioc->ioprio; |
12109 |
+ |
12110 |
+- bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), |
12111 |
+- &flags); |
12112 |
+ /* |
12113 |
+ * This condition may trigger on a newly created bic, be sure to |
12114 |
+ * drop the lock before returning. |
12115 |
+ */ |
12116 |
+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) |
12117 |
+- goto out; |
12118 |
++ return; |
12119 |
+ |
12120 |
+ bic->ioprio = ioprio; |
12121 |
+ |
12122 |
+- bfqq = bic->bfqq[BLK_RW_ASYNC]; |
12123 |
++ bfqq = bic_to_bfqq(bic, false); |
12124 |
+ if (bfqq) { |
12125 |
+- new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, |
12126 |
+- GFP_ATOMIC); |
12127 |
+- if (new_bfqq) { |
12128 |
+- bic->bfqq[BLK_RW_ASYNC] = new_bfqq; |
12129 |
+- bfq_log_bfqq(bfqd, bfqq, |
12130 |
+- "check_ioprio_change: bfqq %p %d", |
12131 |
+- bfqq, atomic_read(&bfqq->ref)); |
12132 |
+- bfq_put_queue(bfqq); |
12133 |
+- } |
12134 |
++ bfq_put_queue(bfqq); |
12135 |
++ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); |
12136 |
++ bic_set_bfqq(bic, bfqq, false); |
12137 |
++ bfq_log_bfqq(bfqd, bfqq, |
12138 |
++ "check_ioprio_change: bfqq %p %d", |
12139 |
++ bfqq, bfqq->ref); |
12140 |
+ } |
12141 |
+ |
12142 |
+- bfqq = bic->bfqq[BLK_RW_SYNC]; |
12143 |
++ bfqq = bic_to_bfqq(bic, true); |
12144 |
+ if (bfqq) |
12145 |
+ bfq_set_next_ioprio_data(bfqq, bic); |
12146 |
+- |
12147 |
+-out: |
12148 |
+- bfq_put_bfqd_unlock(bfqd, &flags); |
12149 |
+ } |
12150 |
+ |
12151 |
+ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12152 |
+@@ -3155,8 +3657,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12153 |
+ RB_CLEAR_NODE(&bfqq->entity.rb_node); |
12154 |
+ INIT_LIST_HEAD(&bfqq->fifo); |
12155 |
+ INIT_HLIST_NODE(&bfqq->burst_list_node); |
12156 |
++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); |
12157 |
+ |
12158 |
+- atomic_set(&bfqq->ref, 0); |
12159 |
++ bfqq->ref = 0; |
12160 |
+ bfqq->bfqd = bfqd; |
12161 |
+ |
12162 |
+ if (bic) |
12163 |
+@@ -3166,6 +3669,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12164 |
+ if (!bfq_class_idle(bfqq)) |
12165 |
+ bfq_mark_bfqq_idle_window(bfqq); |
12166 |
+ bfq_mark_bfqq_sync(bfqq); |
12167 |
++ bfq_mark_bfqq_just_created(bfqq); |
12168 |
+ } else |
12169 |
+ bfq_clear_bfqq_sync(bfqq); |
12170 |
+ bfq_mark_bfqq_IO_bound(bfqq); |
12171 |
+@@ -3175,72 +3679,17 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12172 |
+ bfqq->pid = pid; |
12173 |
+ |
12174 |
+ bfqq->wr_coeff = 1; |
12175 |
+- bfqq->last_wr_start_finish = 0; |
12176 |
++ bfqq->last_wr_start_finish = bfq_smallest_from_now(); |
12177 |
++ bfqq->budget_timeout = bfq_smallest_from_now(); |
12178 |
++ bfqq->split_time = bfq_smallest_from_now(); |
12179 |
+ /* |
12180 |
+ * Set to the value for which bfqq will not be deemed as |
12181 |
+ * soft rt when it becomes backlogged. |
12182 |
+ */ |
12183 |
+- bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); |
12184 |
+-} |
12185 |
+- |
12186 |
+-static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, |
12187 |
+- struct bio *bio, int is_sync, |
12188 |
+- struct bfq_io_cq *bic, |
12189 |
+- gfp_t gfp_mask) |
12190 |
+-{ |
12191 |
+- struct bfq_group *bfqg; |
12192 |
+- struct bfq_queue *bfqq, *new_bfqq = NULL; |
12193 |
+- struct blkcg *blkcg; |
12194 |
+- |
12195 |
+-retry: |
12196 |
+- rcu_read_lock(); |
12197 |
+- |
12198 |
+- blkcg = bio_blkcg(bio); |
12199 |
+- bfqg = bfq_find_alloc_group(bfqd, blkcg); |
12200 |
+- /* bic always exists here */ |
12201 |
+- bfqq = bic_to_bfqq(bic, is_sync); |
12202 |
+- |
12203 |
+- /* |
12204 |
+- * Always try a new alloc if we fall back to the OOM bfqq |
12205 |
+- * originally, since it should just be a temporary situation. |
12206 |
+- */ |
12207 |
+- if (!bfqq || bfqq == &bfqd->oom_bfqq) { |
12208 |
+- bfqq = NULL; |
12209 |
+- if (new_bfqq) { |
12210 |
+- bfqq = new_bfqq; |
12211 |
+- new_bfqq = NULL; |
12212 |
+- } else if (gfpflags_allow_blocking(gfp_mask)) { |
12213 |
+- rcu_read_unlock(); |
12214 |
+- spin_unlock_irq(bfqd->queue->queue_lock); |
12215 |
+- new_bfqq = kmem_cache_alloc_node(bfq_pool, |
12216 |
+- gfp_mask | __GFP_ZERO, |
12217 |
+- bfqd->queue->node); |
12218 |
+- spin_lock_irq(bfqd->queue->queue_lock); |
12219 |
+- if (new_bfqq) |
12220 |
+- goto retry; |
12221 |
+- } else { |
12222 |
+- bfqq = kmem_cache_alloc_node(bfq_pool, |
12223 |
+- gfp_mask | __GFP_ZERO, |
12224 |
+- bfqd->queue->node); |
12225 |
+- } |
12226 |
+- |
12227 |
+- if (bfqq) { |
12228 |
+- bfq_init_bfqq(bfqd, bfqq, bic, current->pid, |
12229 |
+- is_sync); |
12230 |
+- bfq_init_entity(&bfqq->entity, bfqg); |
12231 |
+- bfq_log_bfqq(bfqd, bfqq, "allocated"); |
12232 |
+- } else { |
12233 |
+- bfqq = &bfqd->oom_bfqq; |
12234 |
+- bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); |
12235 |
+- } |
12236 |
+- } |
12237 |
+- |
12238 |
+- if (new_bfqq) |
12239 |
+- kmem_cache_free(bfq_pool, new_bfqq); |
12240 |
++ bfqq->soft_rt_next_start = bfq_greatest_from_now(); |
12241 |
+ |
12242 |
+- rcu_read_unlock(); |
12243 |
+- |
12244 |
+- return bfqq; |
12245 |
++ /* first request is almost certainly seeky */ |
12246 |
++ bfqq->seek_history = 1; |
12247 |
+ } |
12248 |
+ |
12249 |
+ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, |
12250 |
+@@ -3263,44 +3712,56 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, |
12251 |
+ } |
12252 |
+ |
12253 |
+ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
12254 |
+- struct bio *bio, int is_sync, |
12255 |
+- struct bfq_io_cq *bic, gfp_t gfp_mask) |
12256 |
++ struct bio *bio, bool is_sync, |
12257 |
++ struct bfq_io_cq *bic) |
12258 |
+ { |
12259 |
+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
12260 |
+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
12261 |
+ struct bfq_queue **async_bfqq = NULL; |
12262 |
+- struct bfq_queue *bfqq = NULL; |
12263 |
++ struct bfq_queue *bfqq; |
12264 |
++ struct bfq_group *bfqg; |
12265 |
+ |
12266 |
+- if (!is_sync) { |
12267 |
+- struct blkcg *blkcg; |
12268 |
+- struct bfq_group *bfqg; |
12269 |
++ rcu_read_lock(); |
12270 |
+ |
12271 |
+- rcu_read_lock(); |
12272 |
+- blkcg = bio_blkcg(bio); |
12273 |
+- rcu_read_unlock(); |
12274 |
+- bfqg = bfq_find_alloc_group(bfqd, blkcg); |
12275 |
++ bfqg = bfq_find_alloc_group(bfqd,bio_blkcg(bio)); |
12276 |
++ |
12277 |
++ if (!is_sync) { |
12278 |
+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, |
12279 |
+ ioprio); |
12280 |
+ bfqq = *async_bfqq; |
12281 |
++ if (bfqq) |
12282 |
++ goto out; |
12283 |
+ } |
12284 |
+ |
12285 |
+- if (!bfqq) |
12286 |
+- bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); |
12287 |
++ bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO, |
12288 |
++ bfqd->queue->node); |
12289 |
++ |
12290 |
++ if (bfqq) { |
12291 |
++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, |
12292 |
++ is_sync); |
12293 |
++ bfq_init_entity(&bfqq->entity, bfqg); |
12294 |
++ bfq_log_bfqq(bfqd, bfqq, "allocated"); |
12295 |
++ } else { |
12296 |
++ bfqq = &bfqd->oom_bfqq; |
12297 |
++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); |
12298 |
++ goto out; |
12299 |
++ } |
12300 |
+ |
12301 |
+ /* |
12302 |
+ * Pin the queue now that it's allocated, scheduler exit will |
12303 |
+ * prune it. |
12304 |
+ */ |
12305 |
+- if (!is_sync && !(*async_bfqq)) { |
12306 |
+- atomic_inc(&bfqq->ref); |
12307 |
++ if (async_bfqq) { |
12308 |
++ bfqq->ref++; |
12309 |
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", |
12310 |
+- bfqq, atomic_read(&bfqq->ref)); |
12311 |
++ bfqq, bfqq->ref); |
12312 |
+ *async_bfqq = bfqq; |
12313 |
+ } |
12314 |
+ |
12315 |
+- atomic_inc(&bfqq->ref); |
12316 |
+- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, |
12317 |
+- atomic_read(&bfqq->ref)); |
12318 |
++out: |
12319 |
++ bfqq->ref++; |
12320 |
++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); |
12321 |
++ rcu_read_unlock(); |
12322 |
+ return bfqq; |
12323 |
+ } |
12324 |
+ |
12325 |
+@@ -3316,37 +3777,21 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, |
12326 |
+ bic->ttime.ttime_samples; |
12327 |
+ } |
12328 |
+ |
12329 |
+-static void bfq_update_io_seektime(struct bfq_data *bfqd, |
12330 |
+- struct bfq_queue *bfqq, |
12331 |
+- struct request *rq) |
12332 |
+-{ |
12333 |
+- sector_t sdist; |
12334 |
+- u64 total; |
12335 |
+- |
12336 |
+- if (bfqq->last_request_pos < blk_rq_pos(rq)) |
12337 |
+- sdist = blk_rq_pos(rq) - bfqq->last_request_pos; |
12338 |
+- else |
12339 |
+- sdist = bfqq->last_request_pos - blk_rq_pos(rq); |
12340 |
+- |
12341 |
+- /* |
12342 |
+- * Don't allow the seek distance to get too large from the |
12343 |
+- * odd fragment, pagein, etc. |
12344 |
+- */ |
12345 |
+- if (bfqq->seek_samples == 0) /* first request, not really a seek */ |
12346 |
+- sdist = 0; |
12347 |
+- else if (bfqq->seek_samples <= 60) /* second & third seek */ |
12348 |
+- sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); |
12349 |
+- else |
12350 |
+- sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); |
12351 |
+ |
12352 |
+- bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; |
12353 |
+- bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; |
12354 |
+- total = bfqq->seek_total + (bfqq->seek_samples/2); |
12355 |
+- do_div(total, bfqq->seek_samples); |
12356 |
+- bfqq->seek_mean = (sector_t)total; |
12357 |
++static void |
12358 |
++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12359 |
++ struct request *rq) |
12360 |
++{ |
12361 |
++ sector_t sdist = 0; |
12362 |
++ if (bfqq->last_request_pos) { |
12363 |
++ if (bfqq->last_request_pos < blk_rq_pos(rq)) |
12364 |
++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; |
12365 |
++ else |
12366 |
++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); |
12367 |
++ } |
12368 |
+ |
12369 |
+- bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, |
12370 |
+- (u64)bfqq->seek_mean); |
12371 |
++ bfqq->seek_history <<= 1; |
12372 |
++ bfqq->seek_history |= (sdist > BFQQ_SEEK_THR); |
12373 |
+ } |
12374 |
+ |
12375 |
+ /* |
12376 |
+@@ -3364,7 +3809,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, |
12377 |
+ return; |
12378 |
+ |
12379 |
+ /* Idle window just restored, statistics are meaningless. */ |
12380 |
+- if (bfq_bfqq_just_split(bfqq)) |
12381 |
++ if (time_is_after_eq_jiffies(bfqq->split_time + |
12382 |
++ bfqd->bfq_wr_min_idle_time)) |
12383 |
+ return; |
12384 |
+ |
12385 |
+ enable_idle = bfq_bfqq_idle_window(bfqq); |
12386 |
+@@ -3404,22 +3850,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12387 |
+ |
12388 |
+ bfq_update_io_thinktime(bfqd, bic); |
12389 |
+ bfq_update_io_seektime(bfqd, bfqq, rq); |
12390 |
+- if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { |
12391 |
+- bfq_clear_bfqq_constantly_seeky(bfqq); |
12392 |
+- if (!blk_queue_nonrot(bfqd->queue)) { |
12393 |
+- BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); |
12394 |
+- bfqd->const_seeky_busy_in_flight_queues--; |
12395 |
+- } |
12396 |
+- } |
12397 |
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
12398 |
+ !BFQQ_SEEKY(bfqq)) |
12399 |
+ bfq_update_idle_window(bfqd, bfqq, bic); |
12400 |
+- bfq_clear_bfqq_just_split(bfqq); |
12401 |
+ |
12402 |
+ bfq_log_bfqq(bfqd, bfqq, |
12403 |
+- "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", |
12404 |
+- bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), |
12405 |
+- (long long unsigned)bfqq->seek_mean); |
12406 |
++ "rq_enqueued: idle_window=%d (seeky %d)", |
12407 |
++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); |
12408 |
+ |
12409 |
+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); |
12410 |
+ |
12411 |
+@@ -3433,14 +3870,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12412 |
+ * is small and the queue is not to be expired, then |
12413 |
+ * just exit. |
12414 |
+ * |
12415 |
+- * In this way, if the disk is being idled to wait for |
12416 |
+- * a new request from the in-service queue, we avoid |
12417 |
+- * unplugging the device and committing the disk to serve |
12418 |
+- * just a small request. On the contrary, we wait for |
12419 |
+- * the block layer to decide when to unplug the device: |
12420 |
+- * hopefully, new requests will be merged to this one |
12421 |
+- * quickly, then the device will be unplugged and |
12422 |
+- * larger requests will be dispatched. |
12423 |
++ * In this way, if the device is being idled to wait |
12424 |
++ * for a new request from the in-service queue, we |
12425 |
++ * avoid unplugging the device and committing the |
12426 |
++ * device to serve just a small request. On the |
12427 |
++ * contrary, we wait for the block layer to decide |
12428 |
++ * when to unplug the device: hopefully, new requests |
12429 |
++ * will be merged to this one quickly, then the device |
12430 |
++ * will be unplugged and larger requests will be |
12431 |
++ * dispatched. |
12432 |
+ */ |
12433 |
+ if (small_req && !budget_timeout) |
12434 |
+ return; |
12435 |
+@@ -3453,9 +3891,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12436 |
+ */ |
12437 |
+ bfq_clear_bfqq_wait_request(bfqq); |
12438 |
+ del_timer(&bfqd->idle_slice_timer); |
12439 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
12440 |
+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); |
12441 |
+-#endif |
12442 |
+ |
12443 |
+ /* |
12444 |
+ * The queue is not empty, because a new request just |
12445 |
+@@ -3499,27 +3935,19 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) |
12446 |
+ */ |
12447 |
+ new_bfqq->allocated[rq_data_dir(rq)]++; |
12448 |
+ bfqq->allocated[rq_data_dir(rq)]--; |
12449 |
+- atomic_inc(&new_bfqq->ref); |
12450 |
++ new_bfqq->ref++; |
12451 |
++ bfq_clear_bfqq_just_created(bfqq); |
12452 |
+ bfq_put_queue(bfqq); |
12453 |
+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) |
12454 |
+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), |
12455 |
+ bfqq, new_bfqq); |
12456 |
+ rq->elv.priv[1] = new_bfqq; |
12457 |
+ bfqq = new_bfqq; |
12458 |
+- } else |
12459 |
+- bfq_bfqq_increase_failed_cooperations(bfqq); |
12460 |
++ } |
12461 |
+ } |
12462 |
+ |
12463 |
+ bfq_add_request(rq); |
12464 |
+ |
12465 |
+- /* |
12466 |
+- * Here a newly-created bfq_queue has already started a weight-raising |
12467 |
+- * period: clear raising_time_left to prevent bfq_bfqq_save_state() |
12468 |
+- * from assigning it a full weight-raising period. See the detailed |
12469 |
+- * comments about this field in bfq_init_icq(). |
12470 |
+- */ |
12471 |
+- if (bfqq->bic) |
12472 |
+- bfqq->bic->wr_time_left = 0; |
12473 |
+ rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; |
12474 |
+ list_add_tail(&rq->queuelist, &bfqq->fifo); |
12475 |
+ |
12476 |
+@@ -3528,8 +3956,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) |
12477 |
+ |
12478 |
+ static void bfq_update_hw_tag(struct bfq_data *bfqd) |
12479 |
+ { |
12480 |
+- bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, |
12481 |
+- bfqd->rq_in_driver); |
12482 |
++ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, |
12483 |
++ bfqd->rq_in_driver); |
12484 |
+ |
12485 |
+ if (bfqd->hw_tag == 1) |
12486 |
+ return; |
12487 |
+@@ -3560,43 +3988,41 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) |
12488 |
+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", |
12489 |
+ blk_rq_sectors(rq), sync); |
12490 |
+ |
12491 |
++ assert_spin_locked(bfqd->queue->queue_lock); |
12492 |
+ bfq_update_hw_tag(bfqd); |
12493 |
+ |
12494 |
+ BUG_ON(!bfqd->rq_in_driver); |
12495 |
+ BUG_ON(!bfqq->dispatched); |
12496 |
+ bfqd->rq_in_driver--; |
12497 |
+ bfqq->dispatched--; |
12498 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
12499 |
+ bfqg_stats_update_completion(bfqq_group(bfqq), |
12500 |
+ rq_start_time_ns(rq), |
12501 |
+ rq_io_start_time_ns(rq), rq->cmd_flags); |
12502 |
+-#endif |
12503 |
+ |
12504 |
+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { |
12505 |
++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
12506 |
++ /* |
12507 |
++ * Set budget_timeout (which we overload to store the |
12508 |
++ * time at which the queue remains with no backlog and |
12509 |
++ * no outstanding request; used by the weight-raising |
12510 |
++ * mechanism). |
12511 |
++ */ |
12512 |
++ bfqq->budget_timeout = jiffies; |
12513 |
++ |
12514 |
+ bfq_weights_tree_remove(bfqd, &bfqq->entity, |
12515 |
+ &bfqd->queue_weights_tree); |
12516 |
+- if (!blk_queue_nonrot(bfqd->queue)) { |
12517 |
+- BUG_ON(!bfqd->busy_in_flight_queues); |
12518 |
+- bfqd->busy_in_flight_queues--; |
12519 |
+- if (bfq_bfqq_constantly_seeky(bfqq)) { |
12520 |
+- BUG_ON(!bfqd-> |
12521 |
+- const_seeky_busy_in_flight_queues); |
12522 |
+- bfqd->const_seeky_busy_in_flight_queues--; |
12523 |
+- } |
12524 |
+- } |
12525 |
+ } |
12526 |
+ |
12527 |
+- if (sync) { |
12528 |
+- bfqd->sync_flight--; |
12529 |
+- RQ_BIC(rq)->ttime.last_end_request = jiffies; |
12530 |
+- } |
12531 |
++ RQ_BIC(rq)->ttime.last_end_request = jiffies; |
12532 |
+ |
12533 |
+ /* |
12534 |
+- * If we are waiting to discover whether the request pattern of the |
12535 |
+- * task associated with the queue is actually isochronous, and |
12536 |
+- * both requisites for this condition to hold are satisfied, then |
12537 |
+- * compute soft_rt_next_start (see the comments to the function |
12538 |
+- * bfq_bfqq_softrt_next_start()). |
12539 |
++ * If we are waiting to discover whether the request pattern |
12540 |
++ * of the task associated with the queue is actually |
12541 |
++ * isochronous, and both requisites for this condition to hold |
12542 |
++ * are now satisfied, then compute soft_rt_next_start (see the |
12543 |
++ * comments on the function bfq_bfqq_softrt_next_start()). We |
12544 |
++ * schedule this delayed check when bfqq expires, if it still |
12545 |
++ * has in-flight requests. |
12546 |
+ */ |
12547 |
+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && |
12548 |
+ RB_EMPTY_ROOT(&bfqq->sort_list)) |
12549 |
+@@ -3608,10 +4034,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) |
12550 |
+ * or if we want to idle in case it has no pending requests. |
12551 |
+ */ |
12552 |
+ if (bfqd->in_service_queue == bfqq) { |
12553 |
+- if (bfq_bfqq_budget_new(bfqq)) |
12554 |
+- bfq_set_budget_timeout(bfqd); |
12555 |
+- |
12556 |
+- if (bfq_bfqq_must_idle(bfqq)) { |
12557 |
++ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { |
12558 |
+ bfq_arm_slice_timer(bfqd); |
12559 |
+ goto out; |
12560 |
+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) |
12561 |
+@@ -3682,14 +4105,14 @@ static void bfq_put_request(struct request *rq) |
12562 |
+ rq->elv.priv[1] = NULL; |
12563 |
+ |
12564 |
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", |
12565 |
+- bfqq, atomic_read(&bfqq->ref)); |
12566 |
++ bfqq, bfqq->ref); |
12567 |
+ bfq_put_queue(bfqq); |
12568 |
+ } |
12569 |
+ } |
12570 |
+ |
12571 |
+ /* |
12572 |
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this |
12573 |
+- * was the last process referring to said bfqq. |
12574 |
++ * was the last process referring to that bfqq. |
12575 |
+ */ |
12576 |
+ static struct bfq_queue * |
12577 |
+ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) |
12578 |
+@@ -3727,11 +4150,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, |
12579 |
+ unsigned long flags; |
12580 |
+ bool split = false; |
12581 |
+ |
12582 |
+- might_sleep_if(gfpflags_allow_blocking(gfp_mask)); |
12583 |
+- |
12584 |
+- bfq_check_ioprio_change(bic, bio); |
12585 |
+- |
12586 |
+ spin_lock_irqsave(q->queue_lock, flags); |
12587 |
++ bfq_check_ioprio_change(bic, bio); |
12588 |
+ |
12589 |
+ if (!bic) |
12590 |
+ goto queue_fail; |
12591 |
+@@ -3741,23 +4161,47 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, |
12592 |
+ new_queue: |
12593 |
+ bfqq = bic_to_bfqq(bic, is_sync); |
12594 |
+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { |
12595 |
+- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); |
12596 |
++ if (bfqq) |
12597 |
++ bfq_put_queue(bfqq); |
12598 |
++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); |
12599 |
++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); |
12600 |
++ |
12601 |
+ bic_set_bfqq(bic, bfqq, is_sync); |
12602 |
+ if (split && is_sync) { |
12603 |
++ bfq_log_bfqq(bfqd, bfqq, |
12604 |
++ "set_request: was_in_list %d " |
12605 |
++ "was_in_large_burst %d " |
12606 |
++ "large burst in progress %d", |
12607 |
++ bic->was_in_burst_list, |
12608 |
++ bic->saved_in_large_burst, |
12609 |
++ bfqd->large_burst); |
12610 |
++ |
12611 |
+ if ((bic->was_in_burst_list && bfqd->large_burst) || |
12612 |
+- bic->saved_in_large_burst) |
12613 |
++ bic->saved_in_large_burst) { |
12614 |
++ bfq_log_bfqq(bfqd, bfqq, |
12615 |
++ "set_request: marking in " |
12616 |
++ "large burst"); |
12617 |
+ bfq_mark_bfqq_in_large_burst(bfqq); |
12618 |
+- else { |
12619 |
+- bfq_clear_bfqq_in_large_burst(bfqq); |
12620 |
+- if (bic->was_in_burst_list) |
12621 |
+- hlist_add_head(&bfqq->burst_list_node, |
12622 |
+- &bfqd->burst_list); |
12623 |
++ } else { |
12624 |
++ bfq_log_bfqq(bfqd, bfqq, |
12625 |
++ "set_request: clearing in " |
12626 |
++ "large burst"); |
12627 |
++ bfq_clear_bfqq_in_large_burst(bfqq); |
12628 |
++ if (bic->was_in_burst_list) |
12629 |
++ hlist_add_head(&bfqq->burst_list_node, |
12630 |
++ &bfqd->burst_list); |
12631 |
+ } |
12632 |
++ bfqq->split_time = jiffies; |
12633 |
+ } |
12634 |
+ } else { |
12635 |
+ /* If the queue was seeky for too long, break it apart. */ |
12636 |
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { |
12637 |
+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); |
12638 |
++ |
12639 |
++ /* Update bic before losing reference to bfqq */ |
12640 |
++ if (bfq_bfqq_in_large_burst(bfqq)) |
12641 |
++ bic->saved_in_large_burst = true; |
12642 |
++ |
12643 |
+ bfqq = bfq_split_bfqq(bic, bfqq); |
12644 |
+ split = true; |
12645 |
+ if (!bfqq) |
12646 |
+@@ -3766,9 +4210,8 @@ new_queue: |
12647 |
+ } |
12648 |
+ |
12649 |
+ bfqq->allocated[rw]++; |
12650 |
+- atomic_inc(&bfqq->ref); |
12651 |
+- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, |
12652 |
+- atomic_read(&bfqq->ref)); |
12653 |
++ bfqq->ref++; |
12654 |
++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); |
12655 |
+ |
12656 |
+ rq->elv.priv[0] = bic; |
12657 |
+ rq->elv.priv[1] = bfqq; |
12658 |
+@@ -3783,7 +4226,6 @@ new_queue: |
12659 |
+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { |
12660 |
+ bfqq->bic = bic; |
12661 |
+ if (split) { |
12662 |
+- bfq_mark_bfqq_just_split(bfqq); |
12663 |
+ /* |
12664 |
+ * If the queue has just been split from a shared |
12665 |
+ * queue, restore the idle window and the possible |
12666 |
+@@ -3793,6 +4235,9 @@ new_queue: |
12667 |
+ } |
12668 |
+ } |
12669 |
+ |
12670 |
++ if (unlikely(bfq_bfqq_just_created(bfqq))) |
12671 |
++ bfq_handle_burst(bfqd, bfqq); |
12672 |
++ |
12673 |
+ spin_unlock_irqrestore(q->queue_lock, flags); |
12674 |
+ |
12675 |
+ return 0; |
12676 |
+@@ -3872,6 +4317,7 @@ static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) |
12677 |
+ cancel_work_sync(&bfqd->unplug_work); |
12678 |
+ } |
12679 |
+ |
12680 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
12681 |
+ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, |
12682 |
+ struct bfq_queue **bfqq_ptr) |
12683 |
+ { |
12684 |
+@@ -3880,9 +4326,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, |
12685 |
+ |
12686 |
+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); |
12687 |
+ if (bfqq) { |
12688 |
+- bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); |
12689 |
++ bfq_bfqq_move(bfqd, bfqq, root_group); |
12690 |
+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", |
12691 |
+- bfqq, atomic_read(&bfqq->ref)); |
12692 |
++ bfqq, bfqq->ref); |
12693 |
+ bfq_put_queue(bfqq); |
12694 |
+ *bfqq_ptr = NULL; |
12695 |
+ } |
12696 |
+@@ -3904,6 +4350,7 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) |
12697 |
+ |
12698 |
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); |
12699 |
+ } |
12700 |
++#endif |
12701 |
+ |
12702 |
+ static void bfq_exit_queue(struct elevator_queue *e) |
12703 |
+ { |
12704 |
+@@ -3923,8 +4370,6 @@ static void bfq_exit_queue(struct elevator_queue *e) |
12705 |
+ |
12706 |
+ bfq_shutdown_timer_wq(bfqd); |
12707 |
+ |
12708 |
+- synchronize_rcu(); |
12709 |
+- |
12710 |
+ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
12711 |
+ |
12712 |
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED |
12713 |
+@@ -3973,11 +4418,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
12714 |
+ * will not attempt to free it. |
12715 |
+ */ |
12716 |
+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); |
12717 |
+- atomic_inc(&bfqd->oom_bfqq.ref); |
12718 |
++ bfqd->oom_bfqq.ref++; |
12719 |
+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; |
12720 |
+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; |
12721 |
+ bfqd->oom_bfqq.entity.new_weight = |
12722 |
+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); |
12723 |
++ |
12724 |
++ /* oom_bfqq does not participate in bursts */ |
12725 |
++ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); |
12726 |
+ /* |
12727 |
+ * Trigger weight initialization, according to ioprio, at the |
12728 |
+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio |
12729 |
+@@ -3996,9 +4444,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
12730 |
+ goto out_free; |
12731 |
+ bfq_init_root_group(bfqd->root_group, bfqd); |
12732 |
+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); |
12733 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
12734 |
+- bfqd->active_numerous_groups = 0; |
12735 |
+-#endif |
12736 |
+ |
12737 |
+ init_timer(&bfqd->idle_slice_timer); |
12738 |
+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; |
12739 |
+@@ -4023,20 +4468,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
12740 |
+ bfqd->bfq_back_penalty = bfq_back_penalty; |
12741 |
+ bfqd->bfq_slice_idle = bfq_slice_idle; |
12742 |
+ bfqd->bfq_class_idle_last_service = 0; |
12743 |
+- bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; |
12744 |
+- bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; |
12745 |
+- bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; |
12746 |
++ bfqd->bfq_timeout = bfq_timeout; |
12747 |
+ |
12748 |
+- bfqd->bfq_coop_thresh = 2; |
12749 |
+- bfqd->bfq_failed_cooperations = 7000; |
12750 |
+ bfqd->bfq_requests_within_timer = 120; |
12751 |
+ |
12752 |
+- bfqd->bfq_large_burst_thresh = 11; |
12753 |
+- bfqd->bfq_burst_interval = msecs_to_jiffies(500); |
12754 |
++ bfqd->bfq_large_burst_thresh = 8; |
12755 |
++ bfqd->bfq_burst_interval = msecs_to_jiffies(180); |
12756 |
+ |
12757 |
+ bfqd->low_latency = true; |
12758 |
+ |
12759 |
+- bfqd->bfq_wr_coeff = 20; |
12760 |
++ /* |
12761 |
++ * Trade-off between responsiveness and fairness. |
12762 |
++ */ |
12763 |
++ bfqd->bfq_wr_coeff = 30; |
12764 |
+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); |
12765 |
+ bfqd->bfq_wr_max_time = 0; |
12766 |
+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); |
12767 |
+@@ -4048,16 +4492,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
12768 |
+ * video. |
12769 |
+ */ |
12770 |
+ bfqd->wr_busy_queues = 0; |
12771 |
+- bfqd->busy_in_flight_queues = 0; |
12772 |
+- bfqd->const_seeky_busy_in_flight_queues = 0; |
12773 |
+ |
12774 |
+ /* |
12775 |
+- * Begin by assuming, optimistically, that the device peak rate is |
12776 |
+- * equal to the highest reference rate. |
12777 |
++ * Begin by assuming, optimistically, that the device is a |
12778 |
++ * high-speed one, and that its peak rate is equal to 2/3 of |
12779 |
++ * the highest reference rate. |
12780 |
+ */ |
12781 |
+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * |
12782 |
+ T_fast[blk_queue_nonrot(bfqd->queue)]; |
12783 |
+- bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; |
12784 |
++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; |
12785 |
+ bfqd->device_speed = BFQ_BFQD_FAST; |
12786 |
+ |
12787 |
+ return 0; |
12788 |
+@@ -4161,10 +4604,8 @@ SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); |
12789 |
+ SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); |
12790 |
+ SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); |
12791 |
+ SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); |
12792 |
+-SHOW_FUNCTION(bfq_max_budget_async_rq_show, |
12793 |
+- bfqd->bfq_max_budget_async_rq, 0); |
12794 |
+-SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); |
12795 |
+-SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); |
12796 |
++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); |
12797 |
++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); |
12798 |
+ SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); |
12799 |
+ SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); |
12800 |
+ SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); |
12801 |
+@@ -4199,10 +4640,6 @@ STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); |
12802 |
+ STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, |
12803 |
+ INT_MAX, 0); |
12804 |
+ STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); |
12805 |
+-STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, |
12806 |
+- 1, INT_MAX, 0); |
12807 |
+-STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, |
12808 |
+- INT_MAX, 1); |
12809 |
+ STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); |
12810 |
+ STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); |
12811 |
+ STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, |
12812 |
+@@ -4224,10 +4661,8 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, |
12813 |
+ |
12814 |
+ static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) |
12815 |
+ { |
12816 |
+- u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
12817 |
+- |
12818 |
+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) |
12819 |
+- return bfq_calc_max_budget(bfqd->peak_rate, timeout); |
12820 |
++ return bfq_calc_max_budget(bfqd); |
12821 |
+ else |
12822 |
+ return bfq_default_max_budget; |
12823 |
+ } |
12824 |
+@@ -4252,6 +4687,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, |
12825 |
+ return ret; |
12826 |
+ } |
12827 |
+ |
12828 |
++/* |
12829 |
++ * Leaving this name to preserve name compatibility with cfq |
12830 |
++ * parameters, but this timeout is used for both sync and async. |
12831 |
++ */ |
12832 |
+ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, |
12833 |
+ const char *page, size_t count) |
12834 |
+ { |
12835 |
+@@ -4264,13 +4703,31 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, |
12836 |
+ else if (__data > INT_MAX) |
12837 |
+ __data = INT_MAX; |
12838 |
+ |
12839 |
+- bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); |
12840 |
++ bfqd->bfq_timeout = msecs_to_jiffies(__data); |
12841 |
+ if (bfqd->bfq_user_max_budget == 0) |
12842 |
+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
12843 |
+ |
12844 |
+ return ret; |
12845 |
+ } |
12846 |
+ |
12847 |
++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, |
12848 |
++ const char *page, size_t count) |
12849 |
++{ |
12850 |
++ struct bfq_data *bfqd = e->elevator_data; |
12851 |
++ unsigned long uninitialized_var(__data); |
12852 |
++ int ret = bfq_var_store(&__data, (page), count); |
12853 |
++ |
12854 |
++ if (__data > 1) |
12855 |
++ __data = 1; |
12856 |
++ if (!bfqd->strict_guarantees && __data == 1 |
12857 |
++ && bfqd->bfq_slice_idle < msecs_to_jiffies(8)) |
12858 |
++ bfqd->bfq_slice_idle = msecs_to_jiffies(8); |
12859 |
++ |
12860 |
++ bfqd->strict_guarantees = __data; |
12861 |
++ |
12862 |
++ return ret; |
12863 |
++} |
12864 |
++ |
12865 |
+ static ssize_t bfq_low_latency_store(struct elevator_queue *e, |
12866 |
+ const char *page, size_t count) |
12867 |
+ { |
12868 |
+@@ -4297,9 +4754,8 @@ static struct elv_fs_entry bfq_attrs[] = { |
12869 |
+ BFQ_ATTR(back_seek_penalty), |
12870 |
+ BFQ_ATTR(slice_idle), |
12871 |
+ BFQ_ATTR(max_budget), |
12872 |
+- BFQ_ATTR(max_budget_async_rq), |
12873 |
+ BFQ_ATTR(timeout_sync), |
12874 |
+- BFQ_ATTR(timeout_async), |
12875 |
++ BFQ_ATTR(strict_guarantees), |
12876 |
+ BFQ_ATTR(low_latency), |
12877 |
+ BFQ_ATTR(wr_coeff), |
12878 |
+ BFQ_ATTR(wr_max_time), |
12879 |
+@@ -4342,9 +4798,28 @@ static struct elevator_type iosched_bfq = { |
12880 |
+ .elevator_owner = THIS_MODULE, |
12881 |
+ }; |
12882 |
+ |
12883 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
12884 |
++static struct blkcg_policy blkcg_policy_bfq = { |
12885 |
++ .dfl_cftypes = bfq_blkg_files, |
12886 |
++ .legacy_cftypes = bfq_blkcg_legacy_files, |
12887 |
++ |
12888 |
++ .cpd_alloc_fn = bfq_cpd_alloc, |
12889 |
++ .cpd_init_fn = bfq_cpd_init, |
12890 |
++ .cpd_bind_fn = bfq_cpd_init, |
12891 |
++ .cpd_free_fn = bfq_cpd_free, |
12892 |
++ |
12893 |
++ .pd_alloc_fn = bfq_pd_alloc, |
12894 |
++ .pd_init_fn = bfq_pd_init, |
12895 |
++ .pd_offline_fn = bfq_pd_offline, |
12896 |
++ .pd_free_fn = bfq_pd_free, |
12897 |
++ .pd_reset_stats_fn = bfq_pd_reset_stats, |
12898 |
++}; |
12899 |
++#endif |
12900 |
++ |
12901 |
+ static int __init bfq_init(void) |
12902 |
+ { |
12903 |
+ int ret; |
12904 |
++ char msg[50] = "BFQ I/O-scheduler: v8"; |
12905 |
+ |
12906 |
+ /* |
12907 |
+ * Can be 0 on HZ < 1000 setups. |
12908 |
+@@ -4352,9 +4827,6 @@ static int __init bfq_init(void) |
12909 |
+ if (bfq_slice_idle == 0) |
12910 |
+ bfq_slice_idle = 1; |
12911 |
+ |
12912 |
+- if (bfq_timeout_async == 0) |
12913 |
+- bfq_timeout_async = 1; |
12914 |
+- |
12915 |
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED |
12916 |
+ ret = blkcg_policy_register(&blkcg_policy_bfq); |
12917 |
+ if (ret) |
12918 |
+@@ -4370,23 +4842,34 @@ static int __init bfq_init(void) |
12919 |
+ * installed on the reference devices (see the comments before the |
12920 |
+ * definitions of the two arrays). |
12921 |
+ */ |
12922 |
+- T_slow[0] = msecs_to_jiffies(2600); |
12923 |
+- T_slow[1] = msecs_to_jiffies(1000); |
12924 |
+- T_fast[0] = msecs_to_jiffies(5500); |
12925 |
+- T_fast[1] = msecs_to_jiffies(2000); |
12926 |
++ T_slow[0] = msecs_to_jiffies(3500); |
12927 |
++ T_slow[1] = msecs_to_jiffies(1500); |
12928 |
++ T_fast[0] = msecs_to_jiffies(8000); |
12929 |
++ T_fast[1] = msecs_to_jiffies(3000); |
12930 |
+ |
12931 |
+ /* |
12932 |
+- * Thresholds that determine the switch between speed classes (see |
12933 |
+- * the comments before the definition of the array). |
12934 |
++ * Thresholds that determine the switch between speed classes |
12935 |
++ * (see the comments before the definition of the array |
12936 |
++ * device_speed_thresh). These thresholds are biased towards |
12937 |
++ * transitions to the fast class. This is safer than the |
12938 |
++ * opposite bias. In fact, a wrong transition to the slow |
12939 |
++ * class results in short weight-raising periods, because the |
12940 |
++ * speed of the device then tends to be higher than the |
12941 |
++ * reference peak rate. On the opposite end, a wrong |
12942 |
++ * transition to the fast class tends to increase |
12943 |
++ * weight-raising periods, because of the opposite reason. |
12944 |
+ */ |
12945 |
+- device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; |
12946 |
+- device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; |
12947 |
++ device_speed_thresh[0] = (4 * R_slow[0]) / 3; |
12948 |
++ device_speed_thresh[1] = (4 * R_slow[1]) / 3; |
12949 |
+ |
12950 |
+ ret = elv_register(&iosched_bfq); |
12951 |
+ if (ret) |
12952 |
+ goto err_pol_unreg; |
12953 |
+ |
12954 |
+- pr_info("BFQ I/O-scheduler: v7r11"); |
12955 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
12956 |
++ strcat(msg, " (with cgroups support)"); |
12957 |
++#endif |
12958 |
++ pr_info("%s", msg); |
12959 |
+ |
12960 |
+ return 0; |
12961 |
+ |
12962 |
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c |
12963 |
+index a64fec1..e54b149 100644 |
12964 |
+--- a/block/bfq-sched.c |
12965 |
++++ b/block/bfq-sched.c |
12966 |
+@@ -7,9 +7,11 @@ |
12967 |
+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
12968 |
+ * Paolo Valente <paolo.valente@×××××××.it> |
12969 |
+ * |
12970 |
+- * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
12971 |
++ * Copyright (C) 2016 Paolo Valente <paolo.valente@×××××××.it> |
12972 |
+ */ |
12973 |
+ |
12974 |
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); |
12975 |
++ |
12976 |
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED |
12977 |
+ #define for_each_entity(entity) \ |
12978 |
+ for (; entity ; entity = entity->parent) |
12979 |
+@@ -22,8 +24,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
12980 |
+ int extract, |
12981 |
+ struct bfq_data *bfqd); |
12982 |
+ |
12983 |
+-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); |
12984 |
+- |
12985 |
+ static void bfq_update_budget(struct bfq_entity *next_in_service) |
12986 |
+ { |
12987 |
+ struct bfq_entity *bfqg_entity; |
12988 |
+@@ -48,6 +48,7 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) |
12989 |
+ static int bfq_update_next_in_service(struct bfq_sched_data *sd) |
12990 |
+ { |
12991 |
+ struct bfq_entity *next_in_service; |
12992 |
++ struct bfq_queue *bfqq; |
12993 |
+ |
12994 |
+ if (sd->in_service_entity) |
12995 |
+ /* will update/requeue at the end of service */ |
12996 |
+@@ -65,14 +66,29 @@ static int bfq_update_next_in_service(struct bfq_sched_data *sd) |
12997 |
+ |
12998 |
+ if (next_in_service) |
12999 |
+ bfq_update_budget(next_in_service); |
13000 |
++ else |
13001 |
++ goto exit; |
13002 |
+ |
13003 |
++ bfqq = bfq_entity_to_bfqq(next_in_service); |
13004 |
++ if (bfqq) |
13005 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
13006 |
++ "update_next_in_service: chosen this queue"); |
13007 |
++ else { |
13008 |
++ struct bfq_group *bfqg = |
13009 |
++ container_of(next_in_service, |
13010 |
++ struct bfq_group, entity); |
13011 |
++ |
13012 |
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, |
13013 |
++ "update_next_in_service: chosen this entity"); |
13014 |
++ } |
13015 |
++exit: |
13016 |
+ return 1; |
13017 |
+ } |
13018 |
+ |
13019 |
+ static void bfq_check_next_in_service(struct bfq_sched_data *sd, |
13020 |
+ struct bfq_entity *entity) |
13021 |
+ { |
13022 |
+- BUG_ON(sd->next_in_service != entity); |
13023 |
++ WARN_ON(sd->next_in_service != entity); |
13024 |
+ } |
13025 |
+ #else |
13026 |
+ #define for_each_entity(entity) \ |
13027 |
+@@ -151,20 +167,35 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) |
13028 |
+ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) |
13029 |
+ { |
13030 |
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
13031 |
+- |
13032 |
++ unsigned long long start, finish, delta ; |
13033 |
+ BUG_ON(entity->weight == 0); |
13034 |
+ |
13035 |
+ entity->finish = entity->start + |
13036 |
+ bfq_delta(service, entity->weight); |
13037 |
+ |
13038 |
++ start = ((entity->start>>10)*1000)>>12; |
13039 |
++ finish = ((entity->finish>>10)*1000)>>12; |
13040 |
++ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; |
13041 |
++ |
13042 |
+ if (bfqq) { |
13043 |
+ bfq_log_bfqq(bfqq->bfqd, bfqq, |
13044 |
+ "calc_finish: serv %lu, w %d", |
13045 |
+ service, entity->weight); |
13046 |
+ bfq_log_bfqq(bfqq->bfqd, bfqq, |
13047 |
+ "calc_finish: start %llu, finish %llu, delta %llu", |
13048 |
+- entity->start, entity->finish, |
13049 |
+- bfq_delta(service, entity->weight)); |
13050 |
++ start, finish, delta); |
13051 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13052 |
++ } else { |
13053 |
++ struct bfq_group *bfqg = |
13054 |
++ container_of(entity, struct bfq_group, entity); |
13055 |
++ |
13056 |
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, |
13057 |
++ "calc_finish group: serv %lu, w %d", |
13058 |
++ service, entity->weight); |
13059 |
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, |
13060 |
++ "calc_finish group: start %llu, finish %llu, delta %llu", |
13061 |
++ start, finish, delta); |
13062 |
++#endif |
13063 |
+ } |
13064 |
+ } |
13065 |
+ |
13066 |
+@@ -386,8 +417,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, |
13067 |
+ BUG_ON(!bfqg); |
13068 |
+ BUG_ON(!bfqd); |
13069 |
+ bfqg->active_entities++; |
13070 |
+- if (bfqg->active_entities == 2) |
13071 |
+- bfqd->active_numerous_groups++; |
13072 |
+ } |
13073 |
+ #endif |
13074 |
+ } |
13075 |
+@@ -399,7 +428,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, |
13076 |
+ static unsigned short bfq_ioprio_to_weight(int ioprio) |
13077 |
+ { |
13078 |
+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); |
13079 |
+- return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; |
13080 |
++ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF ; |
13081 |
+ } |
13082 |
+ |
13083 |
+ /** |
13084 |
+@@ -422,9 +451,9 @@ static void bfq_get_entity(struct bfq_entity *entity) |
13085 |
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
13086 |
+ |
13087 |
+ if (bfqq) { |
13088 |
+- atomic_inc(&bfqq->ref); |
13089 |
++ bfqq->ref++; |
13090 |
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", |
13091 |
+- bfqq, atomic_read(&bfqq->ref)); |
13092 |
++ bfqq, bfqq->ref); |
13093 |
+ } |
13094 |
+ } |
13095 |
+ |
13096 |
+@@ -499,10 +528,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, |
13097 |
+ BUG_ON(!bfqd); |
13098 |
+ BUG_ON(!bfqg->active_entities); |
13099 |
+ bfqg->active_entities--; |
13100 |
+- if (bfqg->active_entities == 1) { |
13101 |
+- BUG_ON(!bfqd->active_numerous_groups); |
13102 |
+- bfqd->active_numerous_groups--; |
13103 |
+- } |
13104 |
+ } |
13105 |
+ #endif |
13106 |
+ } |
13107 |
+@@ -552,7 +577,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, |
13108 |
+ if (bfqq) { |
13109 |
+ sd = entity->sched_data; |
13110 |
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", |
13111 |
+- bfqq, atomic_read(&bfqq->ref)); |
13112 |
++ bfqq, bfqq->ref); |
13113 |
+ bfq_put_queue(bfqq); |
13114 |
+ } |
13115 |
+ } |
13116 |
+@@ -628,12 +653,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, |
13117 |
+ if (entity->new_weight != entity->orig_weight) { |
13118 |
+ if (entity->new_weight < BFQ_MIN_WEIGHT || |
13119 |
+ entity->new_weight > BFQ_MAX_WEIGHT) { |
13120 |
+- printk(KERN_CRIT "update_weight_prio: " |
13121 |
+- "new_weight %d\n", |
13122 |
++ pr_crit("update_weight_prio: new_weight %d\n", |
13123 |
+ entity->new_weight); |
13124 |
+- BUG(); |
13125 |
++ if (entity->new_weight < BFQ_MIN_WEIGHT) |
13126 |
++ entity->new_weight = BFQ_MIN_WEIGHT; |
13127 |
++ else |
13128 |
++ entity->new_weight = BFQ_MAX_WEIGHT; |
13129 |
+ } |
13130 |
+- entity->orig_weight = entity->new_weight; |
13131 |
++ entity->orig_weight = entity->new_weight; |
13132 |
+ if (bfqq) |
13133 |
+ bfqq->ioprio = |
13134 |
+ bfq_weight_to_ioprio(entity->orig_weight); |
13135 |
+@@ -708,7 +735,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) |
13136 |
+ st = bfq_entity_service_tree(entity); |
13137 |
+ |
13138 |
+ entity->service += served; |
13139 |
+- BUG_ON(entity->service > entity->budget); |
13140 |
++ |
13141 |
+ BUG_ON(st->wsum == 0); |
13142 |
+ |
13143 |
+ st->vtime += bfq_delta(served, st->wsum); |
13144 |
+@@ -717,31 +744,69 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) |
13145 |
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED |
13146 |
+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); |
13147 |
+ #endif |
13148 |
+- bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); |
13149 |
++ st = bfq_entity_service_tree(&bfqq->entity); |
13150 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", |
13151 |
++ served, ((st->vtime>>10)*1000)>>12, st); |
13152 |
+ } |
13153 |
+ |
13154 |
+ /** |
13155 |
+- * bfq_bfqq_charge_full_budget - set the service to the entity budget. |
13156 |
++ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length |
13157 |
++ * of the time interval during which bfqq has been in |
13158 |
++ * service. |
13159 |
++ * @bfqd: the device |
13160 |
+ * @bfqq: the queue that needs a service update. |
13161 |
++ * @time_ms: the amount of time during which the queue has received service |
13162 |
++ * |
13163 |
++ * If a queue does not consume its budget fast enough, then providing |
13164 |
++ * the queue with service fairness may impair throughput, more or less |
13165 |
++ * severely. For this reason, queues that consume their budget slowly |
13166 |
++ * are provided with time fairness instead of service fairness. This |
13167 |
++ * goal is achieved through the BFQ scheduling engine, even if such an |
13168 |
++ * engine works in the service, and not in the time domain. The trick |
13169 |
++ * is charging these queues with an inflated amount of service, equal |
13170 |
++ * to the amount of service that they would have received during their |
13171 |
++ * service slot if they had been fast, i.e., if their requests had |
13172 |
++ * been dispatched at a rate equal to the estimated peak rate. |
13173 |
+ * |
13174 |
+- * When it's not possible to be fair in the service domain, because |
13175 |
+- * a queue is not consuming its budget fast enough (the meaning of |
13176 |
+- * fast depends on the timeout parameter), we charge it a full |
13177 |
+- * budget. In this way we should obtain a sort of time-domain |
13178 |
+- * fairness among all the seeky/slow queues. |
13179 |
++ * It is worth noting that time fairness can cause important |
13180 |
++ * distortions in terms of bandwidth distribution, on devices with |
13181 |
++ * internal queueing. The reason is that I/O requests dispatched |
13182 |
++ * during the service slot of a queue may be served after that service |
13183 |
++ * slot is finished, and may have a total processing time loosely |
13184 |
++ * correlated with the duration of the service slot. This is |
13185 |
++ * especially true for short service slots. |
13186 |
+ */ |
13187 |
+-static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) |
13188 |
++static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
13189 |
++ unsigned long time_ms) |
13190 |
+ { |
13191 |
+ struct bfq_entity *entity = &bfqq->entity; |
13192 |
++ int tot_serv_to_charge = entity->service; |
13193 |
++ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); |
13194 |
++ |
13195 |
++ if (time_ms > 0 && time_ms < timeout_ms) |
13196 |
++ tot_serv_to_charge = |
13197 |
++ (bfqd->bfq_max_budget * time_ms) / timeout_ms; |
13198 |
++ |
13199 |
++ if (tot_serv_to_charge < entity->service) |
13200 |
++ tot_serv_to_charge = entity->service; |
13201 |
++ |
13202 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
13203 |
++ "charge_time: %lu/%u ms, %d/%d/%d sectors", |
13204 |
++ time_ms, timeout_ms, entity->service, |
13205 |
++ tot_serv_to_charge, entity->budget); |
13206 |
+ |
13207 |
+- bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); |
13208 |
++ /* Increase budget to avoid inconsistencies */ |
13209 |
++ if (tot_serv_to_charge > entity->budget) |
13210 |
++ entity->budget = tot_serv_to_charge; |
13211 |
+ |
13212 |
+- bfq_bfqq_served(bfqq, entity->budget - entity->service); |
13213 |
++ bfq_bfqq_served(bfqq, |
13214 |
++ max_t(int, 0, tot_serv_to_charge - entity->service)); |
13215 |
+ } |
13216 |
+ |
13217 |
+ /** |
13218 |
+ * __bfq_activate_entity - activate an entity. |
13219 |
+ * @entity: the entity being activated. |
13220 |
++ * @non_blocking_wait_rq: true if this entity was waiting for a request |
13221 |
+ * |
13222 |
+ * Called whenever an entity is activated, i.e., it is not active and one |
13223 |
+ * of its children receives a new request, or has to be reactivated due to |
13224 |
+@@ -749,11 +814,16 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) |
13225 |
+ * service received if @entity is active) of the queue to calculate its |
13226 |
+ * timestamps. |
13227 |
+ */ |
13228 |
+-static void __bfq_activate_entity(struct bfq_entity *entity) |
13229 |
++static void __bfq_activate_entity(struct bfq_entity *entity, |
13230 |
++ bool non_blocking_wait_rq) |
13231 |
+ { |
13232 |
+ struct bfq_sched_data *sd = entity->sched_data; |
13233 |
+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
13234 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
13235 |
++ bool backshifted = false; |
13236 |
+ |
13237 |
++ BUG_ON(!sd); |
13238 |
++ BUG_ON(!st); |
13239 |
+ if (entity == sd->in_service_entity) { |
13240 |
+ BUG_ON(entity->tree); |
13241 |
+ /* |
13242 |
+@@ -771,45 +841,133 @@ static void __bfq_activate_entity(struct bfq_entity *entity) |
13243 |
+ * old start time. |
13244 |
+ */ |
13245 |
+ bfq_active_extract(st, entity); |
13246 |
+- } else if (entity->tree == &st->idle) { |
13247 |
+- /* |
13248 |
+- * Must be on the idle tree, bfq_idle_extract() will |
13249 |
+- * check for that. |
13250 |
+- */ |
13251 |
+- bfq_idle_extract(st, entity); |
13252 |
+- entity->start = bfq_gt(st->vtime, entity->finish) ? |
13253 |
+- st->vtime : entity->finish; |
13254 |
+ } else { |
13255 |
+- /* |
13256 |
+- * The finish time of the entity may be invalid, and |
13257 |
+- * it is in the past for sure, otherwise the queue |
13258 |
+- * would have been on the idle tree. |
13259 |
+- */ |
13260 |
+- entity->start = st->vtime; |
13261 |
+- st->wsum += entity->weight; |
13262 |
+- bfq_get_entity(entity); |
13263 |
++ unsigned long long min_vstart; |
13264 |
++ |
13265 |
++ /* See comments on bfq_bfqq_update_budg_for_activation */ |
13266 |
++ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { |
13267 |
++ backshifted = true; |
13268 |
++ min_vstart = entity->finish; |
13269 |
++ } else |
13270 |
++ min_vstart = st->vtime; |
13271 |
+ |
13272 |
+- BUG_ON(entity->on_st); |
13273 |
+- entity->on_st = 1; |
13274 |
++ if (entity->tree == &st->idle) { |
13275 |
++ /* |
13276 |
++ * Must be on the idle tree, bfq_idle_extract() will |
13277 |
++ * check for that. |
13278 |
++ */ |
13279 |
++ bfq_idle_extract(st, entity); |
13280 |
++ entity->start = bfq_gt(min_vstart, entity->finish) ? |
13281 |
++ min_vstart : entity->finish; |
13282 |
++ } else { |
13283 |
++ /* |
13284 |
++ * The finish time of the entity may be invalid, and |
13285 |
++ * it is in the past for sure, otherwise the queue |
13286 |
++ * would have been on the idle tree. |
13287 |
++ */ |
13288 |
++ entity->start = min_vstart; |
13289 |
++ st->wsum += entity->weight; |
13290 |
++ bfq_get_entity(entity); |
13291 |
++ |
13292 |
++ BUG_ON(entity->on_st); |
13293 |
++ entity->on_st = 1; |
13294 |
++ } |
13295 |
+ } |
13296 |
+ |
13297 |
+ st = __bfq_entity_update_weight_prio(st, entity); |
13298 |
+ bfq_calc_finish(entity, entity->budget); |
13299 |
++ |
13300 |
++ /* |
13301 |
++ * If some queues enjoy backshifting for a while, then their |
13302 |
++ * (virtual) finish timestamps may happen to become lower and |
13303 |
++ * lower than the system virtual time. In particular, if |
13304 |
++ * these queues often happen to be idle for short time |
13305 |
++ * periods, and during such time periods other queues with |
13306 |
++ * higher timestamps happen to be busy, then the backshifted |
13307 |
++ * timestamps of the former queues can become much lower than |
13308 |
++ * the system virtual time. In fact, to serve the queues with |
13309 |
++ * higher timestamps while the ones with lower timestamps are |
13310 |
++ * idle, the system virtual time may be pushed-up to much |
13311 |
++ * higher values than the finish timestamps of the idle |
13312 |
++ * queues. As a consequence, the finish timestamps of all new |
13313 |
++ * or newly activated queues may end up being much larger than |
13314 |
++ * those of lucky queues with backshifted timestamps. The |
13315 |
++ * latter queues may then monopolize the device for a lot of |
13316 |
++ * time. This would simply break service guarantees. |
13317 |
++ * |
13318 |
++ * To reduce this problem, push up a little bit the |
13319 |
++ * backshifted timestamps of the queue associated with this |
13320 |
++ * entity (only a queue can happen to have the backshifted |
13321 |
++ * flag set): just enough to let the finish timestamp of the |
13322 |
++ * queue be equal to the current value of the system virtual |
13323 |
++ * time. This may introduce a little unfairness among queues |
13324 |
++ * with backshifted timestamps, but it does not break |
13325 |
++ * worst-case fairness guarantees. |
13326 |
++ * |
13327 |
++ * As a special case, if bfqq is weight-raised, push up |
13328 |
++ * timestamps much less, to keep very low the probability that |
13329 |
++ * this push up causes the backshifted finish timestamps of |
13330 |
++ * weight-raised queues to become higher than the backshifted |
13331 |
++ * finish timestamps of non weight-raised queues. |
13332 |
++ */ |
13333 |
++ if (backshifted && bfq_gt(st->vtime, entity->finish)) { |
13334 |
++ unsigned long delta = st->vtime - entity->finish; |
13335 |
++ |
13336 |
++ if (bfqq) |
13337 |
++ delta /= bfqq->wr_coeff; |
13338 |
++ |
13339 |
++ entity->start += delta; |
13340 |
++ entity->finish += delta; |
13341 |
++ |
13342 |
++ if (bfqq) { |
13343 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
13344 |
++ "__activate_entity: new queue finish %llu", |
13345 |
++ ((entity->finish>>10)*1000)>>12); |
13346 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13347 |
++ } else { |
13348 |
++ struct bfq_group *bfqg = |
13349 |
++ container_of(entity, struct bfq_group, entity); |
13350 |
++ |
13351 |
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, |
13352 |
++ "__activate_entity: new group finish %llu", |
13353 |
++ ((entity->finish>>10)*1000)>>12); |
13354 |
++#endif |
13355 |
++ } |
13356 |
++ } |
13357 |
++ |
13358 |
+ bfq_active_insert(st, entity); |
13359 |
++ |
13360 |
++ if (bfqq) { |
13361 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
13362 |
++ "__activate_entity: queue %seligible in st %p", |
13363 |
++ entity->start <= st->vtime ? "" : "non ", st); |
13364 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13365 |
++ } else { |
13366 |
++ struct bfq_group *bfqg = |
13367 |
++ container_of(entity, struct bfq_group, entity); |
13368 |
++ |
13369 |
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, |
13370 |
++ "__activate_entity: group %seligible in st %p", |
13371 |
++ entity->start <= st->vtime ? "" : "non ", st); |
13372 |
++#endif |
13373 |
++ } |
13374 |
+ } |
13375 |
+ |
13376 |
+ /** |
13377 |
+ * bfq_activate_entity - activate an entity and its ancestors if necessary. |
13378 |
+ * @entity: the entity to activate. |
13379 |
++ * @non_blocking_wait_rq: true if this entity was waiting for a request |
13380 |
+ * |
13381 |
+ * Activate @entity and all the entities on the path from it to the root. |
13382 |
+ */ |
13383 |
+-static void bfq_activate_entity(struct bfq_entity *entity) |
13384 |
++static void bfq_activate_entity(struct bfq_entity *entity, |
13385 |
++ bool non_blocking_wait_rq) |
13386 |
+ { |
13387 |
+ struct bfq_sched_data *sd; |
13388 |
+ |
13389 |
+ for_each_entity(entity) { |
13390 |
+- __bfq_activate_entity(entity); |
13391 |
++ BUG_ON(!entity); |
13392 |
++ __bfq_activate_entity(entity, non_blocking_wait_rq); |
13393 |
+ |
13394 |
+ sd = entity->sched_data; |
13395 |
+ if (!bfq_update_next_in_service(sd)) |
13396 |
+@@ -890,23 +1048,24 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
13397 |
+ |
13398 |
+ if (!__bfq_deactivate_entity(entity, requeue)) |
13399 |
+ /* |
13400 |
+- * The parent entity is still backlogged, and |
13401 |
+- * we don't need to update it as it is still |
13402 |
+- * in service. |
13403 |
++ * next_in_service has not been changed, so |
13404 |
++ * no upwards update is needed |
13405 |
+ */ |
13406 |
+ break; |
13407 |
+ |
13408 |
+ if (sd->next_in_service) |
13409 |
+ /* |
13410 |
+- * The parent entity is still backlogged and |
13411 |
+- * the budgets on the path towards the root |
13412 |
+- * need to be updated. |
13413 |
++ * The parent entity is still backlogged, |
13414 |
++ * because next_in_service is not NULL, and |
13415 |
++ * next_in_service has been updated (see |
13416 |
++ * comment on the body of the above if): |
13417 |
++ * upwards update of the schedule is needed. |
13418 |
+ */ |
13419 |
+ goto update; |
13420 |
+ |
13421 |
+ /* |
13422 |
+- * If we reach there the parent is no more backlogged and |
13423 |
+- * we want to propagate the dequeue upwards. |
13424 |
++ * If we get here, then the parent is no longer backlogged and |
13425 |
++ * we want to propagate the deactivation upwards. |
13426 |
+ */ |
13427 |
+ requeue = 1; |
13428 |
+ } |
13429 |
+@@ -916,9 +1075,23 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
13430 |
+ update: |
13431 |
+ entity = parent; |
13432 |
+ for_each_entity(entity) { |
13433 |
+- __bfq_activate_entity(entity); |
13434 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
13435 |
++ __bfq_activate_entity(entity, false); |
13436 |
+ |
13437 |
+ sd = entity->sched_data; |
13438 |
++ if (bfqq) |
13439 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
13440 |
++ "invoking udpdate_next for this queue"); |
13441 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13442 |
++ else { |
13443 |
++ struct bfq_group *bfqg = |
13444 |
++ container_of(entity, |
13445 |
++ struct bfq_group, entity); |
13446 |
++ |
13447 |
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, |
13448 |
++ "invoking udpdate_next for this entity"); |
13449 |
++ } |
13450 |
++#endif |
13451 |
+ if (!bfq_update_next_in_service(sd)) |
13452 |
+ break; |
13453 |
+ } |
13454 |
+@@ -997,10 +1170,11 @@ left: |
13455 |
+ * Update the virtual time in @st and return the first eligible entity |
13456 |
+ * it contains. |
13457 |
+ */ |
13458 |
+-static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, |
13459 |
+- bool force) |
13460 |
++static struct bfq_entity * |
13461 |
++__bfq_lookup_next_entity(struct bfq_service_tree *st, bool force) |
13462 |
+ { |
13463 |
+ struct bfq_entity *entity, *new_next_in_service = NULL; |
13464 |
++ struct bfq_queue *bfqq; |
13465 |
+ |
13466 |
+ if (RB_EMPTY_ROOT(&st->active)) |
13467 |
+ return NULL; |
13468 |
+@@ -1009,6 +1183,24 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, |
13469 |
+ entity = bfq_first_active_entity(st); |
13470 |
+ BUG_ON(bfq_gt(entity->start, st->vtime)); |
13471 |
+ |
13472 |
++ bfqq = bfq_entity_to_bfqq(entity); |
13473 |
++ if (bfqq) |
13474 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
13475 |
++ "__lookup_next: start %llu vtime %llu st %p", |
13476 |
++ ((entity->start>>10)*1000)>>12, |
13477 |
++ ((st->vtime>>10)*1000)>>12, st); |
13478 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13479 |
++ else { |
13480 |
++ struct bfq_group *bfqg = |
13481 |
++ container_of(entity, struct bfq_group, entity); |
13482 |
++ |
13483 |
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, |
13484 |
++ "__lookup_next: start %llu vtime %llu st %p", |
13485 |
++ ((entity->start>>10)*1000)>>12, |
13486 |
++ ((st->vtime>>10)*1000)>>12, st); |
13487 |
++ } |
13488 |
++#endif |
13489 |
++ |
13490 |
+ /* |
13491 |
+ * If the chosen entity does not match with the sched_data's |
13492 |
+ * next_in_service and we are forcedly serving the IDLE priority |
13493 |
+@@ -1045,10 +1237,28 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
13494 |
+ BUG_ON(sd->in_service_entity); |
13495 |
+ |
13496 |
+ if (bfqd && |
13497 |
+- jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { |
13498 |
++ jiffies - bfqd->bfq_class_idle_last_service > |
13499 |
++ BFQ_CL_IDLE_TIMEOUT) { |
13500 |
+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, |
13501 |
+ true); |
13502 |
+ if (entity) { |
13503 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
13504 |
++ if (bfqq) |
13505 |
++ bfq_log_bfqq(bfqd, bfqq, |
13506 |
++ "idle chosen from st %p %d", |
13507 |
++ st + BFQ_IOPRIO_CLASSES - 1, |
13508 |
++ BFQ_IOPRIO_CLASSES - 1) ; |
13509 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13510 |
++ else { |
13511 |
++ struct bfq_group *bfqg = |
13512 |
++ container_of(entity, struct bfq_group, entity); |
13513 |
++ |
13514 |
++ bfq_log_bfqg(bfqd, bfqg, |
13515 |
++ "idle chosen from st %p %d", |
13516 |
++ st + BFQ_IOPRIO_CLASSES - 1, |
13517 |
++ BFQ_IOPRIO_CLASSES - 1) ; |
13518 |
++ } |
13519 |
++#endif |
13520 |
+ i = BFQ_IOPRIO_CLASSES - 1; |
13521 |
+ bfqd->bfq_class_idle_last_service = jiffies; |
13522 |
+ sd->next_in_service = entity; |
13523 |
+@@ -1057,6 +1267,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
13524 |
+ for (; i < BFQ_IOPRIO_CLASSES; i++) { |
13525 |
+ entity = __bfq_lookup_next_entity(st + i, false); |
13526 |
+ if (entity) { |
13527 |
++ if (bfqd != NULL) { |
13528 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
13529 |
++ if (bfqq) |
13530 |
++ bfq_log_bfqq(bfqd, bfqq, |
13531 |
++ "chosen from st %p %d", |
13532 |
++ st + i, i) ; |
13533 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13534 |
++ else { |
13535 |
++ struct bfq_group *bfqg = |
13536 |
++ container_of(entity, struct bfq_group, entity); |
13537 |
++ |
13538 |
++ bfq_log_bfqg(bfqd, bfqg, |
13539 |
++ "chosen from st %p %d", |
13540 |
++ st + i, i) ; |
13541 |
++ } |
13542 |
++#endif |
13543 |
++ } |
13544 |
++ |
13545 |
+ if (extract) { |
13546 |
+ bfq_check_next_in_service(sd, entity); |
13547 |
+ bfq_active_extract(st + i, entity); |
13548 |
+@@ -1070,6 +1298,13 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
13549 |
+ return entity; |
13550 |
+ } |
13551 |
+ |
13552 |
++static bool next_queue_may_preempt(struct bfq_data *bfqd) |
13553 |
++{ |
13554 |
++ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; |
13555 |
++ |
13556 |
++ return sd->next_in_service != sd->in_service_entity; |
13557 |
++} |
13558 |
++ |
13559 |
+ /* |
13560 |
+ * Get next queue for service. |
13561 |
+ */ |
13562 |
+@@ -1086,7 +1321,36 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) |
13563 |
+ |
13564 |
+ sd = &bfqd->root_group->sched_data; |
13565 |
+ for (; sd ; sd = entity->my_sched_data) { |
13566 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13567 |
++ if (entity) { |
13568 |
++ struct bfq_group *bfqg = |
13569 |
++ container_of(entity, struct bfq_group, entity); |
13570 |
++ |
13571 |
++ bfq_log_bfqg(bfqd, bfqg, |
13572 |
++ "get_next_queue: lookup in this group"); |
13573 |
++ } else |
13574 |
++ bfq_log_bfqg(bfqd, bfqd->root_group, |
13575 |
++ "get_next_queue: lookup in root group"); |
13576 |
++#endif |
13577 |
++ |
13578 |
+ entity = bfq_lookup_next_entity(sd, 1, bfqd); |
13579 |
++ |
13580 |
++ bfqq = bfq_entity_to_bfqq(entity); |
13581 |
++ if (bfqq) |
13582 |
++ bfq_log_bfqq(bfqd, bfqq, |
13583 |
++ "get_next_queue: this queue, finish %llu", |
13584 |
++ (((entity->finish>>10)*1000)>>10)>>2); |
13585 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13586 |
++ else { |
13587 |
++ struct bfq_group *bfqg = |
13588 |
++ container_of(entity, struct bfq_group, entity); |
13589 |
++ |
13590 |
++ bfq_log_bfqg(bfqd, bfqg, |
13591 |
++ "get_next_queue: this entity, finish %llu", |
13592 |
++ (((entity->finish>>10)*1000)>>10)>>2); |
13593 |
++ } |
13594 |
++#endif |
13595 |
++ |
13596 |
+ BUG_ON(!entity); |
13597 |
+ entity->service = 0; |
13598 |
+ } |
13599 |
+@@ -1113,9 +1377,7 @@ static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
13600 |
+ { |
13601 |
+ struct bfq_entity *entity = &bfqq->entity; |
13602 |
+ |
13603 |
+- if (bfqq == bfqd->in_service_queue) |
13604 |
+- __bfq_bfqd_reset_in_service(bfqd); |
13605 |
+- |
13606 |
++ BUG_ON(bfqq == bfqd->in_service_queue); |
13607 |
+ bfq_deactivate_entity(entity, requeue); |
13608 |
+ } |
13609 |
+ |
13610 |
+@@ -1123,12 +1385,11 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
13611 |
+ { |
13612 |
+ struct bfq_entity *entity = &bfqq->entity; |
13613 |
+ |
13614 |
+- bfq_activate_entity(entity); |
13615 |
++ bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); |
13616 |
++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); |
13617 |
+ } |
13618 |
+ |
13619 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13620 |
+ static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); |
13621 |
+-#endif |
13622 |
+ |
13623 |
+ /* |
13624 |
+ * Called when the bfqq no longer has requests pending, remove it from |
13625 |
+@@ -1139,6 +1400,7 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
13626 |
+ { |
13627 |
+ BUG_ON(!bfq_bfqq_busy(bfqq)); |
13628 |
+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
13629 |
++ BUG_ON(bfqq == bfqd->in_service_queue); |
13630 |
+ |
13631 |
+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); |
13632 |
+ |
13633 |
+@@ -1147,27 +1409,20 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
13634 |
+ BUG_ON(bfqd->busy_queues == 0); |
13635 |
+ bfqd->busy_queues--; |
13636 |
+ |
13637 |
+- if (!bfqq->dispatched) { |
13638 |
++ if (!bfqq->dispatched) |
13639 |
+ bfq_weights_tree_remove(bfqd, &bfqq->entity, |
13640 |
+ &bfqd->queue_weights_tree); |
13641 |
+- if (!blk_queue_nonrot(bfqd->queue)) { |
13642 |
+- BUG_ON(!bfqd->busy_in_flight_queues); |
13643 |
+- bfqd->busy_in_flight_queues--; |
13644 |
+- if (bfq_bfqq_constantly_seeky(bfqq)) { |
13645 |
+- BUG_ON(!bfqd-> |
13646 |
+- const_seeky_busy_in_flight_queues); |
13647 |
+- bfqd->const_seeky_busy_in_flight_queues--; |
13648 |
+- } |
13649 |
+- } |
13650 |
+- } |
13651 |
++ |
13652 |
+ if (bfqq->wr_coeff > 1) |
13653 |
+ bfqd->wr_busy_queues--; |
13654 |
+ |
13655 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
13656 |
+ bfqg_stats_update_dequeue(bfqq_group(bfqq)); |
13657 |
+-#endif |
13658 |
+ |
13659 |
++ BUG_ON(bfqq->entity.budget < 0); |
13660 |
++ |
13661 |
+ bfq_deactivate_bfqq(bfqd, bfqq, requeue); |
13662 |
++ |
13663 |
++ BUG_ON(bfqq->entity.budget < 0); |
13664 |
+ } |
13665 |
+ |
13666 |
+ /* |
13667 |
+@@ -1185,16 +1440,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
13668 |
+ bfq_mark_bfqq_busy(bfqq); |
13669 |
+ bfqd->busy_queues++; |
13670 |
+ |
13671 |
+- if (!bfqq->dispatched) { |
13672 |
++ if (!bfqq->dispatched) |
13673 |
+ if (bfqq->wr_coeff == 1) |
13674 |
+ bfq_weights_tree_add(bfqd, &bfqq->entity, |
13675 |
+ &bfqd->queue_weights_tree); |
13676 |
+- if (!blk_queue_nonrot(bfqd->queue)) { |
13677 |
+- bfqd->busy_in_flight_queues++; |
13678 |
+- if (bfq_bfqq_constantly_seeky(bfqq)) |
13679 |
+- bfqd->const_seeky_busy_in_flight_queues++; |
13680 |
+- } |
13681 |
+- } |
13682 |
++ |
13683 |
+ if (bfqq->wr_coeff > 1) |
13684 |
+ bfqd->wr_busy_queues++; |
13685 |
+ } |
13686 |
+diff --git a/block/bfq.h b/block/bfq.h |
13687 |
+index f73c942..b8ad02a 100644 |
13688 |
+--- a/block/bfq.h |
13689 |
++++ b/block/bfq.h |
13690 |
+@@ -1,5 +1,5 @@ |
13691 |
+ /* |
13692 |
+- * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. |
13693 |
++ * BFQ-v8 for 4.7.0: data structures and common functions prototypes. |
13694 |
+ * |
13695 |
+ * Based on ideas and code from CFQ: |
13696 |
+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
13697 |
+@@ -28,7 +28,7 @@ |
13698 |
+ |
13699 |
+ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 |
13700 |
+ |
13701 |
+-#define BFQ_DEFAULT_GRP_WEIGHT 10 |
13702 |
++#define BFQ_WEIGHT_LEGACY_DFL 100 |
13703 |
+ #define BFQ_DEFAULT_GRP_IOPRIO 0 |
13704 |
+ #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE |
13705 |
+ |
13706 |
+@@ -36,12 +36,6 @@ struct bfq_entity; |
13707 |
+ |
13708 |
+ /** |
13709 |
+ * struct bfq_service_tree - per ioprio_class service tree. |
13710 |
+- * @active: tree for active entities (i.e., those backlogged). |
13711 |
+- * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). |
13712 |
+- * @first_idle: idle entity with minimum F_i. |
13713 |
+- * @last_idle: idle entity with maximum F_i. |
13714 |
+- * @vtime: scheduler virtual time. |
13715 |
+- * @wsum: scheduler weight sum; active and idle entities contribute to it. |
13716 |
+ * |
13717 |
+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each |
13718 |
+ * ioprio_class has its own independent scheduler, and so its own |
13719 |
+@@ -49,27 +43,28 @@ struct bfq_entity; |
13720 |
+ * of the containing bfqd. |
13721 |
+ */ |
13722 |
+ struct bfq_service_tree { |
13723 |
++ /* tree for active entities (i.e., those backlogged) */ |
13724 |
+ struct rb_root active; |
13725 |
++ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ |
13726 |
+ struct rb_root idle; |
13727 |
+ |
13728 |
+- struct bfq_entity *first_idle; |
13729 |
+- struct bfq_entity *last_idle; |
13730 |
++ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ |
13731 |
++ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ |
13732 |
+ |
13733 |
+- u64 vtime; |
13734 |
++ u64 vtime; /* scheduler virtual time */ |
13735 |
++ /* scheduler weight sum; active and idle entities contribute to it */ |
13736 |
+ unsigned long wsum; |
13737 |
+ }; |
13738 |
+ |
13739 |
+ /** |
13740 |
+ * struct bfq_sched_data - multi-class scheduler. |
13741 |
+- * @in_service_entity: entity in service. |
13742 |
+- * @next_in_service: head-of-the-line entity in the scheduler. |
13743 |
+- * @service_tree: array of service trees, one per ioprio_class. |
13744 |
+ * |
13745 |
+ * bfq_sched_data is the basic scheduler queue. It supports three |
13746 |
+- * ioprio_classes, and can be used either as a toplevel queue or as |
13747 |
+- * an intermediate queue on a hierarchical setup. |
13748 |
+- * @next_in_service points to the active entity of the sched_data |
13749 |
+- * service trees that will be scheduled next. |
13750 |
++ * ioprio_classes, and can be used either as a toplevel queue or as an |
13751 |
++ * intermediate queue on a hierarchical setup. @next_in_service |
13752 |
++ * points to the active entity of the sched_data service trees that |
13753 |
++ * will be scheduled next. It is used to reduce the number of steps |
13754 |
++ * needed for each hierarchical-schedule update. |
13755 |
+ * |
13756 |
+ * The supported ioprio_classes are the same as in CFQ, in descending |
13757 |
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. |
13758 |
+@@ -79,48 +74,29 @@ struct bfq_service_tree { |
13759 |
+ * All the fields are protected by the queue lock of the containing bfqd. |
13760 |
+ */ |
13761 |
+ struct bfq_sched_data { |
13762 |
+- struct bfq_entity *in_service_entity; |
13763 |
++ struct bfq_entity *in_service_entity; /* entity in service */ |
13764 |
++ /* head-of-the-line entity in the scheduler (see comments above) */ |
13765 |
+ struct bfq_entity *next_in_service; |
13766 |
++ /* array of service trees, one per ioprio_class */ |
13767 |
+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; |
13768 |
+ }; |
13769 |
+ |
13770 |
+ /** |
13771 |
+ * struct bfq_weight_counter - counter of the number of all active entities |
13772 |
+ * with a given weight. |
13773 |
+- * @weight: weight of the entities that this counter refers to. |
13774 |
+- * @num_active: number of active entities with this weight. |
13775 |
+- * @weights_node: weights tree member (see bfq_data's @queue_weights_tree |
13776 |
+- * and @group_weights_tree). |
13777 |
+ */ |
13778 |
+ struct bfq_weight_counter { |
13779 |
+- short int weight; |
13780 |
+- unsigned int num_active; |
13781 |
++ short int weight; /* weight of the entities this counter refers to */ |
13782 |
++ unsigned int num_active; /* nr of active entities with this weight */ |
13783 |
++ /* |
13784 |
++ * Weights tree member (see bfq_data's @queue_weights_tree and |
13785 |
++ * @group_weights_tree) |
13786 |
++ */ |
13787 |
+ struct rb_node weights_node; |
13788 |
+ }; |
13789 |
+ |
13790 |
+ /** |
13791 |
+ * struct bfq_entity - schedulable entity. |
13792 |
+- * @rb_node: service_tree member. |
13793 |
+- * @weight_counter: pointer to the weight counter associated with this entity. |
13794 |
+- * @on_st: flag, true if the entity is on a tree (either the active or |
13795 |
+- * the idle one of its service_tree). |
13796 |
+- * @finish: B-WF2Q+ finish timestamp (aka F_i). |
13797 |
+- * @start: B-WF2Q+ start timestamp (aka S_i). |
13798 |
+- * @tree: tree the entity is enqueued into; %NULL if not on a tree. |
13799 |
+- * @min_start: minimum start time of the (active) subtree rooted at |
13800 |
+- * this entity; used for O(log N) lookups into active trees. |
13801 |
+- * @service: service received during the last round of service. |
13802 |
+- * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. |
13803 |
+- * @weight: weight of the queue |
13804 |
+- * @parent: parent entity, for hierarchical scheduling. |
13805 |
+- * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the |
13806 |
+- * associated scheduler queue, %NULL on leaf nodes. |
13807 |
+- * @sched_data: the scheduler queue this entity belongs to. |
13808 |
+- * @ioprio: the ioprio in use. |
13809 |
+- * @new_weight: when a weight change is requested, the new weight value. |
13810 |
+- * @orig_weight: original weight, used to implement weight boosting |
13811 |
+- * @prio_changed: flag, true when the user requested a weight, ioprio or |
13812 |
+- * ioprio_class change. |
13813 |
+ * |
13814 |
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the |
13815 |
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each |
13816 |
+@@ -147,27 +123,52 @@ struct bfq_weight_counter { |
13817 |
+ * containing bfqd. |
13818 |
+ */ |
13819 |
+ struct bfq_entity { |
13820 |
+- struct rb_node rb_node; |
13821 |
++ struct rb_node rb_node; /* service_tree member */ |
13822 |
++ /* pointer to the weight counter associated with this entity */ |
13823 |
+ struct bfq_weight_counter *weight_counter; |
13824 |
+ |
13825 |
++ /* |
13826 |
++ * flag, true if the entity is on a tree (either the active or |
13827 |
++ * the idle one of its service_tree). |
13828 |
++ */ |
13829 |
+ int on_st; |
13830 |
+ |
13831 |
+- u64 finish; |
13832 |
+- u64 start; |
13833 |
++ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ |
13834 |
++ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ |
13835 |
+ |
13836 |
++ /* tree the entity is enqueued into; %NULL if not on a tree */ |
13837 |
+ struct rb_root *tree; |
13838 |
+ |
13839 |
++ /* |
13840 |
++ * minimum start time of the (active) subtree rooted at this |
13841 |
++ * entity; used for O(log N) lookups into active trees |
13842 |
++ */ |
13843 |
+ u64 min_start; |
13844 |
+ |
13845 |
+- int service, budget; |
13846 |
+- unsigned short weight, new_weight; |
13847 |
++ /* amount of service received during the last service slot */ |
13848 |
++ int service; |
13849 |
++ |
13850 |
++ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ |
13851 |
++ int budget; |
13852 |
++ |
13853 |
++ unsigned short weight; /* weight of the queue */ |
13854 |
++ unsigned short new_weight; /* next weight if a change is in progress */ |
13855 |
++ |
13856 |
++ /* original weight, used to implement weight boosting */ |
13857 |
+ unsigned short orig_weight; |
13858 |
+ |
13859 |
++ /* parent entity, for hierarchical scheduling */ |
13860 |
+ struct bfq_entity *parent; |
13861 |
+ |
13862 |
++ /* |
13863 |
++ * For non-leaf nodes in the hierarchy, the associated |
13864 |
++ * scheduler queue, %NULL on leaf nodes. |
13865 |
++ */ |
13866 |
+ struct bfq_sched_data *my_sched_data; |
13867 |
++ /* the scheduler queue this entity belongs to */ |
13868 |
+ struct bfq_sched_data *sched_data; |
13869 |
+ |
13870 |
++ /* flag, set to request a weight, ioprio or ioprio_class change */ |
13871 |
+ int prio_changed; |
13872 |
+ }; |
13873 |
+ |
13874 |
+@@ -175,56 +176,6 @@ struct bfq_group; |
13875 |
+ |
13876 |
+ /** |
13877 |
+ * struct bfq_queue - leaf schedulable entity. |
13878 |
+- * @ref: reference counter. |
13879 |
+- * @bfqd: parent bfq_data. |
13880 |
+- * @new_ioprio: when an ioprio change is requested, the new ioprio value. |
13881 |
+- * @ioprio_class: the ioprio_class in use. |
13882 |
+- * @new_ioprio_class: when an ioprio_class change is requested, the new |
13883 |
+- * ioprio_class value. |
13884 |
+- * @new_bfqq: shared bfq_queue if queue is cooperating with |
13885 |
+- * one or more other queues. |
13886 |
+- * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). |
13887 |
+- * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). |
13888 |
+- * @sort_list: sorted list of pending requests. |
13889 |
+- * @next_rq: if fifo isn't expired, next request to serve. |
13890 |
+- * @queued: nr of requests queued in @sort_list. |
13891 |
+- * @allocated: currently allocated requests. |
13892 |
+- * @meta_pending: pending metadata requests. |
13893 |
+- * @fifo: fifo list of requests in sort_list. |
13894 |
+- * @entity: entity representing this queue in the scheduler. |
13895 |
+- * @max_budget: maximum budget allowed from the feedback mechanism. |
13896 |
+- * @budget_timeout: budget expiration (in jiffies). |
13897 |
+- * @dispatched: number of requests on the dispatch list or inside driver. |
13898 |
+- * @flags: status flags. |
13899 |
+- * @bfqq_list: node for active/idle bfqq list inside our bfqd. |
13900 |
+- * @burst_list_node: node for the device's burst list. |
13901 |
+- * @seek_samples: number of seeks sampled |
13902 |
+- * @seek_total: sum of the distances of the seeks sampled |
13903 |
+- * @seek_mean: mean seek distance |
13904 |
+- * @last_request_pos: position of the last request enqueued |
13905 |
+- * @requests_within_timer: number of consecutive pairs of request completion |
13906 |
+- * and arrival, such that the queue becomes idle |
13907 |
+- * after the completion, but the next request arrives |
13908 |
+- * within an idle time slice; used only if the queue's |
13909 |
+- * IO_bound has been cleared. |
13910 |
+- * @pid: pid of the process owning the queue, used for logging purposes. |
13911 |
+- * @last_wr_start_finish: start time of the current weight-raising period if |
13912 |
+- * the @bfq-queue is being weight-raised, otherwise |
13913 |
+- * finish time of the last weight-raising period |
13914 |
+- * @wr_cur_max_time: current max raising time for this queue |
13915 |
+- * @soft_rt_next_start: minimum time instant such that, only if a new |
13916 |
+- * request is enqueued after this time instant in an |
13917 |
+- * idle @bfq_queue with no outstanding requests, then |
13918 |
+- * the task associated with the queue it is deemed as |
13919 |
+- * soft real-time (see the comments to the function |
13920 |
+- * bfq_bfqq_softrt_next_start()) |
13921 |
+- * @last_idle_bklogged: time of the last transition of the @bfq_queue from |
13922 |
+- * idle to backlogged |
13923 |
+- * @service_from_backlogged: cumulative service received from the @bfq_queue |
13924 |
+- * since the last transition from idle to |
13925 |
+- * backlogged |
13926 |
+- * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the |
13927 |
+- * queue is shared |
13928 |
+ * |
13929 |
+ * A bfq_queue is a leaf request queue; it can be associated with an |
13930 |
+ * io_context or more, if it is async or shared between cooperating |
13931 |
+@@ -235,117 +186,163 @@ struct bfq_group; |
13932 |
+ * All the fields are protected by the queue lock of the containing bfqd. |
13933 |
+ */ |
13934 |
+ struct bfq_queue { |
13935 |
+- atomic_t ref; |
13936 |
++ /* reference counter */ |
13937 |
++ int ref; |
13938 |
++ /* parent bfq_data */ |
13939 |
+ struct bfq_data *bfqd; |
13940 |
+ |
13941 |
+- unsigned short ioprio, new_ioprio; |
13942 |
+- unsigned short ioprio_class, new_ioprio_class; |
13943 |
++ /* current ioprio and ioprio class */ |
13944 |
++ unsigned short ioprio, ioprio_class; |
13945 |
++ /* next ioprio and ioprio class if a change is in progress */ |
13946 |
++ unsigned short new_ioprio, new_ioprio_class; |
13947 |
+ |
13948 |
+- /* fields for cooperating queues handling */ |
13949 |
++ /* |
13950 |
++ * Shared bfq_queue if queue is cooperating with one or more |
13951 |
++ * other queues. |
13952 |
++ */ |
13953 |
+ struct bfq_queue *new_bfqq; |
13954 |
++ /* request-position tree member (see bfq_group's @rq_pos_tree) */ |
13955 |
+ struct rb_node pos_node; |
13956 |
++ /* request-position tree root (see bfq_group's @rq_pos_tree) */ |
13957 |
+ struct rb_root *pos_root; |
13958 |
+ |
13959 |
++ /* sorted list of pending requests */ |
13960 |
+ struct rb_root sort_list; |
13961 |
++ /* if fifo isn't expired, next request to serve */ |
13962 |
+ struct request *next_rq; |
13963 |
++ /* number of sync and async requests queued */ |
13964 |
+ int queued[2]; |
13965 |
++ /* number of sync and async requests currently allocated */ |
13966 |
+ int allocated[2]; |
13967 |
++ /* number of pending metadata requests */ |
13968 |
+ int meta_pending; |
13969 |
++ /* fifo list of requests in sort_list */ |
13970 |
+ struct list_head fifo; |
13971 |
+ |
13972 |
++ /* entity representing this queue in the scheduler */ |
13973 |
+ struct bfq_entity entity; |
13974 |
+ |
13975 |
++ /* maximum budget allowed from the feedback mechanism */ |
13976 |
+ int max_budget; |
13977 |
++ /* budget expiration (in jiffies) */ |
13978 |
+ unsigned long budget_timeout; |
13979 |
+ |
13980 |
++ /* number of requests on the dispatch list or inside driver */ |
13981 |
+ int dispatched; |
13982 |
+ |
13983 |
+- unsigned int flags; |
13984 |
++ unsigned int flags; /* status flags.*/ |
13985 |
+ |
13986 |
++ /* node for active/idle bfqq list inside parent bfqd */ |
13987 |
+ struct list_head bfqq_list; |
13988 |
+ |
13989 |
++ /* bit vector: a 1 for each seeky request in history */ |
13990 |
++ u32 seek_history; |
13991 |
++ |
13992 |
++ /* node for the device's burst list */ |
13993 |
+ struct hlist_node burst_list_node; |
13994 |
+ |
13995 |
+- unsigned int seek_samples; |
13996 |
+- u64 seek_total; |
13997 |
+- sector_t seek_mean; |
13998 |
++ /* position of the last request enqueued */ |
13999 |
+ sector_t last_request_pos; |
14000 |
+ |
14001 |
++ /* Number of consecutive pairs of request completion and |
14002 |
++ * arrival, such that the queue becomes idle after the |
14003 |
++ * completion, but the next request arrives within an idle |
14004 |
++ * time slice; used only if the queue's IO_bound flag has been |
14005 |
++ * cleared. |
14006 |
++ */ |
14007 |
+ unsigned int requests_within_timer; |
14008 |
+ |
14009 |
++ /* pid of the process owning the queue, used for logging purposes */ |
14010 |
+ pid_t pid; |
14011 |
++ |
14012 |
++ /* |
14013 |
++ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL |
14014 |
++ * if the queue is shared. |
14015 |
++ */ |
14016 |
+ struct bfq_io_cq *bic; |
14017 |
+ |
14018 |
+- /* weight-raising fields */ |
14019 |
++ /* current maximum weight-raising time for this queue */ |
14020 |
+ unsigned long wr_cur_max_time; |
14021 |
++ /* |
14022 |
++ * Minimum time instant such that, only if a new request is |
14023 |
++ * enqueued after this time instant in an idle @bfq_queue with |
14024 |
++ * no outstanding requests, then the task associated with the |
14025 |
++ * queue is deemed soft real-time (see the comments on |
14026 |
++ * the function bfq_bfqq_softrt_next_start()) |
14027 |
++ */ |
14028 |
+ unsigned long soft_rt_next_start; |
14029 |
++ /* |
14030 |
++ * Start time of the current weight-raising period if |
14031 |
++ * the @bfq_queue is being weight-raised, otherwise |
14032 |
++ * finish time of the last weight-raising period. |
14033 |
++ */ |
14034 |
+ unsigned long last_wr_start_finish; |
14035 |
++ /* factor by which the weight of this queue is multiplied */ |
14036 |
+ unsigned int wr_coeff; |
14037 |
++ /* |
14038 |
++ * Time of the last transition of the @bfq_queue from idle to |
14039 |
++ * backlogged. |
14040 |
++ */ |
14041 |
+ unsigned long last_idle_bklogged; |
14042 |
++ /* |
14043 |
++ * Cumulative service received from the @bfq_queue since the |
14044 |
++ * last transition from idle to backlogged. |
14045 |
++ */ |
14046 |
+ unsigned long service_from_backlogged; |
14047 |
++ |
14048 |
++ unsigned long split_time; /* time of last split */ |
14049 |
+ }; |
14050 |
+ |
14051 |
+ /** |
14052 |
+ * struct bfq_ttime - per process thinktime stats. |
14053 |
+- * @ttime_total: total process thinktime |
14054 |
+- * @ttime_samples: number of thinktime samples |
14055 |
+- * @ttime_mean: average process thinktime |
14056 |
+ */ |
14057 |
+ struct bfq_ttime { |
14058 |
+- unsigned long last_end_request; |
14059 |
++ unsigned long last_end_request; /* completion time of last request */ |
14060 |
++ |
14061 |
++ unsigned long ttime_total; /* total process thinktime */ |
14062 |
++ unsigned long ttime_samples; /* number of thinktime samples */ |
14063 |
++ unsigned long ttime_mean; /* average process thinktime */ |
14064 |
+ |
14065 |
+- unsigned long ttime_total; |
14066 |
+- unsigned long ttime_samples; |
14067 |
+- unsigned long ttime_mean; |
14068 |
+ }; |
14069 |
+ |
14070 |
+ /** |
14071 |
+ * struct bfq_io_cq - per (request_queue, io_context) structure. |
14072 |
+- * @icq: associated io_cq structure |
14073 |
+- * @bfqq: array of two process queues, the sync and the async |
14074 |
+- * @ttime: associated @bfq_ttime struct |
14075 |
+- * @ioprio: per (request_queue, blkcg) ioprio. |
14076 |
+- * @blkcg_id: id of the blkcg the related io_cq belongs to. |
14077 |
+- * @wr_time_left: snapshot of the time left before weight raising ends |
14078 |
+- * for the sync queue associated to this process; this |
14079 |
+- * snapshot is taken to remember this value while the weight |
14080 |
+- * raising is suspended because the queue is merged with a |
14081 |
+- * shared queue, and is used to set @raising_cur_max_time |
14082 |
+- * when the queue is split from the shared queue and its |
14083 |
+- * weight is raised again |
14084 |
+- * @saved_idle_window: same purpose as the previous field for the idle |
14085 |
+- * window |
14086 |
+- * @saved_IO_bound: same purpose as the previous two fields for the I/O |
14087 |
+- * bound classification of a queue |
14088 |
+- * @saved_in_large_burst: same purpose as the previous fields for the |
14089 |
+- * value of the field keeping the queue's belonging |
14090 |
+- * to a large burst |
14091 |
+- * @was_in_burst_list: true if the queue belonged to a burst list |
14092 |
+- * before its merge with another cooperating queue |
14093 |
+- * @cooperations: counter of consecutive successful queue merges underwent |
14094 |
+- * by any of the process' @bfq_queues |
14095 |
+- * @failed_cooperations: counter of consecutive failed queue merges of any |
14096 |
+- * of the process' @bfq_queues |
14097 |
+ */ |
14098 |
+ struct bfq_io_cq { |
14099 |
++ /* associated io_cq structure */ |
14100 |
+ struct io_cq icq; /* must be the first member */ |
14101 |
++ /* array of two process queues, the sync and the async */ |
14102 |
+ struct bfq_queue *bfqq[2]; |
14103 |
++ /* associated @bfq_ttime struct */ |
14104 |
+ struct bfq_ttime ttime; |
14105 |
++ /* per (request_queue, blkcg) ioprio */ |
14106 |
+ int ioprio; |
14107 |
+- |
14108 |
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED |
14109 |
+- uint64_t blkcg_id; /* the current blkcg ID */ |
14110 |
++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ |
14111 |
+ #endif |
14112 |
+ |
14113 |
+- unsigned int wr_time_left; |
14114 |
++ /* |
14115 |
++ * Snapshot of the idle window before merging; taken to |
14116 |
++ * remember this value while the queue is merged, so as to be |
14117 |
++ * able to restore it in case of split. |
14118 |
++ */ |
14119 |
+ bool saved_idle_window; |
14120 |
++ /* |
14121 |
++ * Same purpose as the previous two fields for the I/O bound |
14122 |
++ * classification of a queue. |
14123 |
++ */ |
14124 |
+ bool saved_IO_bound; |
14125 |
+ |
14126 |
++ /* |
14127 |
++ * Same purpose as the previous fields for the value of the |
14128 |
++ * field keeping the queue's belonging to a large burst |
14129 |
++ */ |
14130 |
+ bool saved_in_large_burst; |
14131 |
++ /* |
14132 |
++ * True if the queue belonged to a burst list before its merge |
14133 |
++ * with another cooperating queue. |
14134 |
++ */ |
14135 |
+ bool was_in_burst_list; |
14136 |
+- |
14137 |
+- unsigned int cooperations; |
14138 |
+- unsigned int failed_cooperations; |
14139 |
+ }; |
14140 |
+ |
14141 |
+ enum bfq_device_speed { |
14142 |
+@@ -354,224 +351,216 @@ enum bfq_device_speed { |
14143 |
+ }; |
14144 |
+ |
14145 |
+ /** |
14146 |
+- * struct bfq_data - per device data structure. |
14147 |
+- * @queue: request queue for the managed device. |
14148 |
+- * @root_group: root bfq_group for the device. |
14149 |
+- * @active_numerous_groups: number of bfq_groups containing more than one |
14150 |
+- * active @bfq_entity. |
14151 |
+- * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by |
14152 |
+- * weight. Used to keep track of whether all @bfq_queues |
14153 |
+- * have the same weight. The tree contains one counter |
14154 |
+- * for each distinct weight associated to some active |
14155 |
+- * and not weight-raised @bfq_queue (see the comments to |
14156 |
+- * the functions bfq_weights_tree_[add|remove] for |
14157 |
+- * further details). |
14158 |
+- * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted |
14159 |
+- * by weight. Used to keep track of whether all |
14160 |
+- * @bfq_groups have the same weight. The tree contains |
14161 |
+- * one counter for each distinct weight associated to |
14162 |
+- * some active @bfq_group (see the comments to the |
14163 |
+- * functions bfq_weights_tree_[add|remove] for further |
14164 |
+- * details). |
14165 |
+- * @busy_queues: number of bfq_queues containing requests (including the |
14166 |
+- * queue in service, even if it is idling). |
14167 |
+- * @busy_in_flight_queues: number of @bfq_queues containing pending or |
14168 |
+- * in-flight requests, plus the @bfq_queue in |
14169 |
+- * service, even if idle but waiting for the |
14170 |
+- * possible arrival of its next sync request. This |
14171 |
+- * field is updated only if the device is rotational, |
14172 |
+- * but used only if the device is also NCQ-capable. |
14173 |
+- * The reason why the field is updated also for non- |
14174 |
+- * NCQ-capable rotational devices is related to the |
14175 |
+- * fact that the value of @hw_tag may be set also |
14176 |
+- * later than when busy_in_flight_queues may need to |
14177 |
+- * be incremented for the first time(s). Taking also |
14178 |
+- * this possibility into account, to avoid unbalanced |
14179 |
+- * increments/decrements, would imply more overhead |
14180 |
+- * than just updating busy_in_flight_queues |
14181 |
+- * regardless of the value of @hw_tag. |
14182 |
+- * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues |
14183 |
+- * (that is, seeky queues that expired |
14184 |
+- * for budget timeout at least once) |
14185 |
+- * containing pending or in-flight |
14186 |
+- * requests, including the in-service |
14187 |
+- * @bfq_queue if constantly seeky. This |
14188 |
+- * field is updated only if the device |
14189 |
+- * is rotational, but used only if the |
14190 |
+- * device is also NCQ-capable (see the |
14191 |
+- * comments to @busy_in_flight_queues). |
14192 |
+- * @wr_busy_queues: number of weight-raised busy @bfq_queues. |
14193 |
+- * @queued: number of queued requests. |
14194 |
+- * @rq_in_driver: number of requests dispatched and waiting for completion. |
14195 |
+- * @sync_flight: number of sync requests in the driver. |
14196 |
+- * @max_rq_in_driver: max number of reqs in driver in the last |
14197 |
+- * @hw_tag_samples completed requests. |
14198 |
+- * @hw_tag_samples: nr of samples used to calculate hw_tag. |
14199 |
+- * @hw_tag: flag set to one if the driver is showing a queueing behavior. |
14200 |
+- * @budgets_assigned: number of budgets assigned. |
14201 |
+- * @idle_slice_timer: timer set when idling for the next sequential request |
14202 |
+- * from the queue in service. |
14203 |
+- * @unplug_work: delayed work to restart dispatching on the request queue. |
14204 |
+- * @in_service_queue: bfq_queue in service. |
14205 |
+- * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. |
14206 |
+- * @last_position: on-disk position of the last served request. |
14207 |
+- * @last_budget_start: beginning of the last budget. |
14208 |
+- * @last_idling_start: beginning of the last idle slice. |
14209 |
+- * @peak_rate: peak transfer rate observed for a budget. |
14210 |
+- * @peak_rate_samples: number of samples used to calculate @peak_rate. |
14211 |
+- * @bfq_max_budget: maximum budget allotted to a bfq_queue before |
14212 |
+- * rescheduling. |
14213 |
+- * @active_list: list of all the bfq_queues active on the device. |
14214 |
+- * @idle_list: list of all the bfq_queues idle on the device. |
14215 |
+- * @bfq_fifo_expire: timeout for async/sync requests; when it expires |
14216 |
+- * requests are served in fifo order. |
14217 |
+- * @bfq_back_penalty: weight of backward seeks wrt forward ones. |
14218 |
+- * @bfq_back_max: maximum allowed backward seek. |
14219 |
+- * @bfq_slice_idle: maximum idling time. |
14220 |
+- * @bfq_user_max_budget: user-configured max budget value |
14221 |
+- * (0 for auto-tuning). |
14222 |
+- * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to |
14223 |
+- * async queues. |
14224 |
+- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to |
14225 |
+- * to prevent seeky queues to impose long latencies to well |
14226 |
+- * behaved ones (this also implies that seeky queues cannot |
14227 |
+- * receive guarantees in the service domain; after a timeout |
14228 |
+- * they are charged for the whole allocated budget, to try |
14229 |
+- * to preserve a behavior reasonably fair among them, but |
14230 |
+- * without service-domain guarantees). |
14231 |
+- * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is |
14232 |
+- * no more granted any weight-raising. |
14233 |
+- * @bfq_failed_cooperations: number of consecutive failed cooperation |
14234 |
+- * chances after which weight-raising is restored |
14235 |
+- * to a queue subject to more than bfq_coop_thresh |
14236 |
+- * queue merges. |
14237 |
+- * @bfq_requests_within_timer: number of consecutive requests that must be |
14238 |
+- * issued within the idle time slice to set |
14239 |
+- * again idling to a queue which was marked as |
14240 |
+- * non-I/O-bound (see the definition of the |
14241 |
+- * IO_bound flag for further details). |
14242 |
+- * @last_ins_in_burst: last time at which a queue entered the current |
14243 |
+- * burst of queues being activated shortly after |
14244 |
+- * each other; for more details about this and the |
14245 |
+- * following parameters related to a burst of |
14246 |
+- * activations, see the comments to the function |
14247 |
+- * @bfq_handle_burst. |
14248 |
+- * @bfq_burst_interval: reference time interval used to decide whether a |
14249 |
+- * queue has been activated shortly after |
14250 |
+- * @last_ins_in_burst. |
14251 |
+- * @burst_size: number of queues in the current burst of queue activations. |
14252 |
+- * @bfq_large_burst_thresh: maximum burst size above which the current |
14253 |
+- * queue-activation burst is deemed as 'large'. |
14254 |
+- * @large_burst: true if a large queue-activation burst is in progress. |
14255 |
+- * @burst_list: head of the burst list (as for the above fields, more details |
14256 |
+- * in the comments to the function bfq_handle_burst). |
14257 |
+- * @low_latency: if set to true, low-latency heuristics are enabled. |
14258 |
+- * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised |
14259 |
+- * queue is multiplied. |
14260 |
+- * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). |
14261 |
+- * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. |
14262 |
+- * @bfq_wr_min_idle_time: minimum idle period after which weight-raising |
14263 |
+- * may be reactivated for a queue (in jiffies). |
14264 |
+- * @bfq_wr_min_inter_arr_async: minimum period between request arrivals |
14265 |
+- * after which weight-raising may be |
14266 |
+- * reactivated for an already busy queue |
14267 |
+- * (in jiffies). |
14268 |
+- * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, |
14269 |
+- * sectors per seconds. |
14270 |
+- * @RT_prod: cached value of the product R*T used for computing the maximum |
14271 |
+- * duration of the weight raising automatically. |
14272 |
+- * @device_speed: device-speed class for the low-latency heuristic. |
14273 |
+- * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. |
14274 |
++ * struct bfq_data - per-device data structure. |
14275 |
+ * |
14276 |
+ * All the fields are protected by the @queue lock. |
14277 |
+ */ |
14278 |
+ struct bfq_data { |
14279 |
++ /* request queue for the device */ |
14280 |
+ struct request_queue *queue; |
14281 |
+ |
14282 |
++ /* root bfq_group for the device */ |
14283 |
+ struct bfq_group *root_group; |
14284 |
+ |
14285 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
14286 |
+- int active_numerous_groups; |
14287 |
+-#endif |
14288 |
+- |
14289 |
++ /* |
14290 |
++ * rbtree of weight counters of @bfq_queues, sorted by |
14291 |
++ * weight. Used to keep track of whether all @bfq_queues have |
14292 |
++ * the same weight. The tree contains one counter for each |
14293 |
++ * distinct weight associated to some active and not |
14294 |
++ * weight-raised @bfq_queue (see the comments to the functions |
14295 |
++ * bfq_weights_tree_[add|remove] for further details). |
14296 |
++ */ |
14297 |
+ struct rb_root queue_weights_tree; |
14298 |
++ /* |
14299 |
++ * rbtree of non-queue @bfq_entity weight counters, sorted by |
14300 |
++ * weight. Used to keep track of whether all @bfq_groups have |
14301 |
++ * the same weight. The tree contains one counter for each |
14302 |
++ * distinct weight associated to some active @bfq_group (see |
14303 |
++ * the comments to the functions bfq_weights_tree_[add|remove] |
14304 |
++ * for further details). |
14305 |
++ */ |
14306 |
+ struct rb_root group_weights_tree; |
14307 |
+ |
14308 |
++ /* |
14309 |
++ * Number of bfq_queues containing requests (including the |
14310 |
++ * queue in service, even if it is idling). |
14311 |
++ */ |
14312 |
+ int busy_queues; |
14313 |
+- int busy_in_flight_queues; |
14314 |
+- int const_seeky_busy_in_flight_queues; |
14315 |
++ /* number of weight-raised busy @bfq_queues */ |
14316 |
+ int wr_busy_queues; |
14317 |
++ /* number of queued requests */ |
14318 |
+ int queued; |
14319 |
++ /* number of requests dispatched and waiting for completion */ |
14320 |
+ int rq_in_driver; |
14321 |
+- int sync_flight; |
14322 |
+ |
14323 |
++ /* |
14324 |
++ * Maximum number of requests in driver in the last |
14325 |
++ * @hw_tag_samples completed requests. |
14326 |
++ */ |
14327 |
+ int max_rq_in_driver; |
14328 |
++ /* number of samples used to calculate hw_tag */ |
14329 |
+ int hw_tag_samples; |
14330 |
++ /* flag set to one if the driver is showing a queueing behavior */ |
14331 |
+ int hw_tag; |
14332 |
+ |
14333 |
++ /* number of budgets assigned */ |
14334 |
+ int budgets_assigned; |
14335 |
+ |
14336 |
++ /* |
14337 |
++ * Timer set when idling (waiting) for the next request from |
14338 |
++ * the queue in service. |
14339 |
++ */ |
14340 |
+ struct timer_list idle_slice_timer; |
14341 |
++ /* delayed work to restart dispatching on the request queue */ |
14342 |
+ struct work_struct unplug_work; |
14343 |
+ |
14344 |
++ /* bfq_queue in service */ |
14345 |
+ struct bfq_queue *in_service_queue; |
14346 |
++ /* bfq_io_cq (bic) associated with the @in_service_queue */ |
14347 |
+ struct bfq_io_cq *in_service_bic; |
14348 |
+ |
14349 |
++ /* on-disk position of the last served request */ |
14350 |
+ sector_t last_position; |
14351 |
+ |
14352 |
++ /* beginning of the last budget */ |
14353 |
+ ktime_t last_budget_start; |
14354 |
++ /* beginning of the last idle slice */ |
14355 |
+ ktime_t last_idling_start; |
14356 |
++ /* number of samples used to calculate @peak_rate */ |
14357 |
+ int peak_rate_samples; |
14358 |
++ /* peak transfer rate observed for a budget */ |
14359 |
+ u64 peak_rate; |
14360 |
++ /* maximum budget allotted to a bfq_queue before rescheduling */ |
14361 |
+ int bfq_max_budget; |
14362 |
+ |
14363 |
++ /* list of all the bfq_queues active on the device */ |
14364 |
+ struct list_head active_list; |
14365 |
++ /* list of all the bfq_queues idle on the device */ |
14366 |
+ struct list_head idle_list; |
14367 |
+ |
14368 |
++ /* |
14369 |
++ * Timeout for async/sync requests; when it fires, requests |
14370 |
++ * are served in fifo order. |
14371 |
++ */ |
14372 |
+ unsigned int bfq_fifo_expire[2]; |
14373 |
++ /* weight of backward seeks wrt forward ones */ |
14374 |
+ unsigned int bfq_back_penalty; |
14375 |
++ /* maximum allowed backward seek */ |
14376 |
+ unsigned int bfq_back_max; |
14377 |
++ /* maximum idling time */ |
14378 |
+ unsigned int bfq_slice_idle; |
14379 |
++ /* last time CLASS_IDLE was served */ |
14380 |
+ u64 bfq_class_idle_last_service; |
14381 |
+ |
14382 |
++ /* user-configured max budget value (0 for auto-tuning) */ |
14383 |
+ int bfq_user_max_budget; |
14384 |
+- int bfq_max_budget_async_rq; |
14385 |
+- unsigned int bfq_timeout[2]; |
14386 |
+- |
14387 |
+- unsigned int bfq_coop_thresh; |
14388 |
+- unsigned int bfq_failed_cooperations; |
14389 |
++ /* |
14390 |
++ * Timeout for bfq_queues to consume their budget; used to |
14391 |
++ * prevent seeky queues from imposing long latencies to |
14392 |
++ * sequential or quasi-sequential ones (this also implies that |
14393 |
++ * seeky queues cannot receive guarantees in the service |
14394 |
++ * domain; after a timeout they are charged for the time they |
14395 |
++ * have been in service, to preserve fairness among them, but |
14396 |
++ * without service-domain guarantees). |
14397 |
++ */ |
14398 |
++ unsigned int bfq_timeout; |
14399 |
++ |
14400 |
++ /* |
14401 |
++ * Number of consecutive requests that must be issued within |
14402 |
++ * the idle time slice to set again idling to a queue which |
14403 |
++ * was marked as non-I/O-bound (see the definition of the |
14404 |
++ * IO_bound flag for further details). |
14405 |
++ */ |
14406 |
+ unsigned int bfq_requests_within_timer; |
14407 |
+ |
14408 |
++ /* |
14409 |
++ * Force device idling whenever needed to provide accurate |
14410 |
++ * service guarantees, without caring about throughput |
14411 |
++ * issues. CAVEAT: this may even increase latencies, in case |
14412 |
++ * of useless idling for processes that did stop doing I/O. |
14413 |
++ */ |
14414 |
++ bool strict_guarantees; |
14415 |
++ |
14416 |
++ /* |
14417 |
++ * Last time at which a queue entered the current burst of |
14418 |
++ * queues being activated shortly after each other; for more |
14419 |
++ * details about this and the following parameters related to |
14420 |
++ * a burst of activations, see the comments on the function |
14421 |
++ * bfq_handle_burst. |
14422 |
++ */ |
14423 |
+ unsigned long last_ins_in_burst; |
14424 |
++ /* |
14425 |
++ * Reference time interval used to decide whether a queue has |
14426 |
++ * been activated shortly after @last_ins_in_burst. |
14427 |
++ */ |
14428 |
+ unsigned long bfq_burst_interval; |
14429 |
++ /* number of queues in the current burst of queue activations */ |
14430 |
+ int burst_size; |
14431 |
++ |
14432 |
++ /* common parent entity for the queues in the burst */ |
14433 |
++ struct bfq_entity *burst_parent_entity; |
14434 |
++ /* Maximum burst size above which the current queue-activation |
14435 |
++ * burst is deemed as 'large'. |
14436 |
++ */ |
14437 |
+ unsigned long bfq_large_burst_thresh; |
14438 |
++ /* true if a large queue-activation burst is in progress */ |
14439 |
+ bool large_burst; |
14440 |
++ /* |
14441 |
++ * Head of the burst list (as for the above fields, more |
14442 |
++ * details in the comments on the function bfq_handle_burst). |
14443 |
++ */ |
14444 |
+ struct hlist_head burst_list; |
14445 |
+ |
14446 |
++ /* if set to true, low-latency heuristics are enabled */ |
14447 |
+ bool low_latency; |
14448 |
+- |
14449 |
+- /* parameters of the low_latency heuristics */ |
14450 |
++ /* |
14451 |
++ * Maximum factor by which the weight of a weight-raised queue |
14452 |
++ * is multiplied. |
14453 |
++ */ |
14454 |
+ unsigned int bfq_wr_coeff; |
14455 |
++ /* maximum duration of a weight-raising period (jiffies) */ |
14456 |
+ unsigned int bfq_wr_max_time; |
14457 |
++ |
14458 |
++ /* Maximum weight-raising duration for soft real-time processes */ |
14459 |
+ unsigned int bfq_wr_rt_max_time; |
14460 |
++ /* |
14461 |
++ * Minimum idle period after which weight-raising may be |
14462 |
++ * reactivated for a queue (in jiffies). |
14463 |
++ */ |
14464 |
+ unsigned int bfq_wr_min_idle_time; |
14465 |
++ /* |
14466 |
++ * Minimum period between request arrivals after which |
14467 |
++ * weight-raising may be reactivated for an already busy async |
14468 |
++ * queue (in jiffies). |
14469 |
++ */ |
14470 |
+ unsigned long bfq_wr_min_inter_arr_async; |
14471 |
++ |
14472 |
++ /* Max service-rate for a soft real-time queue, in sectors/sec */ |
14473 |
+ unsigned int bfq_wr_max_softrt_rate; |
14474 |
++ /* |
14475 |
++ * Cached value of the product R*T, used for computing the |
14476 |
++ * maximum duration of weight raising automatically. |
14477 |
++ */ |
14478 |
+ u64 RT_prod; |
14479 |
++ /* device-speed class for the low-latency heuristic */ |
14480 |
+ enum bfq_device_speed device_speed; |
14481 |
+ |
14482 |
++ /* fallback dummy bfqq for extreme OOM conditions */ |
14483 |
+ struct bfq_queue oom_bfqq; |
14484 |
+ }; |
14485 |
+ |
14486 |
+ enum bfqq_state_flags { |
14487 |
+- BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ |
14488 |
++ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ |
14489 |
++ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ |
14490 |
+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ |
14491 |
++ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* |
14492 |
++ * waiting for a request |
14493 |
++ * without idling the device |
14494 |
++ */ |
14495 |
+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ |
14496 |
+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ |
14497 |
+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ |
14498 |
+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ |
14499 |
+- BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ |
14500 |
+ BFQ_BFQQ_FLAG_IO_bound, /* |
14501 |
+ * bfqq has timed-out at least once |
14502 |
+ * having consumed at most 2/10 of |
14503 |
+@@ -581,17 +570,12 @@ enum bfqq_state_flags { |
14504 |
+ * bfqq activated in a large burst, |
14505 |
+ * see comments to bfq_handle_burst. |
14506 |
+ */ |
14507 |
+- BFQ_BFQQ_FLAG_constantly_seeky, /* |
14508 |
+- * bfqq has proved to be slow and |
14509 |
+- * seeky until budget timeout |
14510 |
+- */ |
14511 |
+ BFQ_BFQQ_FLAG_softrt_update, /* |
14512 |
+ * may need softrt-next-start |
14513 |
+ * update |
14514 |
+ */ |
14515 |
+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ |
14516 |
+- BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ |
14517 |
+- BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ |
14518 |
++ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ |
14519 |
+ }; |
14520 |
+ |
14521 |
+ #define BFQ_BFQQ_FNS(name) \ |
14522 |
+@@ -608,25 +592,53 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ |
14523 |
+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ |
14524 |
+ } |
14525 |
+ |
14526 |
++BFQ_BFQQ_FNS(just_created); |
14527 |
+ BFQ_BFQQ_FNS(busy); |
14528 |
+ BFQ_BFQQ_FNS(wait_request); |
14529 |
++BFQ_BFQQ_FNS(non_blocking_wait_rq); |
14530 |
+ BFQ_BFQQ_FNS(must_alloc); |
14531 |
+ BFQ_BFQQ_FNS(fifo_expire); |
14532 |
+ BFQ_BFQQ_FNS(idle_window); |
14533 |
+ BFQ_BFQQ_FNS(sync); |
14534 |
+-BFQ_BFQQ_FNS(budget_new); |
14535 |
+ BFQ_BFQQ_FNS(IO_bound); |
14536 |
+ BFQ_BFQQ_FNS(in_large_burst); |
14537 |
+-BFQ_BFQQ_FNS(constantly_seeky); |
14538 |
+ BFQ_BFQQ_FNS(coop); |
14539 |
+ BFQ_BFQQ_FNS(split_coop); |
14540 |
+-BFQ_BFQQ_FNS(just_split); |
14541 |
+ BFQ_BFQQ_FNS(softrt_update); |
14542 |
+ #undef BFQ_BFQQ_FNS |
14543 |
+ |
14544 |
+ /* Logging facilities. */ |
14545 |
+-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ |
14546 |
+- blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) |
14547 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
14548 |
++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); |
14549 |
++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); |
14550 |
++ |
14551 |
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ |
14552 |
++ char __pbuf[128]; \ |
14553 |
++ \ |
14554 |
++ assert_spin_locked((bfqd)->queue->queue_lock); \ |
14555 |
++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ |
14556 |
++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ |
14557 |
++ (bfqq)->pid, \ |
14558 |
++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ |
14559 |
++ __pbuf, ##args); \ |
14560 |
++} while (0) |
14561 |
++ |
14562 |
++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ |
14563 |
++ char __pbuf[128]; \ |
14564 |
++ \ |
14565 |
++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ |
14566 |
++ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ |
14567 |
++} while (0) |
14568 |
++ |
14569 |
++#else /* CONFIG_BFQ_GROUP_IOSCHED */ |
14570 |
++ |
14571 |
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ |
14572 |
++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ |
14573 |
++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ |
14574 |
++ ##args) |
14575 |
++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) |
14576 |
++ |
14577 |
++#endif /* CONFIG_BFQ_GROUP_IOSCHED */ |
14578 |
+ |
14579 |
+ #define bfq_log(bfqd, fmt, args...) \ |
14580 |
+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) |
14581 |
+@@ -640,15 +652,12 @@ enum bfqq_expiration { |
14582 |
+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ |
14583 |
+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ |
14584 |
+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ |
14585 |
++ BFQ_BFQQ_PREEMPTED /* preemption in progress */ |
14586 |
+ }; |
14587 |
+ |
14588 |
+-#ifdef CONFIG_BFQ_GROUP_IOSCHED |
14589 |
+ |
14590 |
+ struct bfqg_stats { |
14591 |
+- /* total bytes transferred */ |
14592 |
+- struct blkg_rwstat service_bytes; |
14593 |
+- /* total IOs serviced, post merge */ |
14594 |
+- struct blkg_rwstat serviced; |
14595 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
14596 |
+ /* number of ios merged */ |
14597 |
+ struct blkg_rwstat merged; |
14598 |
+ /* total time spent on device in ns, may not be accurate w/ queueing */ |
14599 |
+@@ -657,12 +666,8 @@ struct bfqg_stats { |
14600 |
+ struct blkg_rwstat wait_time; |
14601 |
+ /* number of IOs queued up */ |
14602 |
+ struct blkg_rwstat queued; |
14603 |
+- /* total sectors transferred */ |
14604 |
+- struct blkg_stat sectors; |
14605 |
+ /* total disk time and nr sectors dispatched by this group */ |
14606 |
+ struct blkg_stat time; |
14607 |
+- /* time not charged to this cgroup */ |
14608 |
+- struct blkg_stat unaccounted_time; |
14609 |
+ /* sum of number of ios queued across all samples */ |
14610 |
+ struct blkg_stat avg_queue_size_sum; |
14611 |
+ /* count of samples taken for average */ |
14612 |
+@@ -680,8 +685,10 @@ struct bfqg_stats { |
14613 |
+ uint64_t start_idle_time; |
14614 |
+ uint64_t start_empty_time; |
14615 |
+ uint16_t flags; |
14616 |
++#endif |
14617 |
+ }; |
14618 |
+ |
14619 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
14620 |
+ /* |
14621 |
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. |
14622 |
+ * |
14623 |
+@@ -712,7 +719,7 @@ struct bfq_group_data { |
14624 |
+ * unused for the root group. Used to know whether there |
14625 |
+ * are groups with more than one active @bfq_entity |
14626 |
+ * (see the comments to the function |
14627 |
+- * bfq_bfqq_must_not_expire()). |
14628 |
++ * bfq_bfqq_may_idle()). |
14629 |
+ * @rq_pos_tree: rbtree sorted by next_request position, used when |
14630 |
+ * determining if two or more queues have interleaving |
14631 |
+ * requests (see bfq_find_close_cooperator()). |
14632 |
+@@ -745,7 +752,6 @@ struct bfq_group { |
14633 |
+ struct rb_root rq_pos_tree; |
14634 |
+ |
14635 |
+ struct bfqg_stats stats; |
14636 |
+- struct bfqg_stats dead_stats; /* stats pushed from dead children */ |
14637 |
+ }; |
14638 |
+ |
14639 |
+ #else |
14640 |
+@@ -767,11 +773,25 @@ bfq_entity_service_tree(struct bfq_entity *entity) |
14641 |
+ struct bfq_sched_data *sched_data = entity->sched_data; |
14642 |
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
14643 |
+ unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : |
14644 |
+- BFQ_DEFAULT_GRP_CLASS; |
14645 |
++ BFQ_DEFAULT_GRP_CLASS - 1; |
14646 |
+ |
14647 |
+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); |
14648 |
+ BUG_ON(sched_data == NULL); |
14649 |
+ |
14650 |
++ if (bfqq) |
14651 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
14652 |
++ "entity_service_tree %p %d", |
14653 |
++ sched_data->service_tree + idx, idx) ; |
14654 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
14655 |
++ else { |
14656 |
++ struct bfq_group *bfqg = |
14657 |
++ container_of(entity, struct bfq_group, entity); |
14658 |
++ |
14659 |
++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, |
14660 |
++ "entity_service_tree %p %d", |
14661 |
++ sched_data->service_tree + idx, idx) ; |
14662 |
++ } |
14663 |
++#endif |
14664 |
+ return sched_data->service_tree + idx; |
14665 |
+ } |
14666 |
+ |
14667 |
+@@ -791,47 +811,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) |
14668 |
+ return bic->icq.q->elevator->elevator_data; |
14669 |
+ } |
14670 |
+ |
14671 |
+-/** |
14672 |
+- * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. |
14673 |
+- * @ptr: a pointer to a bfqd. |
14674 |
+- * @flags: storage for the flags to be saved. |
14675 |
+- * |
14676 |
+- * This function allows bfqg->bfqd to be protected by the |
14677 |
+- * queue lock of the bfqd they reference; the pointer is dereferenced |
14678 |
+- * under RCU, so the storage for bfqd is assured to be safe as long |
14679 |
+- * as the RCU read side critical section does not end. After the |
14680 |
+- * bfqd->queue->queue_lock is taken the pointer is rechecked, to be |
14681 |
+- * sure that no other writer accessed it. If we raced with a writer, |
14682 |
+- * the function returns NULL, with the queue unlocked, otherwise it |
14683 |
+- * returns the dereferenced pointer, with the queue locked. |
14684 |
+- */ |
14685 |
+-static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) |
14686 |
+-{ |
14687 |
+- struct bfq_data *bfqd; |
14688 |
+- |
14689 |
+- rcu_read_lock(); |
14690 |
+- bfqd = rcu_dereference(*(struct bfq_data **)ptr); |
14691 |
+- |
14692 |
+- if (bfqd != NULL) { |
14693 |
+- spin_lock_irqsave(bfqd->queue->queue_lock, *flags); |
14694 |
+- if (ptr == NULL) |
14695 |
+- printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); |
14696 |
+- else if (*ptr == bfqd) |
14697 |
+- goto out; |
14698 |
+- spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
14699 |
+- } |
14700 |
+- |
14701 |
+- bfqd = NULL; |
14702 |
+-out: |
14703 |
+- rcu_read_unlock(); |
14704 |
+- return bfqd; |
14705 |
+-} |
14706 |
+- |
14707 |
+-static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) |
14708 |
+-{ |
14709 |
+- spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
14710 |
+-} |
14711 |
+- |
14712 |
+ #ifdef CONFIG_BFQ_GROUP_IOSCHED |
14713 |
+ |
14714 |
+ static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) |
14715 |
+@@ -857,11 +836,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); |
14716 |
+ static void bfq_put_queue(struct bfq_queue *bfqq); |
14717 |
+ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); |
14718 |
+ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
14719 |
+- struct bio *bio, int is_sync, |
14720 |
+- struct bfq_io_cq *bic, gfp_t gfp_mask); |
14721 |
++ struct bio *bio, bool is_sync, |
14722 |
++ struct bfq_io_cq *bic); |
14723 |
+ static void bfq_end_wr_async_queues(struct bfq_data *bfqd, |
14724 |
+ struct bfq_group *bfqg); |
14725 |
++#ifdef CONFIG_BFQ_GROUP_IOSCHED |
14726 |
+ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); |
14727 |
++#endif |
14728 |
+ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
14729 |
+ |
14730 |
+ #endif /* _BFQ_H */ |
14731 |
+-- |
14732 |
+1.9.1 |
14733 |
+ |