Gentoo Archives: gentoo-commits

From: "Tom Wijsman (tomwij)" <tomwij@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2666 - in genpatches-2.6/trunk: 3.13 3.14
Date: Fri, 07 Feb 2014 15:42:40
Message-Id: 20140207154235.ECCAA2004C@flycatcher.gentoo.org
1 Author: tomwij
2 Date: 2014-02-07 15:42:35 +0000 (Fri, 07 Feb 2014)
3 New Revision: 2666
4
5 Added:
6 genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
7 genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
8 genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
9 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
10 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
11 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
12 Removed:
13 genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
14 genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
15 genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
16 Modified:
17 genpatches-2.6/trunk/3.13/0000_README
18 genpatches-2.6/trunk/3.14/0000_README
19 Log:
20 Updated experimental BFQ patches to new revision v7r1.
21
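Once a kernel built from these patches has CONFIG_IOSCHED_BFQ enabled, bfq becomes selectable through the usual per-device elevator interface in sysfs. As a minimal, illustrative check (not part of this commit; the device name sda is only an example), a small C program can print the scheduler list, in which the active elevator appears in square brackets:

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

        if (!f)
                return 1;
        /* Single line such as "noop deadline cfq [bfq]". */
        if (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}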
22 Modified: genpatches-2.6/trunk/3.13/0000_README
23 ===================================================================
24 --- genpatches-2.6/trunk/3.13/0000_README 2014-02-07 14:46:59 UTC (rev 2665)
25 +++ genpatches-2.6/trunk/3.13/0000_README 2014-02-07 15:42:35 UTC (rev 2666)
26 @@ -91,17 +91,17 @@
27 From: Tom Wijsman <TomWij@g.o>
28 Desc: Add Gentoo Linux support config settings and defaults.
29
30 -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7-3.13.patch
31 +Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r1-3.13.patch
32 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
33 -Desc: BFQ v7 patch 1 for 3.13: Build, cgroups and kconfig bits
34 +Desc: BFQ v7r1 patch 1 for 3.13: Build, cgroups and kconfig bits
35
36 -Patch: 5000_BFQ-2-block-introduce-the-v7-I-O-sched-for-3.13.patch1
37 +Patch: 5000_BFQ-2-block-introduce-the-v7r1-I-O-sched-for-3.13.patch1
38 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
39 -Desc: BFQ v7 patch 2 for 3.13: BFQ Scheduler
40 +Desc: BFQ v7r1 patch 2 for 3.13: BFQ Scheduler
41
42 -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7-for-3.13.0.patch
43 +Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r1-for-3.13.0.patch
44 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
45 -Desc: BFQ v7 patch 3 for 3.13: Early Queue Merge (EQM)
46 +Desc: BFQ v7r1 patch 3 for 3.13: Early Queue Merge (EQM)
47
48 Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
49 From: https://github.com/graysky2/kernel_gcc_patch/
50
51 Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
52 ===================================================================
53 --- genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-02-07 14:46:59 UTC (rev 2665)
54 +++ genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666)
55 @@ -1,104 +0,0 @@
56 -From 7f029ed2a02bea57b791c032d6242129c3372a84 Mon Sep 17 00:00:00 2001
57 -From: Paolo Valente <paolo.valente@×××××××.it>
58 -Date: Tue, 3 Sep 2013 16:50:42 +0200
59 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7-3.13
60 -
61 -Update Kconfig.iosched and do the related Makefile changes to include
62 -kernel configuration options for BFQ. Also add the bfqio controller
63 -to the cgroups subsystem.
64 -
65 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
66 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
67 ----
68 - block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
69 - block/Makefile | 1 +
70 - include/linux/cgroup_subsys.h | 4 ++++
71 - 3 files changed, 37 insertions(+)
72 -
73 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
74 -index 421bef9..8f552ba 100644
75 ---- a/block/Kconfig.iosched
76 -+++ b/block/Kconfig.iosched
77 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
78 - ---help---
79 - Enable group IO scheduling in CFQ.
80 -
81 -+config IOSCHED_BFQ
82 -+ tristate "BFQ I/O scheduler"
83 -+ default n
84 -+ ---help---
85 -+ The BFQ I/O scheduler tries to distribute bandwidth among
86 -+ all processes according to their weights.
87 -+ It aims at distributing the bandwidth as desired, independently of
88 -+ the disk parameters and with any workload. It also tries to
89 -+ guarantee low latency to interactive and soft real-time
90 -+ applications. If compiled built-in (saying Y here), BFQ can
91 -+ be configured to support hierarchical scheduling.
92 -+
93 -+config CGROUP_BFQIO
94 -+ bool "BFQ hierarchical scheduling support"
95 -+ depends on CGROUPS && IOSCHED_BFQ=y
96 -+ default n
97 -+ ---help---
98 -+ Enable hierarchical scheduling in BFQ, using the cgroups
99 -+ filesystem interface. The name of the subsystem will be
100 -+ bfqio.
101 -+
102 - choice
103 - prompt "Default I/O scheduler"
104 - default DEFAULT_CFQ
105 -@@ -52,6 +73,16 @@ choice
106 - config DEFAULT_CFQ
107 - bool "CFQ" if IOSCHED_CFQ=y
108 -
109 -+ config DEFAULT_BFQ
110 -+ bool "BFQ" if IOSCHED_BFQ=y
111 -+ help
112 -+ Selects BFQ as the default I/O scheduler which will be
113 -+ used by default for all block devices.
114 -+ The BFQ I/O scheduler aims at distributing the bandwidth
115 -+ as desired, independently of the disk parameters and with
116 -+ any workload. It also tries to guarantee low latency to
117 -+ interactive and soft real-time applications.
118 -+
119 - config DEFAULT_NOOP
120 - bool "No-op"
121 -
122 -@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
123 - string
124 - default "deadline" if DEFAULT_DEADLINE
125 - default "cfq" if DEFAULT_CFQ
126 -+ default "bfq" if DEFAULT_BFQ
127 - default "noop" if DEFAULT_NOOP
128 -
129 - endmenu
130 -diff --git a/block/Makefile b/block/Makefile
131 -index 20645e8..cbd83fb 100644
132 ---- a/block/Makefile
133 -+++ b/block/Makefile
134 -@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
135 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
136 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
137 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
138 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
139 -
140 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
141 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
142 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
143 -index b613ffd..43c5dc9 100644
144 ---- a/include/linux/cgroup_subsys.h
145 -+++ b/include/linux/cgroup_subsys.h
146 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
147 - SUBSYS(blkio)
148 - #endif
149 -
150 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
151 -+SUBSYS(bfqio)
152 -+#endif
153 -+
154 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
155 - SUBSYS(perf)
156 - #endif
157 ---
158 -1.8.5.2
159 -
160
161 Added: genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
162 ===================================================================
163 --- genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch (rev 0)
164 +++ genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666)
165 @@ -0,0 +1,104 @@
166 +From ae1b820a5286601aa9d5426459f8f3de658342b4 Mon Sep 17 00:00:00 2001
167 +From: Paolo Valente <paolo.valente@×××××××.it>
168 +Date: Tue, 3 Sep 2013 16:50:42 +0200
169 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r1-3.13
170 +
171 +Update Kconfig.iosched and do the related Makefile changes to include
172 +kernel configuration options for BFQ. Also add the bfqio controller
173 +to the cgroups subsystem.
174 +
175 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
176 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
177 +---
178 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
179 + block/Makefile | 1 +
180 + include/linux/cgroup_subsys.h | 4 ++++
181 + 3 files changed, 37 insertions(+)
182 +
183 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
184 +index 421bef9..8f552ba 100644
185 +--- a/block/Kconfig.iosched
186 ++++ b/block/Kconfig.iosched
187 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
188 + ---help---
189 + Enable group IO scheduling in CFQ.
190 +
191 ++config IOSCHED_BFQ
192 ++ tristate "BFQ I/O scheduler"
193 ++ default n
194 ++ ---help---
195 ++ The BFQ I/O scheduler tries to distribute bandwidth among
196 ++ all processes according to their weights.
197 ++ It aims at distributing the bandwidth as desired, independently of
198 ++ the disk parameters and with any workload. It also tries to
199 ++ guarantee low latency to interactive and soft real-time
200 ++ applications. If compiled built-in (saying Y here), BFQ can
201 ++ be configured to support hierarchical scheduling.
202 ++
203 ++config CGROUP_BFQIO
204 ++ bool "BFQ hierarchical scheduling support"
205 ++ depends on CGROUPS && IOSCHED_BFQ=y
206 ++ default n
207 ++ ---help---
208 ++ Enable hierarchical scheduling in BFQ, using the cgroups
209 ++ filesystem interface. The name of the subsystem will be
210 ++ bfqio.
211 ++
212 + choice
213 + prompt "Default I/O scheduler"
214 + default DEFAULT_CFQ
215 +@@ -52,6 +73,16 @@ choice
216 + config DEFAULT_CFQ
217 + bool "CFQ" if IOSCHED_CFQ=y
218 +
219 ++ config DEFAULT_BFQ
220 ++ bool "BFQ" if IOSCHED_BFQ=y
221 ++ help
222 ++ Selects BFQ as the default I/O scheduler which will be
223 ++ used by default for all block devices.
224 ++ The BFQ I/O scheduler aims at distributing the bandwidth
225 ++ as desired, independently of the disk parameters and with
226 ++ any workload. It also tries to guarantee low latency to
227 ++ interactive and soft real-time applications.
228 ++
229 + config DEFAULT_NOOP
230 + bool "No-op"
231 +
232 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
233 + string
234 + default "deadline" if DEFAULT_DEADLINE
235 + default "cfq" if DEFAULT_CFQ
236 ++ default "bfq" if DEFAULT_BFQ
237 + default "noop" if DEFAULT_NOOP
238 +
239 + endmenu
240 +diff --git a/block/Makefile b/block/Makefile
241 +index 20645e8..cbd83fb 100644
242 +--- a/block/Makefile
243 ++++ b/block/Makefile
244 +@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
245 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
246 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
247 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
248 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
249 +
250 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
251 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
252 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
253 +index b613ffd..43c5dc9 100644
254 +--- a/include/linux/cgroup_subsys.h
255 ++++ b/include/linux/cgroup_subsys.h
256 +@@ -39,6 +39,10 @@ SUBSYS(net_cls)
257 + SUBSYS(blkio)
258 + #endif
259 +
260 ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
261 ++SUBSYS(bfqio)
262 ++#endif
263 ++
264 + #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
265 + SUBSYS(perf)
266 + #endif
267 +--
268 +1.8.5.2
269 +
270
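The CONFIG_CGROUP_BFQIO option added above exposes a cgroup controller named bfqio; its per-group files (weight, ioprio and ioprio_class, defined in bfq-cgroup.c in the BFQ-2 patch below) control how BFQ shares the disk between groups. Purely as a sketch, assuming the controller is mounted at /sys/fs/cgroup/bfqio, that a group "mygroup" already exists there, and that the files carry the usual subsystem-name prefix, the group weight could be set from user space like this (the value 100 is arbitrary; the accepted range is bounded by BFQ_MIN_WEIGHT and BFQ_MAX_WEIGHT in the scheduler patch):

#include <stdio.h>

int main(void)
{
        /* Hypothetical path: mount point and group name are assumptions. */
        FILE *f = fopen("/sys/fs/cgroup/bfqio/mygroup/bfqio.weight", "w");

        if (!f)
                return 1;
        fprintf(f, "100\n");
        return fclose(f) ? 1 : 0;
}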
271 Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
272 ===================================================================
273 --- genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-02-07 14:46:59 UTC (rev 2665)
274 +++ genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666)
275 @@ -1,6008 +0,0 @@
276 -From 3747f129106ce58fbad1b8f05cc836a6addd8588 Mon Sep 17 00:00:00 2001
277 -From: Paolo Valente <paolo.valente@×××××××.it>
278 -Date: Thu, 9 May 2013 19:10:02 +0200
279 -Subject: [PATCH 2/3] block: introduce the BFQ-v7 I/O sched for 3.13
280 -
281 -Add the BFQ-v7 I/O scheduler to 3.13.
282 -The general structure is borrowed from CFQ, as much of the code for
283 -handling I/O contexts. Over time, several useful features have been
284 -ported from CFQ as well (details in the changelog in README.BFQ). A
285 -(bfq_)queue is associated to each task doing I/O on a device, and each
286 -time a scheduling decision has to be made a queue is selected and served
287 -until it expires.
288 -
289 - - Slices are given in the service domain: tasks are assigned
290 - budgets, measured in number of sectors. Once got the disk, a task
291 - must however consume its assigned budget within a configurable
292 - maximum time (by default, the maximum possible value of the
293 - budgets is automatically computed to comply with this timeout).
294 - This allows the desired latency vs "throughput boosting" tradeoff
295 - to be set.
296 -
297 - - Budgets are scheduled according to a variant of WF2Q+, implemented
298 - using an augmented rb-tree to take eligibility into account while
299 - preserving an O(log N) overall complexity.
300 -
301 - - A low-latency tunable is provided; if enabled, both interactive
302 - and soft real-time applications are guaranteed a very low latency.
303 -
304 - - Latency guarantees are preserved also in the presence of NCQ.
305 -
306 - - Also with flash-based devices, a high throughput is achieved
307 - while still preserving latency guarantees.
308 -
309 - - BFQ features Early Queue Merge (EQM), a sort of fusion of the
310 - cooperating-queue-merging and the preemption mechanisms present
311 - in CFQ. EQM is in fact a unified mechanism that tries to get a
312 - sequential read pattern, and hence a high throughput, with any
313 - set of processes performing interleaved I/O over a contiguous
314 - sequence of sectors.
315 -
316 - - BFQ supports full hierarchical scheduling, exporting a cgroups
317 - interface. Since each node has a full scheduler, each group can
318 - be assigned its own weight.
319 -
320 - - If the cgroups interface is not used, only I/O priorities can be
321 - assigned to processes, with ioprio values mapped to weights
322 - with the relation weight = IOPRIO_BE_NR - ioprio.
323 -
324 - - ioprio classes are served in strict priority order, i.e., lower
325 - priority queues are not served as long as there are higher
326 - priority queues. Among queues in the same class the bandwidth is
327 - distributed in proportion to the weight of each queue. A very
328 - thin extra bandwidth is however guaranteed to the Idle class, to
329 - prevent it from starving.
330 -
331 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
332 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
333 ----
334 - block/bfq-cgroup.c | 910 ++++++++++++++
335 - block/bfq-ioc.c | 36 +
336 - block/bfq-iosched.c | 3268 +++++++++++++++++++++++++++++++++++++++++++++++++++
337 - block/bfq-sched.c | 1077 +++++++++++++++++
338 - block/bfq.h | 614 ++++++++++
339 - 5 files changed, 5905 insertions(+)
340 - create mode 100644 block/bfq-cgroup.c
341 - create mode 100644 block/bfq-ioc.c
342 - create mode 100644 block/bfq-iosched.c
343 - create mode 100644 block/bfq-sched.c
344 - create mode 100644 block/bfq.h
345 -
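When the cgroups interface is not used, the description above maps ioprio values to weights as weight = IOPRIO_BE_NR - ioprio. As a stand-alone illustration (not part of the patch; IOPRIO_BE_NR is 8 for the best-effort class), the resulting mapping is:

#include <stdio.h>

#define IOPRIO_BE_NR 8        /* number of best-effort ioprio levels */

int main(void)
{
        int ioprio;

        /* Lower ioprio means higher priority, hence a larger BFQ weight. */
        for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                printf("ioprio %d -> weight %d\n",
                       ioprio, IOPRIO_BE_NR - ioprio);
        return 0;
}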
346 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
347 -new file mode 100644
348 -index 0000000..b889acf
349 ---- /dev/null
350 -+++ b/block/bfq-cgroup.c
351 -@@ -0,0 +1,910 @@
352 -+/*
353 -+ * BFQ: CGROUPS support.
354 -+ *
355 -+ * Based on ideas and code from CFQ:
356 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
357 -+ *
358 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
359 -+ * Paolo Valente <paolo.valente@×××××××.it>
360 -+ *
361 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
362 -+ *
363 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
364 -+ */
365 -+
366 -+#ifdef CONFIG_CGROUP_BFQIO
367 -+
368 -+static DEFINE_MUTEX(bfqio_mutex);
369 -+
370 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
371 -+{
372 -+ return bgrp ? !bgrp->online : false;
373 -+}
374 -+
375 -+static struct bfqio_cgroup bfqio_root_cgroup = {
376 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
377 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
378 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
379 -+};
380 -+
381 -+static inline void bfq_init_entity(struct bfq_entity *entity,
382 -+ struct bfq_group *bfqg)
383 -+{
384 -+ entity->weight = entity->new_weight;
385 -+ entity->orig_weight = entity->new_weight;
386 -+ entity->ioprio = entity->new_ioprio;
387 -+ entity->ioprio_class = entity->new_ioprio_class;
388 -+ entity->parent = bfqg->my_entity;
389 -+ entity->sched_data = &bfqg->sched_data;
390 -+}
391 -+
392 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
393 -+{
394 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
395 -+}
396 -+
397 -+/*
398 -+ * Search the bfq_group for bfqd into the hash table (by now only a list)
399 -+ * of bgrp. Must be called under rcu_read_lock().
400 -+ */
401 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
402 -+ struct bfq_data *bfqd)
403 -+{
404 -+ struct bfq_group *bfqg;
405 -+ void *key;
406 -+
407 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
408 -+ key = rcu_dereference(bfqg->bfqd);
409 -+ if (key == bfqd)
410 -+ return bfqg;
411 -+ }
412 -+
413 -+ return NULL;
414 -+}
415 -+
416 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
417 -+ struct bfq_group *bfqg)
418 -+{
419 -+ struct bfq_entity *entity = &bfqg->entity;
420 -+
421 -+ /*
422 -+ * If the weight of the entity has never been set via the sysfs
423 -+ * interface, then bgrp->weight == 0. In this case we initialize
424 -+ * the weight from the current ioprio value. Otherwise, the group
425 -+ * weight, if set, has priority over the ioprio value.
426 -+ */
427 -+ if (bgrp->weight == 0) {
428 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
429 -+ entity->new_ioprio = bgrp->ioprio;
430 -+ } else {
431 -+ entity->new_weight = bgrp->weight;
432 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
433 -+ }
434 -+ entity->orig_weight = entity->weight = entity->new_weight;
435 -+ entity->ioprio = entity->new_ioprio;
436 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
437 -+ entity->my_sched_data = &bfqg->sched_data;
438 -+}
439 -+
440 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
441 -+ struct bfq_group *parent)
442 -+{
443 -+ struct bfq_entity *entity;
444 -+
445 -+ BUG_ON(parent == NULL);
446 -+ BUG_ON(bfqg == NULL);
447 -+
448 -+ entity = &bfqg->entity;
449 -+ entity->parent = parent->my_entity;
450 -+ entity->sched_data = &parent->sched_data;
451 -+}
452 -+
453 -+/**
454 -+ * bfq_group_chain_alloc - allocate a chain of groups.
455 -+ * @bfqd: queue descriptor.
456 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
457 -+ *
458 -+ * Allocate a chain of groups starting from the one belonging to
459 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
460 -+ * to the root has already an allocated group on @bfqd.
461 -+ */
462 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
463 -+ struct cgroup_subsys_state *css)
464 -+{
465 -+ struct bfqio_cgroup *bgrp;
466 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
467 -+
468 -+ for (; css != NULL; css = css->parent) {
469 -+ bgrp = css_to_bfqio(css);
470 -+
471 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
472 -+ if (bfqg != NULL) {
473 -+ /*
474 -+ * All the cgroups in the path from there to the
475 -+ * root must have a bfq_group for bfqd, so we don't
476 -+ * need any more allocations.
477 -+ */
478 -+ break;
479 -+ }
480 -+
481 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
482 -+ if (bfqg == NULL)
483 -+ goto cleanup;
484 -+
485 -+ bfq_group_init_entity(bgrp, bfqg);
486 -+ bfqg->my_entity = &bfqg->entity;
487 -+
488 -+ if (leaf == NULL) {
489 -+ leaf = bfqg;
490 -+ prev = leaf;
491 -+ } else {
492 -+ bfq_group_set_parent(prev, bfqg);
493 -+ /*
494 -+ * Build a list of allocated nodes using the bfqd
495 -+ * field, that is still unused and will be initialized
496 -+ * only after the node is connected.
497 -+ */
498 -+ prev->bfqd = bfqg;
499 -+ prev = bfqg;
500 -+ }
501 -+ }
502 -+
503 -+ return leaf;
504 -+
505 -+cleanup:
506 -+ while (leaf != NULL) {
507 -+ prev = leaf;
508 -+ leaf = leaf->bfqd;
509 -+ kfree(prev);
510 -+ }
511 -+
512 -+ return NULL;
513 -+}
514 -+
515 -+/**
516 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
517 -+ * @bfqd: the queue descriptor.
518 -+ * @css: the leaf cgroup_subsys_state to start from.
519 -+ * @leaf: the leaf group (to be associated to @cgroup).
520 -+ *
521 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
522 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
523 -+ * hierarchy that already has a group associated to @bfqd all the nodes
524 -+ * in the path to the root cgroup have one too.
525 -+ *
526 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
527 -+ * per device) while the bfqio_cgroup lock protects the list of groups
528 -+ * belonging to the same cgroup.
529 -+ */
530 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
531 -+ struct cgroup_subsys_state *css,
532 -+ struct bfq_group *leaf)
533 -+{
534 -+ struct bfqio_cgroup *bgrp;
535 -+ struct bfq_group *bfqg, *next, *prev = NULL;
536 -+ unsigned long flags;
537 -+
538 -+ assert_spin_locked(bfqd->queue->queue_lock);
539 -+
540 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
541 -+ bgrp = css_to_bfqio(css);
542 -+ next = leaf->bfqd;
543 -+
544 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
545 -+ BUG_ON(bfqg != NULL);
546 -+
547 -+ spin_lock_irqsave(&bgrp->lock, flags);
548 -+
549 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
550 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
551 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
552 -+
553 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
554 -+
555 -+ prev = leaf;
556 -+ leaf = next;
557 -+ }
558 -+
559 -+ BUG_ON(css == NULL && leaf != NULL);
560 -+ if (css != NULL && prev != NULL) {
561 -+ bgrp = css_to_bfqio(css);
562 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
563 -+ bfq_group_set_parent(prev, bfqg);
564 -+ }
565 -+}
566 -+
567 -+/**
568 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
569 -+ * @bfqd: queue descriptor.
570 -+ * @cgroup: cgroup being searched for.
571 -+ *
572 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
573 -+ * necessary. When a group is returned all the cgroups in the path
574 -+ * to the root have a group associated to @bfqd.
575 -+ *
576 -+ * If the allocation fails, return the root group: this breaks guarantees
577 -+ * but is a safe fallback. If this loss becomes a problem it can be
578 -+ * mitigated using the equivalent weight (given by the product of the
579 -+ * weights of the groups in the path from @group to the root) in the
580 -+ * root scheduler.
581 -+ *
582 -+ * We allocate all the missing nodes in the path from the leaf cgroup
583 -+ * to the root and we connect the nodes only after all the allocations
584 -+ * have been successful.
585 -+ */
586 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
587 -+ struct cgroup_subsys_state *css)
588 -+{
589 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
590 -+ struct bfq_group *bfqg;
591 -+
592 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
593 -+ if (bfqg != NULL)
594 -+ return bfqg;
595 -+
596 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
597 -+ if (bfqg != NULL)
598 -+ bfq_group_chain_link(bfqd, css, bfqg);
599 -+ else
600 -+ bfqg = bfqd->root_group;
601 -+
602 -+ return bfqg;
603 -+}
604 -+
605 -+/**
606 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
607 -+ * @bfqd: queue descriptor.
608 -+ * @bfqq: the queue to move.
609 -+ * @entity: @bfqq's entity.
610 -+ * @bfqg: the group to move to.
611 -+ *
612 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
613 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
614 -+ *
615 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
616 -+ * not disappear (by now this just means that we are called under
617 -+ * rcu_read_lock()).
618 -+ */
619 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
620 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
621 -+{
622 -+ int busy, resume;
623 -+
624 -+ busy = bfq_bfqq_busy(bfqq);
625 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
626 -+
627 -+ BUG_ON(resume && !entity->on_st);
628 -+ BUG_ON(busy && !resume && entity->on_st &&
629 -+ bfqq != bfqd->in_service_queue);
630 -+
631 -+ if (busy) {
632 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
633 -+
634 -+ if (!resume)
635 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
636 -+ else
637 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
638 -+ } else if (entity->on_st)
639 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
640 -+
641 -+ /*
642 -+ * Here we use a reference to bfqg. We don't need a refcounter
643 -+ * as the cgroup reference will not be dropped, so that its
644 -+ * destroy() callback will not be invoked.
645 -+ */
646 -+ entity->parent = bfqg->my_entity;
647 -+ entity->sched_data = &bfqg->sched_data;
648 -+
649 -+ if (busy && resume)
650 -+ bfq_activate_bfqq(bfqd, bfqq);
651 -+
652 -+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
653 -+ bfq_schedule_dispatch(bfqd);
654 -+}
655 -+
656 -+/**
657 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
658 -+ * @bfqd: the queue descriptor.
659 -+ * @bic: the bic to move.
660 -+ * @cgroup: the cgroup to move to.
661 -+ *
662 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
663 -+ * has to make sure that the reference to cgroup is valid across the call.
664 -+ *
665 -+ * NOTE: an alternative approach might have been to store the current
666 -+ * cgroup in bfqq and getting a reference to it, reducing the lookup
667 -+ * time here, at the price of slightly more complex code.
668 -+ */
669 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
670 -+ struct bfq_io_cq *bic,
671 -+ struct cgroup_subsys_state *css)
672 -+{
673 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
674 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
675 -+ struct bfq_entity *entity;
676 -+ struct bfq_group *bfqg;
677 -+ struct bfqio_cgroup *bgrp;
678 -+
679 -+ bgrp = css_to_bfqio(css);
680 -+
681 -+ bfqg = bfq_find_alloc_group(bfqd, css);
682 -+ if (async_bfqq != NULL) {
683 -+ entity = &async_bfqq->entity;
684 -+
685 -+ if (entity->sched_data != &bfqg->sched_data) {
686 -+ bic_set_bfqq(bic, NULL, 0);
687 -+ bfq_log_bfqq(bfqd, async_bfqq,
688 -+ "bic_change_group: %p %d",
689 -+ async_bfqq, atomic_read(&async_bfqq->ref));
690 -+ bfq_put_queue(async_bfqq);
691 -+ }
692 -+ }
693 -+
694 -+ if (sync_bfqq != NULL) {
695 -+ entity = &sync_bfqq->entity;
696 -+ if (entity->sched_data != &bfqg->sched_data)
697 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
698 -+ }
699 -+
700 -+ return bfqg;
701 -+}
702 -+
703 -+/**
704 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
705 -+ * @bic: the bic being migrated.
706 -+ * @cgroup: the destination cgroup.
707 -+ *
708 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
709 -+ * moved into its new parent group.
710 -+ */
711 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
712 -+ struct cgroup_subsys_state *css)
713 -+{
714 -+ struct bfq_data *bfqd;
715 -+ unsigned long uninitialized_var(flags);
716 -+
717 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
718 -+ &flags);
719 -+ if (bfqd != NULL) {
720 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
721 -+ bfq_put_bfqd_unlock(bfqd, &flags);
722 -+ }
723 -+}
724 -+
725 -+/**
726 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
727 -+ * @bic: the @bic to update.
728 -+ *
729 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
730 -+ * We need this in addition to moving bics during the cgroup attach
731 -+ * phase because the task owning @bic could be at its first disk
732 -+ * access or we may end up in the root cgroup as the result of a
733 -+ * memory allocation failure and here we try to move to the right
734 -+ * group.
735 -+ *
736 -+ * Must be called under the queue lock. It is safe to use the returned
737 -+ * value even after the rcu_read_unlock() as the migration/destruction
738 -+ * paths act under the queue lock too. IOW it is impossible to race with
739 -+ * group migration/destruction and end up with an invalid group as:
740 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
741 -+ * has started execution, as current holds a reference to it,
742 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
743 -+ * migrated to a different cgroup] its attach() callback will have
744 -+ * taken care of removing all the references to the old cgroup data.
745 -+ */
746 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
747 -+{
748 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
749 -+ struct bfq_group *bfqg;
750 -+ struct cgroup_subsys_state *css;
751 -+
752 -+ BUG_ON(bfqd == NULL);
753 -+
754 -+ rcu_read_lock();
755 -+ css = task_css(current, bfqio_subsys_id);
756 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
757 -+ rcu_read_unlock();
758 -+
759 -+ return bfqg;
760 -+}
761 -+
762 -+/**
763 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
764 -+ * @st: the service tree being flushed.
765 -+ */
766 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
767 -+{
768 -+ struct bfq_entity *entity = st->first_idle;
769 -+
770 -+ for (; entity != NULL; entity = st->first_idle)
771 -+ __bfq_deactivate_entity(entity, 0);
772 -+}
773 -+
774 -+/**
775 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
776 -+ * @bfqd: the device data structure with the root group.
777 -+ * @entity: the entity to move.
778 -+ */
779 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
780 -+ struct bfq_entity *entity)
781 -+{
782 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
783 -+
784 -+ BUG_ON(bfqq == NULL);
785 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
786 -+ return;
787 -+}
788 -+
789 -+/**
790 -+ * bfq_reparent_active_entities - move to the root group all active entities.
791 -+ * @bfqd: the device data structure with the root group.
792 -+ * @bfqg: the group to move from.
793 -+ * @st: the service tree with the entities.
794 -+ *
795 -+ * Needs queue_lock to be taken and reference to be valid over the call.
796 -+ */
797 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
798 -+ struct bfq_group *bfqg,
799 -+ struct bfq_service_tree *st)
800 -+{
801 -+ struct rb_root *active = &st->active;
802 -+ struct bfq_entity *entity = NULL;
803 -+
804 -+ if (!RB_EMPTY_ROOT(&st->active))
805 -+ entity = bfq_entity_of(rb_first(active));
806 -+
807 -+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
808 -+ bfq_reparent_leaf_entity(bfqd, entity);
809 -+
810 -+ if (bfqg->sched_data.active_entity != NULL)
811 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
812 -+
813 -+ return;
814 -+}
815 -+
816 -+/**
817 -+ * bfq_destroy_group - destroy @bfqg.
818 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
819 -+ * @bfqg: the group being destroyed.
820 -+ *
821 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
822 -+ */
823 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
824 -+{
825 -+ struct bfq_data *bfqd;
826 -+ struct bfq_service_tree *st;
827 -+ struct bfq_entity *entity = bfqg->my_entity;
828 -+ unsigned long uninitialized_var(flags);
829 -+ int i;
830 -+
831 -+ hlist_del(&bfqg->group_node);
832 -+
833 -+ /*
834 -+ * Empty all service_trees belonging to this group before deactivating
835 -+ * the group itself.
836 -+ */
837 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
838 -+ st = bfqg->sched_data.service_tree + i;
839 -+
840 -+ /*
841 -+ * The idle tree may still contain bfq_queues belonging
842 -+ * to exited task because they never migrated to a different
843 -+ * cgroup from the one being destroyed now. Noone else
844 -+ * can access them so it's safe to act without any lock.
845 -+ */
846 -+ bfq_flush_idle_tree(st);
847 -+
848 -+ /*
849 -+ * It may happen that some queues are still active
850 -+ * (busy) upon group destruction (if the corresponding
851 -+ * processes have been forced to terminate). We move
852 -+ * all the leaf entities corresponding to these queues
853 -+ * to the root_group.
854 -+ * Also, it may happen that the group has an entity
855 -+ * under service, which is disconnected from the active
856 -+ * tree: it must be moved, too.
857 -+ * There is no need to put the sync queues, as the
858 -+ * scheduler has taken no reference.
859 -+ */
860 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
861 -+ if (bfqd != NULL) {
862 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
863 -+ bfq_put_bfqd_unlock(bfqd, &flags);
864 -+ }
865 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
866 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
867 -+ }
868 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
869 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
870 -+
871 -+ /*
872 -+ * We may race with device destruction, take extra care when
873 -+ * dereferencing bfqg->bfqd.
874 -+ */
875 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
876 -+ if (bfqd != NULL) {
877 -+ hlist_del(&bfqg->bfqd_node);
878 -+ __bfq_deactivate_entity(entity, 0);
879 -+ bfq_put_async_queues(bfqd, bfqg);
880 -+ bfq_put_bfqd_unlock(bfqd, &flags);
881 -+ }
882 -+ BUG_ON(entity->tree != NULL);
883 -+
884 -+ /*
885 -+ * No need to defer the kfree() to the end of the RCU grace
886 -+ * period: we are called from the destroy() callback of our
887 -+ * cgroup, so we can be sure that no one is a) still using
888 -+ * this cgroup or b) doing lookups in it.
889 -+ */
890 -+ kfree(bfqg);
891 -+}
892 -+
893 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
894 -+{
895 -+ struct hlist_node *tmp;
896 -+ struct bfq_group *bfqg;
897 -+
898 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
899 -+ bfq_end_raising_async_queues(bfqd, bfqg);
900 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
901 -+}
902 -+
903 -+/**
904 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
905 -+ * @bfqd: the device descriptor being exited.
906 -+ *
907 -+ * When the device exits we just make sure that no lookup can return
908 -+ * the now unused group structures. They will be deallocated on cgroup
909 -+ * destruction.
910 -+ */
911 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
912 -+{
913 -+ struct hlist_node *tmp;
914 -+ struct bfq_group *bfqg;
915 -+
916 -+ bfq_log(bfqd, "disconnect_groups beginning");
917 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
918 -+ hlist_del(&bfqg->bfqd_node);
919 -+
920 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
921 -+
922 -+ /*
923 -+ * Don't remove from the group hash, just set an
924 -+ * invalid key. No lookups can race with the
925 -+ * assignment as bfqd is being destroyed; this
926 -+ * implies also that new elements cannot be added
927 -+ * to the list.
928 -+ */
929 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
930 -+
931 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
932 -+ bfqg);
933 -+ bfq_put_async_queues(bfqd, bfqg);
934 -+ }
935 -+}
936 -+
937 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
938 -+{
939 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
940 -+ struct bfq_group *bfqg = bfqd->root_group;
941 -+
942 -+ bfq_put_async_queues(bfqd, bfqg);
943 -+
944 -+ spin_lock_irq(&bgrp->lock);
945 -+ hlist_del_rcu(&bfqg->group_node);
946 -+ spin_unlock_irq(&bgrp->lock);
947 -+
948 -+ /*
949 -+ * No need to synchronize_rcu() here: since the device is gone
950 -+ * there cannot be any read-side access to its root_group.
951 -+ */
952 -+ kfree(bfqg);
953 -+}
954 -+
955 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
956 -+{
957 -+ struct bfq_group *bfqg;
958 -+ struct bfqio_cgroup *bgrp;
959 -+ int i;
960 -+
961 -+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
962 -+ if (bfqg == NULL)
963 -+ return NULL;
964 -+
965 -+ bfqg->entity.parent = NULL;
966 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
967 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
968 -+
969 -+ bgrp = &bfqio_root_cgroup;
970 -+ spin_lock_irq(&bgrp->lock);
971 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
972 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
973 -+ spin_unlock_irq(&bgrp->lock);
974 -+
975 -+ return bfqg;
976 -+}
977 -+
978 -+#define SHOW_FUNCTION(__VAR) \
979 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
980 -+ struct cftype *cftype) \
981 -+{ \
982 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
983 -+ u64 ret = -ENODEV; \
984 -+ \
985 -+ mutex_lock(&bfqio_mutex); \
986 -+ if (bfqio_is_removed(bgrp)) \
987 -+ goto out_unlock; \
988 -+ \
989 -+ spin_lock_irq(&bgrp->lock); \
990 -+ ret = bgrp->__VAR; \
991 -+ spin_unlock_irq(&bgrp->lock); \
992 -+ \
993 -+out_unlock: \
994 -+ mutex_unlock(&bfqio_mutex); \
995 -+ return ret; \
996 -+}
997 -+
998 -+SHOW_FUNCTION(weight);
999 -+SHOW_FUNCTION(ioprio);
1000 -+SHOW_FUNCTION(ioprio_class);
1001 -+#undef SHOW_FUNCTION
1002 -+
1003 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
1004 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
1005 -+ struct cftype *cftype, \
1006 -+ u64 val) \
1007 -+{ \
1008 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
1009 -+ struct bfq_group *bfqg; \
1010 -+ int ret = -EINVAL; \
1011 -+ \
1012 -+ if (val < (__MIN) || val > (__MAX)) \
1013 -+ return ret; \
1014 -+ \
1015 -+ ret = -ENODEV; \
1016 -+ mutex_lock(&bfqio_mutex); \
1017 -+ if (bfqio_is_removed(bgrp)) \
1018 -+ goto out_unlock; \
1019 -+ ret = 0; \
1020 -+ \
1021 -+ spin_lock_irq(&bgrp->lock); \
1022 -+ bgrp->__VAR = (unsigned short)val; \
1023 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
1024 -+ /* \
1025 -+ * Setting the ioprio_changed flag of the entity \
1026 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
1027 -+ * the value of the weight to its ioprio mapping. \
1028 -+ * Set the flag only if necessary. \
1029 -+ */ \
1030 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
1031 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
1032 -+ smp_wmb(); \
1033 -+ bfqg->entity.ioprio_changed = 1; \
1034 -+ } \
1035 -+ } \
1036 -+ spin_unlock_irq(&bgrp->lock); \
1037 -+ \
1038 -+out_unlock: \
1039 -+ mutex_unlock(&bfqio_mutex); \
1040 -+ return ret; \
1041 -+}
1042 -+
1043 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
1044 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
1045 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
1046 -+#undef STORE_FUNCTION
1047 -+
1048 -+static struct cftype bfqio_files[] = {
1049 -+ {
1050 -+ .name = "weight",
1051 -+ .read_u64 = bfqio_cgroup_weight_read,
1052 -+ .write_u64 = bfqio_cgroup_weight_write,
1053 -+ },
1054 -+ {
1055 -+ .name = "ioprio",
1056 -+ .read_u64 = bfqio_cgroup_ioprio_read,
1057 -+ .write_u64 = bfqio_cgroup_ioprio_write,
1058 -+ },
1059 -+ {
1060 -+ .name = "ioprio_class",
1061 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
1062 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
1063 -+ },
1064 -+ { }, /* terminate */
1065 -+};
1066 -+
1067 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
1068 -+ *parent_css)
1069 -+{
1070 -+ struct bfqio_cgroup *bgrp;
1071 -+
1072 -+ if (parent_css != NULL) {
1073 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
1074 -+ if (bgrp == NULL)
1075 -+ return ERR_PTR(-ENOMEM);
1076 -+ } else
1077 -+ bgrp = &bfqio_root_cgroup;
1078 -+
1079 -+ spin_lock_init(&bgrp->lock);
1080 -+ INIT_HLIST_HEAD(&bgrp->group_data);
1081 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
1082 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
1083 -+
1084 -+ return &bgrp->css;
1085 -+}
1086 -+
1087 -+/*
1088 -+ * We cannot support shared io contexts, as we have no means to support
1089 -+ * two tasks with the same ioc in two different groups without major rework
1090 -+ * of the main bic/bfqq data structures. By now we allow a task to change
1091 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
1092 -+ * behavior is that a group containing a task that forked using CLONE_IO
1093 -+ * will not be destroyed until the tasks sharing the ioc die.
1094 -+ */
1095 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
1096 -+ struct cgroup_taskset *tset)
1097 -+{
1098 -+ struct task_struct *task;
1099 -+ struct io_context *ioc;
1100 -+ int ret = 0;
1101 -+
1102 -+ cgroup_taskset_for_each(task, css, tset) {
1103 -+ /*
1104 -+ * task_lock() is needed to avoid races with
1105 -+ * exit_io_context()
1106 -+ */
1107 -+ task_lock(task);
1108 -+ ioc = task->io_context;
1109 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1110 -+ /*
1111 -+ * ioc == NULL means that the task is either too young
1112 -+ * or exiting: if it has still no ioc the ioc can't be
1113 -+ * shared, if the task is exiting the attach will fail
1114 -+ * anyway, no matter what we return here.
1115 -+ */
1116 -+ ret = -EINVAL;
1117 -+ task_unlock(task);
1118 -+ if (ret)
1119 -+ break;
1120 -+ }
1121 -+
1122 -+ return ret;
1123 -+}
1124 -+
1125 -+static void bfqio_attach(struct cgroup_subsys_state *css,
1126 -+ struct cgroup_taskset *tset)
1127 -+{
1128 -+ struct task_struct *task;
1129 -+ struct io_context *ioc;
1130 -+ struct io_cq *icq;
1131 -+
1132 -+ /*
1133 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
1134 -+ * new group has not yet been tested.
1135 -+ */
1136 -+ cgroup_taskset_for_each(task, css, tset) {
1137 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1138 -+ if (ioc) {
1139 -+ /*
1140 -+ * Handle cgroup change here.
1141 -+ */
1142 -+ rcu_read_lock();
1143 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1144 -+ if (!strncmp(
1145 -+ icq->q->elevator->type->elevator_name,
1146 -+ "bfq", ELV_NAME_MAX))
1147 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
1148 -+ css);
1149 -+ rcu_read_unlock();
1150 -+ put_io_context(ioc);
1151 -+ }
1152 -+ }
1153 -+}
1154 -+
1155 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
1156 -+{
1157 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1158 -+ struct hlist_node *tmp;
1159 -+ struct bfq_group *bfqg;
1160 -+
1161 -+ /*
1162 -+ * Since we are destroying the cgroup, there are no more tasks
1163 -+ * referencing it, and all the RCU grace periods that may have
1164 -+ * referenced it are ended (as the destruction of the parent
1165 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1166 -+ * anything else and we don't need any synchronization.
1167 -+ */
1168 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1169 -+ bfq_destroy_group(bgrp, bfqg);
1170 -+
1171 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
1172 -+
1173 -+ kfree(bgrp);
1174 -+}
1175 -+
1176 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
1177 -+{
1178 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1179 -+
1180 -+ mutex_lock(&bfqio_mutex);
1181 -+ bgrp->online = true;
1182 -+ mutex_unlock(&bfqio_mutex);
1183 -+
1184 -+ return 0;
1185 -+}
1186 -+
1187 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
1188 -+{
1189 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1190 -+
1191 -+ mutex_lock(&bfqio_mutex);
1192 -+ bgrp->online = false;
1193 -+ mutex_unlock(&bfqio_mutex);
1194 -+}
1195 -+
1196 -+struct cgroup_subsys bfqio_subsys = {
1197 -+ .name = "bfqio",
1198 -+ .css_alloc = bfqio_create,
1199 -+ .css_online = bfqio_css_online,
1200 -+ .css_offline = bfqio_css_offline,
1201 -+ .can_attach = bfqio_can_attach,
1202 -+ .attach = bfqio_attach,
1203 -+ .css_free = bfqio_destroy,
1204 -+ .subsys_id = bfqio_subsys_id,
1205 -+ .base_cftypes = bfqio_files,
1206 -+};
1207 -+#else
1208 -+static inline void bfq_init_entity(struct bfq_entity *entity,
1209 -+ struct bfq_group *bfqg)
1210 -+{
1211 -+ entity->weight = entity->new_weight;
1212 -+ entity->orig_weight = entity->new_weight;
1213 -+ entity->ioprio = entity->new_ioprio;
1214 -+ entity->ioprio_class = entity->new_ioprio_class;
1215 -+ entity->sched_data = &bfqg->sched_data;
1216 -+}
1217 -+
1218 -+static inline struct bfq_group *
1219 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1220 -+{
1221 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
1222 -+ return bfqd->root_group;
1223 -+}
1224 -+
1225 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1226 -+ struct bfq_queue *bfqq,
1227 -+ struct bfq_entity *entity,
1228 -+ struct bfq_group *bfqg)
1229 -+{
1230 -+}
1231 -+
1232 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
1233 -+{
1234 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1235 -+}
1236 -+
1237 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1238 -+{
1239 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
1240 -+}
1241 -+
1242 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
1243 -+{
1244 -+ kfree(bfqd->root_group);
1245 -+}
1246 -+
1247 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1248 -+{
1249 -+ struct bfq_group *bfqg;
1250 -+ int i;
1251 -+
1252 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1253 -+ if (bfqg == NULL)
1254 -+ return NULL;
1255 -+
1256 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1257 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1258 -+
1259 -+ return bfqg;
1260 -+}
1261 -+#endif
1262 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1263 -new file mode 100644
1264 -index 0000000..7f6b000
1265 ---- /dev/null
1266 -+++ b/block/bfq-ioc.c
1267 -@@ -0,0 +1,36 @@
1268 -+/*
1269 -+ * BFQ: I/O context handling.
1270 -+ *
1271 -+ * Based on ideas and code from CFQ:
1272 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1273 -+ *
1274 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1275 -+ * Paolo Valente <paolo.valente@×××××××.it>
1276 -+ *
1277 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1278 -+ */
1279 -+
1280 -+/**
1281 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1282 -+ * @icq: the iocontext queue.
1283 -+ */
1284 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1285 -+{
1286 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
1287 -+ return container_of(icq, struct bfq_io_cq, icq);
1288 -+}
1289 -+
1290 -+/**
1291 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1292 -+ * @bfqd: the lookup key.
1293 -+ * @ioc: the io_context of the process doing I/O.
1294 -+ *
1295 -+ * Queue lock must be held.
1296 -+ */
1297 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1298 -+ struct io_context *ioc)
1299 -+{
1300 -+ if (ioc)
1301 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1302 -+ return NULL;
1303 -+}
1304 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1305 -new file mode 100644
1306 -index 0000000..7670400
1307 ---- /dev/null
1308 -+++ b/block/bfq-iosched.c
1309 -@@ -0,0 +1,3268 @@
1310 -+/*
1311 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
1312 -+ *
1313 -+ * Based on ideas and code from CFQ:
1314 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1315 -+ *
1316 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1317 -+ * Paolo Valente <paolo.valente@×××××××.it>
1318 -+ *
1319 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1320 -+ *
1321 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1322 -+ *
1323 -+ * BFQ is a proportional share disk scheduling algorithm based on the
1324 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
1325 -+ * number of sectors, to tasks instead of time slices. The disk is not granted
1326 -+ * to the in-service task for a given time slice, but until it has exhausted
1327 -+ * its assigned budget. This change from the time to the service domain allows
1328 -+ * BFQ to distribute the disk bandwidth among tasks as desired, without any
1329 -+ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
1330 -+ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
1331 -+ * their budgets (more precisely BFQ schedules queues associated to tasks).
1332 -+ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
1333 -+ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
1334 -+ * latencies to interactive and soft real-time applications.
1335 -+ *
1336 -+ * BFQ is described in [1], where also a reference to the initial, more
1337 -+ * theoretical paper on BFQ can be found. The interested reader can find in
1338 -+ * the latter paper full details on the main algorithm as well as formulas of
1339 -+ * the guarantees, plus formal proofs of all the properties. With respect to
1340 -+ * the version of BFQ presented in these papers, this implementation adds a
1341 -+ * few more heuristics, such as the one that guarantees a low latency to soft
1342 -+ * real-time applications, and a hierarchical extension based on H-WF2Q+.
1343 -+ *
1344 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1345 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1346 -+ * complexity derives from the one introduced with EEVDF in [3].
1347 -+ *
1348 -+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1349 -+ * with the BFQ Disk I/O Scheduler'',
1350 -+ * Proceedings of the 5th Annual International Systems and Storage
1351 -+ * Conference (SYSTOR '12), June 2012.
1352 -+ *
1353 -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1354 -+ *
1355 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1356 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1357 -+ * Oct 1997.
1358 -+ *
1359 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1360 -+ *
1361 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1362 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
1363 -+ * Resource Allocation,'' technical report.
1364 -+ *
1365 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1366 -+ */
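/*
 * Illustration (not part of the original patch): because service is
 * accounted in sectors rather than time, two continuously backlogged
 * queues with weights 6 and 2 receive disk service in a 3:1 ratio over
 * any sufficiently long interval, independently of where on the disk
 * they read and therefore of ZBR or seek-time effects.
 */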
1367 -+#include <linux/module.h>
1368 -+#include <linux/slab.h>
1369 -+#include <linux/blkdev.h>
1370 -+#include <linux/cgroup.h>
1371 -+#include <linux/elevator.h>
1372 -+#include <linux/jiffies.h>
1373 -+#include <linux/rbtree.h>
1374 -+#include <linux/ioprio.h>
1375 -+#include "bfq.h"
1376 -+#include "blk.h"
1377 -+
1378 -+/* Max number of dispatches in one round of service. */
1379 -+static const int bfq_quantum = 4;
1380 -+
1381 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1382 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1383 -+
1384 -+/* Maximum backwards seek, in KiB. */
1385 -+static const int bfq_back_max = 16 * 1024;
1386 -+
1387 -+/* Penalty of a backwards seek, in number of sectors. */
1388 -+static const int bfq_back_penalty = 2;
1389 -+
1390 -+/* Idling period duration, in jiffies. */
1391 -+static int bfq_slice_idle = HZ / 125;
1392 -+
1393 -+/* Default maximum budget values, in sectors and number of requests. */
1394 -+static const int bfq_default_max_budget = 16 * 1024;
1395 -+static const int bfq_max_budget_async_rq = 4;
1396 -+
1397 -+/*
1398 -+ * Async to sync throughput distribution is controlled as follows:
1399 -+ * when an async request is served, the entity is charged the number
1400 -+ * of sectors of the request, multiplied by the factor below
1401 -+ */
1402 -+static const int bfq_async_charge_factor = 10;
1403 -+
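/*
 * Illustration (not part of the original patch): with the factor above,
 * serving a 64-sector async request charges 64 * 10 = 640 sectors to the
 * owning entity's budget, which skews throughput towards sync requests.
 */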
1404 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
1405 -+static const int bfq_timeout_sync = HZ / 8;
1406 -+static int bfq_timeout_async = HZ / 25;
1407 -+
1408 -+struct kmem_cache *bfq_pool;
1409 -+
1410 -+/* Below this threshold (in ms), we consider thinktime immediate. */
1411 -+#define BFQ_MIN_TT 2
1412 -+
1413 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
1414 -+#define BFQ_HW_QUEUE_THRESHOLD 4
1415 -+#define BFQ_HW_QUEUE_SAMPLES 32
1416 -+
1417 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1418 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1419 -+
1420 -+/* Min samples used for peak rate estimation (for autotuning). */
1421 -+#define BFQ_PEAK_RATE_SAMPLES 32
1422 -+
1423 -+/* Shift used for peak rate fixed precision calculations. */
1424 -+#define BFQ_RATE_SHIFT 16
1425 -+
1426 -+/*
1427 -+ * The duration of the weight raising for interactive applications is
1428 -+ * computed automatically (as default behaviour), using the following
1429 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
1430 -+ * disk, and R and T are two reference parameters. In particular, R is
1431 -+ * the peak rate of a reference disk, and T is about the maximum time
1432 -+ * for starting popular large applications on that disk, under BFQ and
1433 -+ * while reading two files in parallel. Finally, BFQ uses two
1434 -+ * different pairs (R, T) depending on whether the disk is rotational
1435 -+ * or non-rotational.
1436 -+ */
1437 -+#define T_rot (msecs_to_jiffies(5500))
1438 -+#define T_nonrot (msecs_to_jiffies(2000))
1439 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1440 -+#define R_rot 17415
1441 -+#define R_nonrot 34791
1442 -+
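/*
 * Worked example of the formula above (illustration, not part of the
 * original patch): if the estimated peak rate r of the drive equals the
 * reference rate R of its class, the weight-raising duration is exactly
 * T, i.e. about 5500 ms on a rotational disk and 2000 ms on a
 * non-rotational one; a drive twice as fast as the reference gets half
 * of that duration.
 */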
1443 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1444 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1445 -+
1446 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1447 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1448 -+
1449 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1450 -+
1451 -+#include "bfq-ioc.c"
1452 -+#include "bfq-sched.c"
1453 -+#include "bfq-cgroup.c"
1454 -+
1455 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1456 -+ IOPRIO_CLASS_IDLE)
1457 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1458 -+ IOPRIO_CLASS_RT)
1459 -+
1460 -+#define bfq_sample_valid(samples) ((samples) > 80)
1461 -+
1462 -+/*
1463 -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1464 -+ * set (in which case it could also be a direct WRITE).
1465 -+ */
1466 -+static inline int bfq_bio_sync(struct bio *bio)
1467 -+{
1468 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1469 -+ return 1;
1470 -+
1471 -+ return 0;
1472 -+}
1473 -+
1474 -+/*
1475 -+ * Scheduler run of queue, if there are requests pending and no one in the
1476 -+ * driver that will restart queueing.
1477 -+ */
1478 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1479 -+{
1480 -+ if (bfqd->queued != 0) {
1481 -+ bfq_log(bfqd, "schedule dispatch");
1482 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1483 -+ }
1484 -+}
1485 -+
1486 -+/*
1487 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1488 -+ * We choose the request that is closest to the head right now. Distance
1489 -+ * behind the head is penalized and only allowed to a certain extent.
1490 -+ */
1491 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
1492 -+ struct request *rq1,
1493 -+ struct request *rq2,
1494 -+ sector_t last)
1495 -+{
1496 -+ sector_t s1, s2, d1 = 0, d2 = 0;
1497 -+ unsigned long back_max;
1498 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1499 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1500 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1501 -+
1502 -+ if (rq1 == NULL || rq1 == rq2)
1503 -+ return rq2;
1504 -+ if (rq2 == NULL)
1505 -+ return rq1;
1506 -+
1507 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1508 -+ return rq1;
1509 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1510 -+ return rq2;
1511 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1512 -+ return rq1;
1513 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1514 -+ return rq2;
1515 -+
1516 -+ s1 = blk_rq_pos(rq1);
1517 -+ s2 = blk_rq_pos(rq2);
1518 -+
1519 -+ /*
1520 -+ * By definition, 1KiB is 2 sectors.
1521 -+ */
1522 -+ back_max = bfqd->bfq_back_max * 2;
1523 -+
1524 -+ /*
1525 -+ * Strict one way elevator _except_ in the case where we allow
1526 -+ * short backward seeks which are biased as twice the cost of a
1527 -+ * similar forward seek.
1528 -+ */
1529 -+ if (s1 >= last)
1530 -+ d1 = s1 - last;
1531 -+ else if (s1 + back_max >= last)
1532 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
1533 -+ else
1534 -+ wrap |= BFQ_RQ1_WRAP;
1535 -+
1536 -+ if (s2 >= last)
1537 -+ d2 = s2 - last;
1538 -+ else if (s2 + back_max >= last)
1539 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
1540 -+ else
1541 -+ wrap |= BFQ_RQ2_WRAP;
1542 -+
1543 -+ /* Found required data */
1544 -+
1545 -+ /*
1546 -+ * By doing switch() on the bit mask "wrap" we avoid having to
1547 -+ * check two variables for all permutations: --> faster!
1548 -+ */
1549 -+ switch (wrap) {
1550 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1551 -+ if (d1 < d2)
1552 -+ return rq1;
1553 -+ else if (d2 < d1)
1554 -+ return rq2;
1555 -+ else {
1556 -+ if (s1 >= s2)
1557 -+ return rq1;
1558 -+ else
1559 -+ return rq2;
1560 -+ }
1561 -+
1562 -+ case BFQ_RQ2_WRAP:
1563 -+ return rq1;
1564 -+ case BFQ_RQ1_WRAP:
1565 -+ return rq2;
1566 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1567 -+ default:
1568 -+ /*
1569 -+ * Since both rqs are wrapped,
1570 -+ * start with the one that's further behind head
1571 -+ * (--> only *one* back seek required),
1572 -+ * since back seek takes more time than forward.
1573 -+ */
1574 -+ if (s1 <= s2)
1575 -+ return rq1;
1576 -+ else
1577 -+ return rq2;
1578 -+ }
1579 -+}
1580 -+
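As a rough illustration of the distance rule above: forward distances count as-is, short backward distances are multiplied by the back-seek penalty, and anything further behind the head is treated as wrapped. The standalone sketch below is not part of the patch; the sector positions, backward window and penalty are made-up values, and ties and wrapped requests are handled by the switch in the real code.

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
    sector_t last = 1000;             /* current head position            */
    sector_t s1 = 1016, s2 = 996;     /* hypothetical request positions   */
    unsigned penalty = 2;             /* assumed back-seek penalty        */
    sector_t d1, d2;

    d1 = s1 - last;                   /* forward seek: plain distance     */
    d2 = (last - s2) * penalty;       /* short backward seek: penalized   */

    printf("d1=%llu d2=%llu -> serve %s first\n",
           d1, d2, d1 < d2 ? "rq1" : "rq2");   /* d1=16, d2=8 -> rq2 */
    return 0;
}
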
1581 -+static struct bfq_queue *
1582 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1583 -+ sector_t sector, struct rb_node **ret_parent,
1584 -+ struct rb_node ***rb_link)
1585 -+{
1586 -+ struct rb_node **p, *parent;
1587 -+ struct bfq_queue *bfqq = NULL;
1588 -+
1589 -+ parent = NULL;
1590 -+ p = &root->rb_node;
1591 -+ while (*p) {
1592 -+ struct rb_node **n;
1593 -+
1594 -+ parent = *p;
1595 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1596 -+
1597 -+ /*
1598 -+ * Sort strictly based on sector. Smallest to the left,
1599 -+ * largest to the right.
1600 -+ */
1601 -+ if (sector > blk_rq_pos(bfqq->next_rq))
1602 -+ n = &(*p)->rb_right;
1603 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
1604 -+ n = &(*p)->rb_left;
1605 -+ else
1606 -+ break;
1607 -+ p = n;
1608 -+ bfqq = NULL;
1609 -+ }
1610 -+
1611 -+ *ret_parent = parent;
1612 -+ if (rb_link)
1613 -+ *rb_link = p;
1614 -+
1615 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1616 -+ (long long unsigned)sector,
1617 -+ bfqq != NULL ? bfqq->pid : 0);
1618 -+
1619 -+ return bfqq;
1620 -+}
1621 -+
1622 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1623 -+{
1624 -+ struct rb_node **p, *parent;
1625 -+ struct bfq_queue *__bfqq;
1626 -+
1627 -+ if (bfqq->pos_root != NULL) {
1628 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1629 -+ bfqq->pos_root = NULL;
1630 -+ }
1631 -+
1632 -+ if (bfq_class_idle(bfqq))
1633 -+ return;
1634 -+ if (!bfqq->next_rq)
1635 -+ return;
1636 -+
1637 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
1638 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1639 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
1640 -+ if (__bfqq == NULL) {
1641 -+ rb_link_node(&bfqq->pos_node, parent, p);
1642 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1643 -+ } else
1644 -+ bfqq->pos_root = NULL;
1645 -+}
1646 -+
1647 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1648 -+ struct bfq_queue *bfqq,
1649 -+ struct request *last)
1650 -+{
1651 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
1652 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
1653 -+ struct request *next = NULL, *prev = NULL;
1654 -+
1655 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1656 -+
1657 -+ if (rbprev != NULL)
1658 -+ prev = rb_entry_rq(rbprev);
1659 -+
1660 -+ if (rbnext != NULL)
1661 -+ next = rb_entry_rq(rbnext);
1662 -+ else {
1663 -+ rbnext = rb_first(&bfqq->sort_list);
1664 -+ if (rbnext && rbnext != &last->rb_node)
1665 -+ next = rb_entry_rq(rbnext);
1666 -+ }
1667 -+
1668 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1669 -+}
1670 -+
1671 -+static void bfq_del_rq_rb(struct request *rq)
1672 -+{
1673 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1674 -+ struct bfq_data *bfqd = bfqq->bfqd;
1675 -+ const int sync = rq_is_sync(rq);
1676 -+
1677 -+ BUG_ON(bfqq->queued[sync] == 0);
1678 -+ bfqq->queued[sync]--;
1679 -+ bfqd->queued--;
1680 -+
1681 -+ elv_rb_del(&bfqq->sort_list, rq);
1682 -+
1683 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1684 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1685 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1686 -+ /*
1687 -+ * Remove queue from request-position tree as it is empty.
1688 -+ */
1689 -+ if (bfqq->pos_root != NULL) {
1690 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1691 -+ bfqq->pos_root = NULL;
1692 -+ }
1693 -+ }
1694 -+}
1695 -+
1696 -+/* see the definition of bfq_async_charge_factor for details */
1697 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
1698 -+ struct bfq_queue *bfqq)
1699 -+{
1700 -+ return blk_rq_sectors(rq) *
1701 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1702 -+ bfq_async_charge_factor));
1703 -+}
1704 -+
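The charging rule above gives sync and weight-raised queues their raw sector count, while async, non-raised queues are charged a multiple of it. A standalone sketch of that arithmetic follows; the charge factor value is an assumption for illustration (the real constant is defined elsewhere in the scheduler).

#include <stdio.h>

int main(void)
{
    unsigned long sectors = 8;        /* hypothetical request size        */
    int async_charge_factor = 10;     /* assumed value, for illustration  */
    int sync = 0, raising_coeff = 1;  /* an async, non-raised queue       */

    unsigned long charge = sectors *
        (1 + ((!sync) * (raising_coeff == 1) * async_charge_factor));

    printf("charge = %lu sectors\n", charge);   /* 8 * (1 + 10) = 88 */
    return 0;
}
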
1705 -+/**
1706 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
1707 -+ * @bfqd: the device data the queue belongs to.
1708 -+ * @bfqq: the queue to update.
1709 -+ *
1710 -+ * If the first request of a queue changes we make sure that the queue
1711 -+ * has enough budget to serve at least its first request (if the
1712 -+ * request has grown). We do this because, if the queue does not have enough
1713 -+ * budget for its first request, it has to go through two dispatch
1714 -+ * rounds to actually get it dispatched.
1715 -+ */
1716 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
1717 -+ struct bfq_queue *bfqq)
1718 -+{
1719 -+ struct bfq_entity *entity = &bfqq->entity;
1720 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1721 -+ struct request *next_rq = bfqq->next_rq;
1722 -+ unsigned long new_budget;
1723 -+
1724 -+ if (next_rq == NULL)
1725 -+ return;
1726 -+
1727 -+ if (bfqq == bfqd->in_service_queue)
1728 -+ /*
1729 -+ * In order not to break guarantees, budgets cannot be
1730 -+ * changed after an entity has been selected.
1731 -+ */
1732 -+ return;
1733 -+
1734 -+ BUG_ON(entity->tree != &st->active);
1735 -+ BUG_ON(entity == entity->sched_data->active_entity);
1736 -+
1737 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
1738 -+ bfq_serv_to_charge(next_rq, bfqq));
1739 -+ entity->budget = new_budget;
1740 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1741 -+ bfq_activate_bfqq(bfqd, bfqq);
1742 -+}
1743 -+
1744 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1745 -+{
1746 -+ u64 dur;
1747 -+
1748 -+ if (bfqd->bfq_raising_max_time > 0)
1749 -+ return bfqd->bfq_raising_max_time;
1750 -+
1751 -+ dur = bfqd->RT_prod;
1752 -+ do_div(dur, bfqd->peak_rate);
1753 -+
1754 -+ return dur;
1755 -+}
1756 -+
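In other words, unless an explicit maximum is configured, the weight-raising duration scales inversely with the measured peak rate: dur = (R * T) / peak_rate, with R and T taken from the reference-disk constants defined above. A simplified worked example, with units reduced to milliseconds, the fixed-point scaling dropped and an assumed device rate:

#include <stdio.h>

int main(void)
{
    unsigned long long R_ref = 17415;         /* reference rate (R_rot)   */
    unsigned long long T_ref = 5500;          /* reference time, msec     */
    unsigned long long peak_rate = 2 * R_ref; /* a disk twice as fast     */

    unsigned long long dur = R_ref * T_ref / peak_rate;

    printf("weight-raising duration ~ %llu msec\n", dur);   /* 2750 */
    return 0;
}
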
1757 -+static void bfq_add_rq_rb(struct request *rq)
1758 -+{
1759 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1760 -+ struct bfq_entity *entity = &bfqq->entity;
1761 -+ struct bfq_data *bfqd = bfqq->bfqd;
1762 -+ struct request *next_rq, *prev;
1763 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
1764 -+ int idle_for_long_time = 0;
1765 -+
1766 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1767 -+ bfqq->queued[rq_is_sync(rq)]++;
1768 -+ bfqd->queued++;
1769 -+
1770 -+ elv_rb_add(&bfqq->sort_list, rq);
1771 -+
1772 -+ /*
1773 -+ * Check if this request is a better next-serve candidate.
1774 -+ */
1775 -+ prev = bfqq->next_rq;
1776 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1777 -+ BUG_ON(next_rq == NULL);
1778 -+ bfqq->next_rq = next_rq;
1779 -+
1780 -+ /*
1781 -+ * Adjust priority tree position, if next_rq changes.
1782 -+ */
1783 -+ if (prev != bfqq->next_rq)
1784 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
1785 -+
1786 -+ if (!bfq_bfqq_busy(bfqq)) {
1787 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1788 -+ time_is_before_jiffies(bfqq->soft_rt_next_start);
1789 -+ idle_for_long_time = time_is_before_jiffies(
1790 -+ bfqq->budget_timeout +
1791 -+ bfqd->bfq_raising_min_idle_time);
1792 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
1793 -+ bfq_serv_to_charge(next_rq, bfqq));
1794 -+
1795 -+ if (!bfqd->low_latency)
1796 -+ goto add_bfqq_busy;
1797 -+
1798 -+ /*
1799 -+ * If the queue is not being boosted and has been idle
1800 -+ * for enough time, start a weight-raising period
1801 -+ */
1802 -+ if (old_raising_coeff == 1 &&
1803 -+ (idle_for_long_time || soft_rt)) {
1804 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1805 -+ if (idle_for_long_time)
1806 -+ bfqq->raising_cur_max_time =
1807 -+ bfq_wrais_duration(bfqd);
1808 -+ else
1809 -+ bfqq->raising_cur_max_time =
1810 -+ bfqd->bfq_raising_rt_max_time;
1811 -+ bfq_log_bfqq(bfqd, bfqq,
1812 -+ "wrais starting at %llu msec,"
1813 -+ "rais_max_time %u",
1814 -+ bfqq->last_rais_start_finish,
1815 -+ jiffies_to_msecs(bfqq->
1816 -+ raising_cur_max_time));
1817 -+ } else if (old_raising_coeff > 1) {
1818 -+ if (idle_for_long_time)
1819 -+ bfqq->raising_cur_max_time =
1820 -+ bfq_wrais_duration(bfqd);
1821 -+ else if (bfqq->raising_cur_max_time ==
1822 -+ bfqd->bfq_raising_rt_max_time &&
1823 -+ !soft_rt) {
1824 -+ bfqq->raising_coeff = 1;
1825 -+ bfq_log_bfqq(bfqd, bfqq,
1826 -+ "wrais ending at %llu msec,"
1827 -+ "rais_max_time %u",
1828 -+ bfqq->last_rais_start_finish,
1829 -+ jiffies_to_msecs(bfqq->
1830 -+ raising_cur_max_time));
1831 -+ } else if ((bfqq->last_rais_start_finish +
1832 -+ bfqq->raising_cur_max_time <
1833 -+ jiffies + bfqd->bfq_raising_rt_max_time) &&
1834 -+ soft_rt) {
1835 -+ /*
1836 -+ *
1837 -+ * The remaining weight-raising time is lower
1838 -+ * than bfqd->bfq_raising_rt_max_time, which
1839 -+ * means that the application is enjoying
1840 -+ * weight raising either because deemed soft rt
1841 -+ * in the near past, or because deemed
1842 -+ * interactive long ago. In both cases,
1843 -+ * resetting now the current remaining weight-
1844 -+ * raising time for the application to the
1845 -+ * weight-raising duration for soft rt
1846 -+ * applications would not cause any latency
1847 -+ * increase for the application (as the new
1848 -+ * duration would be higher than the remaining
1849 -+ * time).
1850 -+ *
1851 -+ * In addition, the application is now meeting
1852 -+ * the requirements for being deemed soft rt.
1853 -+ * In the end we can correctly and safely
1854 -+ * (re)charge the weight-raising duration for
1855 -+ * the application with the weight-raising
1856 -+ * duration for soft rt applications.
1857 -+ *
1858 -+ * In particular, doing this recharge now, i.e.,
1859 -+ * before the weight-raising period for the
1860 -+ * application finishes, reduces the probability
1861 -+ * of the following negative scenario:
1862 -+ * 1) the weight of a soft rt application is
1863 -+ * raised at startup (as for any newly
1864 -+ * created application),
1865 -+ * 2) since the application is not interactive,
1866 -+ * at a certain time weight-raising is
1867 -+ * stopped for the application,
1868 -+ * 3) at that time the application happens to
1869 -+ * still have pending requests, and hence
1870 -+ * is destined to not have a chance to be
1871 -+ * deemed soft rt before these requests are
1872 -+ * completed (see the comments to the
1873 -+ * function bfq_bfqq_softrt_next_start()
1874 -+ * for details on soft rt detection),
1875 -+ * 4) these pending requests experience a high
1876 -+ * latency because the application is not
1877 -+ * weight-raised while they are pending.
1878 -+ */
1879 -+ bfqq->last_rais_start_finish = jiffies;
1880 -+ bfqq->raising_cur_max_time =
1881 -+ bfqd->bfq_raising_rt_max_time;
1882 -+ }
1883 -+ }
1884 -+ if (old_raising_coeff != bfqq->raising_coeff)
1885 -+ entity->ioprio_changed = 1;
1886 -+add_bfqq_busy:
1887 -+ bfqq->last_idle_bklogged = jiffies;
1888 -+ bfqq->service_from_backlogged = 0;
1889 -+ bfq_clear_bfqq_softrt_update(bfqq);
1890 -+ bfq_add_bfqq_busy(bfqd, bfqq);
1891 -+ } else {
1892 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
1893 -+ !rq_is_sync(rq) &&
1894 -+ bfqq->last_rais_start_finish +
1895 -+ time_is_before_jiffies(
1896 -+ bfqd->bfq_raising_min_inter_arr_async)) {
1897 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1898 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1899 -+
1900 -+ bfqd->raised_busy_queues++;
1901 -+ entity->ioprio_changed = 1;
1902 -+ bfq_log_bfqq(bfqd, bfqq,
1903 -+ "non-idle wrais starting at %llu msec,"
1904 -+ "rais_max_time %u",
1905 -+ bfqq->last_rais_start_finish,
1906 -+ jiffies_to_msecs(bfqq->
1907 -+ raising_cur_max_time));
1908 -+ }
1909 -+ bfq_updated_next_req(bfqd, bfqq);
1910 -+ }
1911 -+
1912 -+ if (bfqd->low_latency &&
1913 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1914 -+ idle_for_long_time))
1915 -+ bfqq->last_rais_start_finish = jiffies;
1916 -+}
1917 -+
1918 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1919 -+{
1920 -+ elv_rb_del(&bfqq->sort_list, rq);
1921 -+ bfqq->queued[rq_is_sync(rq)]--;
1922 -+ bfqq->bfqd->queued--;
1923 -+ bfq_add_rq_rb(rq);
1924 -+}
1925 -+
1926 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1927 -+ struct bio *bio)
1928 -+{
1929 -+ struct task_struct *tsk = current;
1930 -+ struct bfq_io_cq *bic;
1931 -+ struct bfq_queue *bfqq;
1932 -+
1933 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1934 -+ if (bic == NULL)
1935 -+ return NULL;
1936 -+
1937 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1938 -+ if (bfqq != NULL)
1939 -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
1940 -+
1941 -+ return NULL;
1942 -+}
1943 -+
1944 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
1945 -+{
1946 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1947 -+
1948 -+ bfqd->rq_in_driver++;
1949 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1950 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1951 -+ (long long unsigned)bfqd->last_position);
1952 -+}
1953 -+
1954 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1955 -+{
1956 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1957 -+
1958 -+ WARN_ON(bfqd->rq_in_driver == 0);
1959 -+ bfqd->rq_in_driver--;
1960 -+}
1961 -+
1962 -+static void bfq_remove_request(struct request *rq)
1963 -+{
1964 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1965 -+ struct bfq_data *bfqd = bfqq->bfqd;
1966 -+
1967 -+ if (bfqq->next_rq == rq) {
1968 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1969 -+ bfq_updated_next_req(bfqd, bfqq);
1970 -+ }
1971 -+
1972 -+ list_del_init(&rq->queuelist);
1973 -+ bfq_del_rq_rb(rq);
1974 -+
1975 -+ if (rq->cmd_flags & REQ_META) {
1976 -+ WARN_ON(bfqq->meta_pending == 0);
1977 -+ bfqq->meta_pending--;
1978 -+ }
1979 -+}
1980 -+
1981 -+static int bfq_merge(struct request_queue *q, struct request **req,
1982 -+ struct bio *bio)
1983 -+{
1984 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1985 -+ struct request *__rq;
1986 -+
1987 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
1988 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1989 -+ *req = __rq;
1990 -+ return ELEVATOR_FRONT_MERGE;
1991 -+ }
1992 -+
1993 -+ return ELEVATOR_NO_MERGE;
1994 -+}
1995 -+
1996 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
1997 -+ int type)
1998 -+{
1999 -+ if (type == ELEVATOR_FRONT_MERGE) {
2000 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
2001 -+
2002 -+ bfq_reposition_rq_rb(bfqq, req);
2003 -+ }
2004 -+}
2005 -+
2006 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
2007 -+ struct request *next)
2008 -+{
2009 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2010 -+
2011 -+ /*
2012 -+ * Reposition in fifo if next is older than rq.
2013 -+ */
2014 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2015 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
2016 -+ list_move(&rq->queuelist, &next->queuelist);
2017 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
2018 -+ }
2019 -+
2020 -+ if (bfqq->next_rq == next)
2021 -+ bfqq->next_rq = rq;
2022 -+
2023 -+ bfq_remove_request(next);
2024 -+}
2025 -+
2026 -+/* Must be called with bfqq != NULL */
2027 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
2028 -+{
2029 -+ BUG_ON(bfqq == NULL);
2030 -+ if (bfq_bfqq_busy(bfqq))
2031 -+ bfqq->bfqd->raised_busy_queues--;
2032 -+ bfqq->raising_coeff = 1;
2033 -+ bfqq->raising_cur_max_time = 0;
2034 -+ /* Trigger a weight change on the next activation of the queue */
2035 -+ bfqq->entity.ioprio_changed = 1;
2036 -+}
2037 -+
2038 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
2039 -+ struct bfq_group *bfqg)
2040 -+{
2041 -+ int i, j;
2042 -+
2043 -+ for (i = 0; i < 2; i++)
2044 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
2045 -+ if (bfqg->async_bfqq[i][j] != NULL)
2046 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
2047 -+ if (bfqg->async_idle_bfqq != NULL)
2048 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
2049 -+}
2050 -+
2051 -+static void bfq_end_raising(struct bfq_data *bfqd)
2052 -+{
2053 -+ struct bfq_queue *bfqq;
2054 -+
2055 -+ spin_lock_irq(bfqd->queue->queue_lock);
2056 -+
2057 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
2058 -+ bfq_bfqq_end_raising(bfqq);
2059 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2060 -+ bfq_bfqq_end_raising(bfqq);
2061 -+ bfq_end_raising_async(bfqd);
2062 -+
2063 -+ spin_unlock_irq(bfqd->queue->queue_lock);
2064 -+}
2065 -+
2066 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
2067 -+ struct bio *bio)
2068 -+{
2069 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2070 -+ struct bfq_io_cq *bic;
2071 -+ struct bfq_queue *bfqq;
2072 -+
2073 -+ /*
2074 -+ * Disallow merge of a sync bio into an async request.
2075 -+ */
2076 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
2077 -+ return 0;
2078 -+
2079 -+ /*
2080 -+ * Lookup the bfqq that this bio will be queued with. Allow
2081 -+ * merge only if rq is queued there.
2082 -+ * Queue lock is held here.
2083 -+ */
2084 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
2085 -+ if (bic == NULL)
2086 -+ return 0;
2087 -+
2088 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2089 -+ return bfqq == RQ_BFQQ(rq);
2090 -+}
2091 -+
2092 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2093 -+ struct bfq_queue *bfqq)
2094 -+{
2095 -+ if (bfqq != NULL) {
2096 -+ bfq_mark_bfqq_must_alloc(bfqq);
2097 -+ bfq_mark_bfqq_budget_new(bfqq);
2098 -+ bfq_clear_bfqq_fifo_expire(bfqq);
2099 -+
2100 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
2101 -+
2102 -+ bfq_log_bfqq(bfqd, bfqq,
2103 -+ "set_in_service_queue, cur-budget = %lu",
2104 -+ bfqq->entity.budget);
2105 -+ }
2106 -+
2107 -+ bfqd->in_service_queue = bfqq;
2108 -+}
2109 -+
2110 -+/*
2111 -+ * Get and set a new queue for service.
2112 -+ */
2113 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
2114 -+ struct bfq_queue *bfqq)
2115 -+{
2116 -+ if (!bfqq)
2117 -+ bfqq = bfq_get_next_queue(bfqd);
2118 -+ else
2119 -+ bfq_get_next_queue_forced(bfqd, bfqq);
2120 -+
2121 -+ __bfq_set_in_service_queue(bfqd, bfqq);
2122 -+ return bfqq;
2123 -+}
2124 -+
2125 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2126 -+ struct request *rq)
2127 -+{
2128 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
2129 -+ return blk_rq_pos(rq) - bfqd->last_position;
2130 -+ else
2131 -+ return bfqd->last_position - blk_rq_pos(rq);
2132 -+}
2133 -+
2134 -+/*
2135 -+ * Return true if rq is close enough to bfqd->last_position, i.e.,
2136 -+ * within the seek-distance threshold (BFQQ_SEEK_THR) used to detect
2137 -+ * close cooperators.
2138 -+ */
2139 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2140 -+{
2141 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2142 -+}
2143 -+
2144 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2145 -+{
2146 -+ struct rb_root *root = &bfqd->rq_pos_tree;
2147 -+ struct rb_node *parent, *node;
2148 -+ struct bfq_queue *__bfqq;
2149 -+ sector_t sector = bfqd->last_position;
2150 -+
2151 -+ if (RB_EMPTY_ROOT(root))
2152 -+ return NULL;
2153 -+
2154 -+ /*
2155 -+ * First, if we find a request starting at the end of the last
2156 -+ * request, choose it.
2157 -+ */
2158 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2159 -+ if (__bfqq != NULL)
2160 -+ return __bfqq;
2161 -+
2162 -+ /*
2163 -+ * If the exact sector wasn't found, the parent of the NULL leaf
2164 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
2165 -+ * position).
2166 -+ */
2167 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2168 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2169 -+ return __bfqq;
2170 -+
2171 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
2172 -+ node = rb_next(&__bfqq->pos_node);
2173 -+ else
2174 -+ node = rb_prev(&__bfqq->pos_node);
2175 -+ if (node == NULL)
2176 -+ return NULL;
2177 -+
2178 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2179 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2180 -+ return __bfqq;
2181 -+
2182 -+ return NULL;
2183 -+}
2184 -+
2185 -+/*
2186 -+ * bfqd - obvious
2187 -+ * cur_bfqq - passed in so that we don't decide that the current queue
2188 -+ * is closely cooperating with itself.
2189 -+ *
2190 -+ * We are assuming that cur_bfqq has dispatched at least one request,
2191 -+ * and that bfqd->last_position reflects a position on the disk associated
2192 -+ * with the I/O issued by cur_bfqq.
2193 -+ */
2194 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2195 -+ struct bfq_queue *cur_bfqq)
2196 -+{
2197 -+ struct bfq_queue *bfqq;
2198 -+
2199 -+ if (bfq_class_idle(cur_bfqq))
2200 -+ return NULL;
2201 -+ if (!bfq_bfqq_sync(cur_bfqq))
2202 -+ return NULL;
2203 -+ if (BFQQ_SEEKY(cur_bfqq))
2204 -+ return NULL;
2205 -+
2206 -+ /* If device has only one backlogged bfq_queue, don't search. */
2207 -+ if (bfqd->busy_queues == 1)
2208 -+ return NULL;
2209 -+
2210 -+ /*
2211 -+ * We should notice if some of the queues are cooperating, e.g.
2212 -+ * working closely on the same area of the disk. In that case,
2213 -+ * we can group them together and don't waste time idling.
2214 -+ */
2215 -+ bfqq = bfqq_close(bfqd);
2216 -+ if (bfqq == NULL || bfqq == cur_bfqq)
2217 -+ return NULL;
2218 -+
2219 -+ /*
2220 -+ * Do not merge queues from different bfq_groups.
2221 -+ */
2222 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2223 -+ return NULL;
2224 -+
2225 -+ /*
2226 -+ * It only makes sense to merge sync queues.
2227 -+ */
2228 -+ if (!bfq_bfqq_sync(bfqq))
2229 -+ return NULL;
2230 -+ if (BFQQ_SEEKY(bfqq))
2231 -+ return NULL;
2232 -+
2233 -+ /*
2234 -+ * Do not merge queues of different priority classes.
2235 -+ */
2236 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2237 -+ return NULL;
2238 -+
2239 -+ return bfqq;
2240 -+}
2241 -+
2242 -+/*
2243 -+ * If enough samples have been computed, return the current max budget
2244 -+ * stored in bfqd, which is dynamically updated according to the
2245 -+ * estimated disk peak rate; otherwise return the default max budget
2246 -+ */
2247 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2248 -+{
2249 -+ if (bfqd->budgets_assigned < 194)
2250 -+ return bfq_default_max_budget;
2251 -+ else
2252 -+ return bfqd->bfq_max_budget;
2253 -+}
2254 -+
2255 -+/*
2256 -+ * Return min budget, which is a fraction of the current or default
2257 -+ * max budget (trying with 1/32)
2258 -+ */
2259 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2260 -+{
2261 -+ if (bfqd->budgets_assigned < 194)
2262 -+ return bfq_default_max_budget / 32;
2263 -+ else
2264 -+ return bfqd->bfq_max_budget / 32;
2265 -+}
2266 -+
2267 -+/*
2268 -+ * Decides whether idling should be done for given device and
2269 -+ * given in-service queue.
2270 -+ */
2271 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2272 -+ struct bfq_queue *in_service_bfqq)
2273 -+{
2274 -+ if (in_service_bfqq == NULL)
2275 -+ return false;
2276 -+ /*
2277 -+ * If device is SSD it has no seek penalty, disable idling; but
2278 -+ * do so only if:
2279 -+ * - device does not support queuing, otherwise we still have
2280 -+ * a problem with sync vs async workloads;
2281 -+ * - the queue is not weight-raised, to preserve guarantees.
2282 -+ */
2283 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2284 -+ in_service_bfqq->raising_coeff == 1);
2285 -+}
2286 -+
2287 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2288 -+{
2289 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2290 -+ struct bfq_io_cq *bic;
2291 -+ unsigned long sl;
2292 -+
2293 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2294 -+
2295 -+ /* Tasks have exited, don't wait. */
2296 -+ bic = bfqd->in_service_bic;
2297 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2298 -+ return;
2299 -+
2300 -+ bfq_mark_bfqq_wait_request(bfqq);
2301 -+
2302 -+ /*
2303 -+ * We don't want to idle for seeks, but we do want to allow
2304 -+ * fair distribution of slice time for a process doing back-to-back
2305 -+ * seeks. So allow a little bit of time for it to submit a new rq.
2306 -+ *
2307 -+ * To prevent processes with (partly) seeky workloads from
2308 -+ * being too ill-treated, grant them a small fraction of the
2309 -+ * assigned budget before reducing the waiting time to
2310 -+ * BFQ_MIN_TT. This happened to help reduce latency.
2311 -+ */
2312 -+ sl = bfqd->bfq_slice_idle;
2313 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2314 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2315 -+ bfqq->raising_coeff == 1)
2316 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2317 -+ else if (bfqq->raising_coeff > 1)
2318 -+ sl = sl * 3;
2319 -+ bfqd->last_idling_start = ktime_get();
2320 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2321 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
2322 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2323 -+}
2324 -+
2325 -+/*
2326 -+ * Set the maximum time for the in-service queue to consume its
2327 -+ * budget. This prevents seeky processes from lowering the disk
2328 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
2329 -+ */
2330 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2331 -+{
2332 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2333 -+ unsigned int timeout_coeff;
2334 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2335 -+ timeout_coeff = 1;
2336 -+ else
2337 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2338 -+
2339 -+ bfqd->last_budget_start = ktime_get();
2340 -+
2341 -+ bfq_clear_bfqq_budget_new(bfqq);
2342 -+ bfqq->budget_timeout = jiffies +
2343 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2344 -+
2345 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2346 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2347 -+ timeout_coeff));
2348 -+}
2349 -+
2350 -+/*
2351 -+ * Move request from internal lists to the request queue dispatch list.
2352 -+ */
2353 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2354 -+{
2355 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2356 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2357 -+
2358 -+ bfq_remove_request(rq);
2359 -+ bfqq->dispatched++;
2360 -+ elv_dispatch_sort(q, rq);
2361 -+
2362 -+ if (bfq_bfqq_sync(bfqq))
2363 -+ bfqd->sync_flight++;
2364 -+}
2365 -+
2366 -+/*
2367 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
2368 -+ */
2369 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2370 -+{
2371 -+ struct request *rq = NULL;
2372 -+
2373 -+ if (bfq_bfqq_fifo_expire(bfqq))
2374 -+ return NULL;
2375 -+
2376 -+ bfq_mark_bfqq_fifo_expire(bfqq);
2377 -+
2378 -+ if (list_empty(&bfqq->fifo))
2379 -+ return NULL;
2380 -+
2381 -+ rq = rq_entry_fifo(bfqq->fifo.next);
2382 -+
2383 -+ if (time_before(jiffies, rq_fifo_time(rq)))
2384 -+ return NULL;
2385 -+
2386 -+ return rq;
2387 -+}
2388 -+
2389 -+/*
2390 -+ * Must be called with the queue_lock held.
2391 -+ */
2392 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
2393 -+{
2394 -+ int process_refs, io_refs;
2395 -+
2396 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2397 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2398 -+ BUG_ON(process_refs < 0);
2399 -+ return process_refs;
2400 -+}
2401 -+
2402 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2403 -+{
2404 -+ int process_refs, new_process_refs;
2405 -+ struct bfq_queue *__bfqq;
2406 -+
2407 -+ /*
2408 -+ * If there are no process references on the new_bfqq, then it is
2409 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2410 -+ * may have dropped their last reference (not just their last process
2411 -+ * reference).
2412 -+ */
2413 -+ if (!bfqq_process_refs(new_bfqq))
2414 -+ return;
2415 -+
2416 -+ /* Avoid a circular list and skip interim queue merges. */
2417 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
2418 -+ if (__bfqq == bfqq)
2419 -+ return;
2420 -+ new_bfqq = __bfqq;
2421 -+ }
2422 -+
2423 -+ process_refs = bfqq_process_refs(bfqq);
2424 -+ new_process_refs = bfqq_process_refs(new_bfqq);
2425 -+ /*
2426 -+ * If the process for the bfqq has gone away, there is no
2427 -+ * sense in merging the queues.
2428 -+ */
2429 -+ if (process_refs == 0 || new_process_refs == 0)
2430 -+ return;
2431 -+
2432 -+ /*
2433 -+ * Merge in the direction of the lesser amount of work.
2434 -+ */
2435 -+ if (new_process_refs >= process_refs) {
2436 -+ bfqq->new_bfqq = new_bfqq;
2437 -+ atomic_add(process_refs, &new_bfqq->ref);
2438 -+ } else {
2439 -+ new_bfqq->new_bfqq = bfqq;
2440 -+ atomic_add(new_process_refs, &bfqq->ref);
2441 -+ }
2442 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2443 -+ new_bfqq->pid);
2444 -+}
2445 -+
2446 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2447 -+{
2448 -+ struct bfq_entity *entity = &bfqq->entity;
2449 -+ return entity->budget - entity->service;
2450 -+}
2451 -+
2452 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2453 -+{
2454 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2455 -+
2456 -+ __bfq_bfqd_reset_in_service(bfqd);
2457 -+
2458 -+ /*
2459 -+ * If this bfqq is shared between multiple processes, check
2460 -+ * to make sure that those processes are still issuing I/Os
2461 -+ * within the mean seek distance. If not, it may be time to
2462 -+ * break the queues apart again.
2463 -+ */
2464 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2465 -+ bfq_mark_bfqq_split_coop(bfqq);
2466 -+
2467 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2468 -+ /*
2469 -+ * overloading budget_timeout field to store when
2470 -+ * the queue remains with no backlog, used by
2471 -+ * the weight-raising mechanism
2472 -+ */
2473 -+ bfqq->budget_timeout = jiffies;
2474 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2475 -+ } else {
2476 -+ bfq_activate_bfqq(bfqd, bfqq);
2477 -+ /*
2478 -+ * Resort priority tree of potential close cooperators.
2479 -+ */
2480 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
2481 -+ }
2482 -+}
2483 -+
2484 -+/**
2485 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2486 -+ * @bfqd: device data.
2487 -+ * @bfqq: queue to update.
2488 -+ * @reason: reason for expiration.
2489 -+ *
2490 -+ * Handle the feedback on @bfqq budget. See the body for detailed
2491 -+ * comments.
2492 -+ */
2493 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2494 -+ struct bfq_queue *bfqq,
2495 -+ enum bfqq_expiration reason)
2496 -+{
2497 -+ struct request *next_rq;
2498 -+ unsigned long budget, min_budget;
2499 -+
2500 -+ budget = bfqq->max_budget;
2501 -+ min_budget = bfq_min_budget(bfqd);
2502 -+
2503 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2504 -+
2505 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2506 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2507 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2508 -+ budget, bfq_min_budget(bfqd));
2509 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2510 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2511 -+
2512 -+ if (bfq_bfqq_sync(bfqq)) {
2513 -+ switch (reason) {
2514 -+ /*
2515 -+ * Caveat: in all the following cases we trade latency
2516 -+ * for throughput.
2517 -+ */
2518 -+ case BFQ_BFQQ_TOO_IDLE:
2519 -+ /*
2520 -+ * This is the only case where we may reduce
2521 -+ * the budget: if there are no requests of the
2522 -+ * process still waiting for completion, then
2523 -+ * we assume (tentatively) that the timer has
2524 -+ * expired because the batch of requests of
2525 -+ * the process could have been served with a
2526 -+ * smaller budget. Hence, betting that the
2527 -+ * process will behave in the same way when it
2528 -+ * becomes backlogged again, we reduce its
2529 -+ * next budget. As long as we guess right,
2530 -+ * this budget cut reduces the latency
2531 -+ * experienced by the process.
2532 -+ *
2533 -+ * However, if there are still outstanding
2534 -+ * requests, then the process may have not yet
2535 -+ * issued its next request just because it is
2536 -+ * still waiting for the completion of some of
2537 -+ * the still outstanding ones. So in this
2538 -+ * subcase we do not reduce its budget, on the
2539 -+ * contrary we increase it to possibly boost
2540 -+ * the throughput, as discussed in the
2541 -+ * comments to the BUDGET_TIMEOUT case.
2542 -+ */
2543 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
2544 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2545 -+ else {
2546 -+ if (budget > 5 * min_budget)
2547 -+ budget -= 4 * min_budget;
2548 -+ else
2549 -+ budget = min_budget;
2550 -+ }
2551 -+ break;
2552 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
2553 -+ /*
2554 -+ * We double the budget here because: 1) it
2555 -+ * gives the chance to boost the throughput if
2556 -+ * this is not a seeky process (which may have
2557 -+ * bumped into this timeout because of, e.g.,
2558 -+ * ZBR), 2) together with charge_full_budget
2559 -+ * it helps give seeky processes higher
2560 -+ * timestamps, and hence be served less
2561 -+ * frequently.
2562 -+ */
2563 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2564 -+ break;
2565 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2566 -+ /*
2567 -+ * The process still has backlog, and did not
2568 -+ * let either the budget timeout or the disk
2569 -+ * idling timeout expire. Hence it is not
2570 -+ * seeky, has a short thinktime and may be
2571 -+ * happy with a higher budget too. So
2572 -+ * definitely increase the budget of this good
2573 -+ * candidate to boost the disk throughput.
2574 -+ */
2575 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
2576 -+ break;
2577 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
2578 -+ /*
2579 -+ * Leave the budget unchanged.
2580 -+ */
2581 -+ default:
2582 -+ return;
2583 -+ }
2584 -+ } else /* async queue */
2585 -+ /* async queues always get the maximum possible budget
2586 -+ * (their ability to dispatch is limited by
2587 -+ * @bfqd->bfq_max_budget_async_rq).
2588 -+ */
2589 -+ budget = bfqd->bfq_max_budget;
2590 -+
2591 -+ bfqq->max_budget = budget;
2592 -+
2593 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2594 -+ bfqq->max_budget > bfqd->bfq_max_budget)
2595 -+ bfqq->max_budget = bfqd->bfq_max_budget;
2596 -+
2597 -+ /*
2598 -+ * Make sure that we have enough budget for the next request.
2599 -+ * Since the finish time of the bfqq must be kept in sync with
2600 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
2601 -+ * update.
2602 -+ */
2603 -+ next_rq = bfqq->next_rq;
2604 -+ if (next_rq != NULL)
2605 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2606 -+ bfq_serv_to_charge(next_rq, bfqq));
2607 -+ else
2608 -+ bfqq->entity.budget = bfqq->max_budget;
2609 -+
2610 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2611 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2612 -+ bfqq->entity.budget);
2613 -+}
2614 -+
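The feedback above can be summarised as: cut the budget only when the queue expired for being too idle with nothing in flight; double it on budget timeout, or on the too-idle case with requests still outstanding; quadruple it on budget exhaustion; leave it unchanged when there are simply no more requests; always cap at the device-wide maximum (async queues always get the maximum). A compact, standalone restatement for sync queues; the function name, the enum and the sample numbers below are made up for illustration:

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED };

static unsigned long next_budget(unsigned long budget, unsigned long max_budget,
                                 unsigned long min_budget, int dispatched,
                                 enum reason r)
{
    switch (r) {
    case TOO_IDLE:
        if (dispatched > 0)      /* requests still outstanding: grow it  */
            return budget * 2 > max_budget ? max_budget : budget * 2;
        return budget > 5 * min_budget ? budget - 4 * min_budget
                                       : min_budget;
    case BUDGET_TIMEOUT:         /* double, capped at the maximum        */
        return budget * 2 > max_budget ? max_budget : budget * 2;
    case BUDGET_EXHAUSTED:       /* quadruple, capped at the maximum     */
        return budget * 4 > max_budget ? max_budget : budget * 4;
    }
    return budget;               /* no more requests: leave unchanged    */
}

int main(void)
{
    /* e.g., a well-behaved queue that exhausted a 2048-sector budget */
    printf("%lu\n", next_budget(2048, 16384, 512, 0, BUDGET_EXHAUSTED));
    return 0;
}
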
2615 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2616 -+{
2617 -+ unsigned long max_budget;
2618 -+
2619 -+ /*
2620 -+ * The max_budget calculated when autotuning is equal to the
2621 -+ * number of sectors transferred in timeout_sync at the
2622 -+ * estimated peak rate.
2623 -+ */
2624 -+ max_budget = (unsigned long)(peak_rate * 1000 *
2625 -+ timeout >> BFQ_RATE_SHIFT);
2626 -+
2627 -+ return max_budget;
2628 -+}
2629 -+
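So, with autotuning, the maximum budget is simply the number of sectors the device is expected to transfer in one sync budget timeout at the estimated peak rate. A worked example with the fixed-point scaling dropped; the rate and timeout values are illustrative assumptions:

#include <stdio.h>

int main(void)
{
    double peak_rate = 0.25;   /* sectors per usec, roughly 128 MB/s     */
    double timeout   = 125.0;  /* msec, a plausible sync budget timeout  */

    double max_budget = peak_rate * 1000.0 * timeout;

    printf("max_budget ~ %.0f sectors (~%.0f MiB)\n",
           max_budget, max_budget * 512.0 / (1024 * 1024));
    return 0;
}
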
2630 -+/*
2631 -+ * In addition to updating the peak rate, checks whether the process
2632 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
2633 -+ * to the budget timeout, to reduce the amount of service provided to
2634 -+ * seeky processes, and hence reduce their chances to lower the
2635 -+ * throughput. See the code for more details.
2636 -+ */
2637 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2638 -+ int compensate, enum bfqq_expiration reason)
2639 -+{
2640 -+ u64 bw, usecs, expected, timeout;
2641 -+ ktime_t delta;
2642 -+ int update = 0;
2643 -+
2644 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2645 -+ return 0;
2646 -+
2647 -+ if (compensate)
2648 -+ delta = bfqd->last_idling_start;
2649 -+ else
2650 -+ delta = ktime_get();
2651 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
2652 -+ usecs = ktime_to_us(delta);
2653 -+
2654 -+ /* Don't trust short/unrealistic values. */
2655 -+ if (usecs < 100 || usecs >= LONG_MAX)
2656 -+ return 0;
2657 -+
2658 -+ /*
2659 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
2660 -+ * value to store the peak rate, in sectors per usec in fixed
2661 -+ * point math. We do so to have enough precision in the estimate
2662 -+ * and to avoid overflows.
2663 -+ */
2664 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2665 -+ do_div(bw, (unsigned long)usecs);
2666 -+
2667 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2668 -+
2669 -+ /*
2670 -+ * Use only long (> 20ms) intervals to filter out spikes for
2671 -+ * the peak rate estimation.
2672 -+ */
2673 -+ if (usecs > 20000) {
2674 -+ if (bw > bfqd->peak_rate ||
2675 -+ (!BFQQ_SEEKY(bfqq) &&
2676 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2677 -+ bfq_log(bfqd, "measured bw =%llu", bw);
2678 -+ /*
2679 -+ * To smooth oscillations use a low-pass filter with
2680 -+ * alpha=7/8, i.e.,
2681 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
2682 -+ */
2683 -+ do_div(bw, 8);
2684 -+ if (bw == 0)
2685 -+ return 0;
2686 -+ bfqd->peak_rate *= 7;
2687 -+ do_div(bfqd->peak_rate, 8);
2688 -+ bfqd->peak_rate += bw;
2689 -+ update = 1;
2690 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2691 -+ }
2692 -+
2693 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2694 -+
2695 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2696 -+ bfqd->peak_rate_samples++;
2697 -+
2698 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2699 -+ update && bfqd->bfq_user_max_budget == 0) {
2700 -+ bfqd->bfq_max_budget =
2701 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2702 -+ bfq_log(bfqd, "new max_budget=%lu",
2703 -+ bfqd->bfq_max_budget);
2704 -+ }
2705 -+ }
2706 -+
2707 -+ /*
2708 -+ * If the process has been served for a too short time
2709 -+ * interval to let its possible sequential accesses prevail on
2710 -+ * the initial seek time needed to move the disk head on the
2711 -+ * first sector it requested, then give the process a chance
2712 -+ * and for the moment return false.
2713 -+ */
2714 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2715 -+ return 0;
2716 -+
2717 -+ /*
2718 -+ * A process is considered ``slow'' (i.e., seeky, so that we
2719 -+ * cannot treat it fairly in the service domain, as it would
2720 -+ * slow down the other processes too much) if, when a slice
2721 -+ * ends for whatever reason, it has received service at a
2722 -+ * rate that would not be high enough to complete the budget
2723 -+ * before the budget timeout expiration.
2724 -+ */
2725 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2726 -+
2727 -+ /*
2728 -+ * Caveat: processes doing IO in the slower disk zones will
2729 -+ * tend to be slow(er) even if not seeky. And the estimated
2730 -+ * peak rate will actually be an average over the disk
2731 -+ * surface. Hence, to not be too harsh with unlucky processes,
2732 -+ * we keep a budget/3 margin of safety before declaring a
2733 -+ * process slow.
2734 -+ */
2735 -+ return expected > (4 * bfqq->entity.budget) / 3;
2736 -+}
2737 -+
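The peak-rate update performed above is a low-pass filter, new_rate = (7/8) * old_rate + (1/8) * bw, applied only when the observation interval is long enough to give a meaningful sample. A standalone sketch of just that update, mirroring the do_div sequence, with made-up scaled values:

#include <stdio.h>

int main(void)
{
    unsigned long long peak_rate = 1600;  /* current estimate (scaled)    */
    unsigned long long bw = 800;          /* bandwidth of the last slice  */

    bw /= 8;                 /* 1/8 of the new sample                     */
    peak_rate *= 7;          /* 7/8 of the previous estimate...           */
    peak_rate /= 8;
    peak_rate += bw;         /* ...plus the damped sample                 */

    printf("new peak_rate = %llu\n", peak_rate);   /* 1400 + 100 = 1500 */
    return 0;
}
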
2738 -+/*
2739 -+ * To be deemed as soft real-time, an application must meet two requirements.
2740 -+ * The first is that the application must not require an average bandwidth
2741 -+ * higher than the approximate bandwidth required to playback or record a
2742 -+ * compressed high-definition video.
2743 -+ * The next function is invoked on the completion of the last request of a
2744 -+ * batch, to compute the next-start time instant, soft_rt_next_start, such
2745 -+ * that, if the next request of the application does not arrive before
2746 -+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
2747 -+ *
2748 -+ * The second requirement is that the request pattern of the application is
2749 -+ * isochronous, i.e., that, after issuing a request or a batch of requests, the
2750 -+ * application stops for a while, then issues a new batch, and so on. For this
2751 -+ * reason the next function is invoked to compute soft_rt_next_start only for
2752 -+ * applications that meet this requirement, whereas soft_rt_next_start is set
2753 -+ * to infinity for applications that do not.
2754 -+ *
2755 -+ * Unfortunately, even a greedy application may happen to behave in an
2756 -+ * isochronous way if several processes are competing for the CPUs. In fact,
2757 -+ * in this scenario the application stops issuing requests while the CPUs are
2758 -+ * busy serving other processes, then restarts, then stops again for a while,
2759 -+ * and so on. In addition, if the disk achieves a low enough throughput with
2760 -+ * the request pattern issued by the application, then the above bandwidth
2761 -+ * requirement may happen to be met too. To prevent such a greedy application
2762 -+ * from being deemed soft real-time, a further rule is used in the computation
2763 -+ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
2764 -+ * time plus the maximum time for which the arrival of a request is waited
2765 -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
2766 -+ * filters out greedy applications, as the latter issue instead their next
2767 -+ * request as soon as possible after the last one has been completed (in
2768 -+ * contrast, when a batch of requests is completed, a soft real-time
2769 -+ * application spends some time processing data).
2770 -+ *
2771 -+ * Actually, the last filter may easily generate false positives if: only
2772 -+ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
2773 -+ * both the following two cases occur:
2774 -+ * 1) HZ is so low that the duration of a jiffie is comparable to or higher
2775 -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2776 -+ * HZ=100.
2777 -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2778 -+ * for a while, then suddenly 'jump' by several units to recover the lost
2779 -+ * increments. This seems to happen, e.g., inside virtual machines.
2780 -+ * To address this issue, we do not use as a reference time interval just
2781 -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2782 -+ * particular we add the minimum number of jiffies for which the filter seems
2783 -+ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
2784 -+ */
2785 -+static inline u64 bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2786 -+ struct bfq_queue *bfqq)
2787 -+{
2788 -+ return max(bfqq->last_idle_bklogged +
2789 -+ HZ * bfqq->service_from_backlogged /
2790 -+ bfqd->bfq_raising_max_softrt_rate,
2791 -+ (u64)jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2792 -+}
2793 -+
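Concretely, soft_rt_next_start is the later of (a) the instant at which the just-completed backlog would have been served at exactly the allowed soft real-time rate, and (b) a short guard interval after now. A worked example; HZ, the sector count, the rate limit and slice_idle are illustrative assumptions:

#include <stdio.h>

static unsigned long long max_ull(unsigned long long a, unsigned long long b)
{
    return a > b ? a : b;
}

int main(void)
{
    unsigned long long hz = 250;                    /* jiffies per second (HZ) */
    unsigned long long jiffies = 100000;            /* "now", in jiffies       */
    unsigned long long last_idle_bklogged = 99990;  /* start of the last batch */
    unsigned long long service_from_backlogged = 2048;   /* sectors served    */
    unsigned long long max_softrt_rate = 7000;      /* sectors/sec threshold   */
    unsigned long long slice_idle = 2;              /* jiffies                 */

    unsigned long long next_start =
        max_ull(last_idle_bklogged +
                hz * service_from_backlogged / max_softrt_rate,
                jiffies + slice_idle + 4);

    printf("soft_rt_next_start = %llu (now = %llu)\n", next_start, jiffies);
    return 0;
}
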
2794 -+/**
2795 -+ * bfq_bfqq_expire - expire a queue.
2796 -+ * @bfqd: device owning the queue.
2797 -+ * @bfqq: the queue to expire.
2798 -+ * @compensate: if true, compensate for the time spent idling.
2799 -+ * @reason: the reason causing the expiration.
2800 -+ *
2801 -+ *
2802 -+ * If the process associated to the queue is slow (i.e., seeky), or in
2803 -+ * case of budget timeout, or, finally, if it is async, we
2804 -+ * artificially charge it an entire budget (independently of the
2805 -+ * actual service it received). As a consequence, the queue will get
2806 -+ * higher timestamps than the correct ones upon reactivation, and
2807 -+ * hence it will be rescheduled as if it had received more service
2808 -+ * than what it actually received. In the end, this class of processes
2809 -+ * will receive less service in proportion to how slowly they consume
2810 -+ * their budgets (and hence how seriously they tend to lower the
2811 -+ * throughput).
2812 -+ *
2813 -+ * In contrast, when a queue expires because it has been idling for
2814 -+ * too much or because it exhausted its budget, we do not touch the
2815 -+ * amount of service it has received. Hence when the queue will be
2816 -+ * reactivated and its timestamps updated, the latter will be in sync
2817 -+ * with the actual service received by the queue until expiration.
2818 -+ *
2819 -+ * Charging a full budget to the first type of queues and the exact
2820 -+ * service to the others has the effect of using the WF2Q+ policy to
2821 -+ * schedule the former on a timeslice basis, without violating the
2822 -+ * service domain guarantees of the latter.
2823 -+ */
2824 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
2825 -+ struct bfq_queue *bfqq,
2826 -+ int compensate,
2827 -+ enum bfqq_expiration reason)
2828 -+{
2829 -+ int slow;
2830 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2831 -+
2832 -+ /* Update disk peak rate for autotuning and check whether the
2833 -+ * process is slow (see bfq_update_peak_rate).
2834 -+ */
2835 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2836 -+
2837 -+ /*
2838 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
2839 -+ * and async queues, to favor sequential sync workloads.
2840 -+ *
2841 -+ * Processes doing IO in the slower disk zones will tend to be
2842 -+ * slow(er) even if not seeky. Hence, since the estimated peak
2843 -+ * rate is actually an average over the disk surface, these
2844 -+ * processes may timeout just for bad luck. To avoid punishing
2845 -+ * them we do not charge a full budget to a process that
2846 -+ * succeeded in consuming at least 2/3 of its budget.
2847 -+ */
2848 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2849 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2850 -+ bfq_bfqq_charge_full_budget(bfqq);
2851 -+
2852 -+ bfqq->service_from_backlogged += bfqq->entity.service;
2853 -+
2854 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2855 -+ bfqq->last_rais_start_finish = jiffies;
2856 -+
2857 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2858 -+ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
2859 -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) {
2860 -+ /*
2861 -+ * If we get here, then the request pattern is
2862 -+ * isochronous (see the comments to the function
2863 -+ * bfq_bfqq_softrt_next_start()). However, if the
2864 -+ * queue still has in-flight requests, then it is
2865 -+ * better to postpone the computation of next_start
2866 -+ * to the next request completion. In fact, if we
2867 -+ * computed it now, then the application might pass
2868 -+ * the greedy-application filter improperly, because
2869 -+ * the arrival of its next request may happen to be
2870 -+ * higher than (jiffies + bfqq->bfqd->bfq_slice_idle)
2871 -+ * not because the application is truly soft real-
2872 -+ * time, but just because the application is currently
2873 -+ * waiting for the completion of some request before
2874 -+ * issuing, as quickly as possible, its next request.
2875 -+ */
2876 -+ if (bfqq->dispatched > 0) {
2877 -+ bfqq->soft_rt_next_start = -1;
2878 -+ bfq_mark_bfqq_softrt_update(bfqq);
2879 -+ } else
2880 -+ bfqq->soft_rt_next_start =
2881 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
2882 -+ } else
2883 -+ bfqq->soft_rt_next_start = -1; /* infinity */
2884 -+ }
2885 -+
2886 -+ bfq_log_bfqq(bfqd, bfqq,
2887 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2888 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2889 -+
2890 -+ /* Increase, decrease or leave budget unchanged according to reason */
2891 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2892 -+ __bfq_bfqq_expire(bfqd, bfqq);
2893 -+}
2894 -+
2895 -+/*
2896 -+ * Budget timeout is not implemented through a dedicated timer, but
2897 -+ * just checked on request arrivals and completions, as well as on
2898 -+ * idle timer expirations.
2899 -+ */
2900 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2901 -+{
2902 -+ if (bfq_bfqq_budget_new(bfqq))
2903 -+ return 0;
2904 -+
2905 -+ if (time_before(jiffies, bfqq->budget_timeout))
2906 -+ return 0;
2907 -+
2908 -+ return 1;
2909 -+}
2910 -+
2911 -+/*
2912 -+ * If we expire a queue that is waiting for the arrival of a new
2913 -+ * request, we may prevent the fictitious timestamp backshifting that
2914 -+ * allows the guarantees of the queue to be preserved (see [1] for
2915 -+ * this tricky aspect). Hence we return true only if this condition
2916 -+ * does not hold, or if the queue is slow enough to deserve only to be
2917 -+ * kicked off for preserving a high throughput.
2918 -+*/
2919 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2920 -+{
2921 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
2922 -+ "may_budget_timeout: wr %d left %d timeout %d",
2923 -+ bfq_bfqq_wait_request(bfqq),
2924 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2925 -+ bfq_bfqq_budget_timeout(bfqq));
2926 -+
2927 -+ return (!bfq_bfqq_wait_request(bfqq) ||
2928 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2929 -+ &&
2930 -+ bfq_bfqq_budget_timeout(bfqq);
2931 -+}
2932 -+
2933 -+/*
2934 -+ * For weight-raised queues issuing sync requests, idling is always performed,
2935 -+ * as this is instrumental in guaranteeing a high fraction of the throughput
2936 -+ * to these queues, and hence in guaranteeing a lower latency for their
2937 -+ * requests. See [1] for details.
2938 -+ *
2939 -+ * For non-weight-raised queues, idling is instead disabled if the device is
2940 -+ * NCQ-enabled and non-rotational, as this boosts the throughput on such
2941 -+ * devices.
2942 -+ */
2943 -+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2944 -+{
2945 -+ struct bfq_data *bfqd = bfqq->bfqd;
2946 -+
2947 -+ return bfq_bfqq_sync(bfqq) && (
2948 -+ bfqq->raising_coeff > 1 ||
2949 -+ (bfq_bfqq_idle_window(bfqq) &&
2950 -+ !(bfqd->hw_tag &&
2951 -+ (blk_queue_nonrot(bfqd->queue) ||
2952 -+ /*
2953 -+ * If there are weight-raised busy queues, then do not idle
2954 -+ * the disk for a sync non-weight-raised queue, and hence
2955 -+ * expire the queue immediately if empty. Combined with the
2956 -+ * timestamping rules of BFQ (see [1] for details), this
2957 -+ * causes sync non-weight-raised queues to get a lower
2958 -+ * fraction of the disk throughput, and hence reduces the rate
2959 -+ * at which the processes associated to these queues ask for
2960 -+ * requests from the request pool.
2961 -+ *
2962 -+ * This is beneficial for weight-raised processes, when the
2963 -+ * system operates in request-pool saturation conditions
2964 -+ * (e.g., in the presence of write hogs). In fact, if
2965 -+ * non-weight-raised processes ask for requests at a lower
2966 -+ * rate, then weight-raised processes have a higher
2967 -+ * probability to get a request from the pool immediately
2968 -+ * (or at least soon) when they need one. Hence they have a
2969 -+ * higher probability to actually get a fraction of the disk
2970 -+ * throughput proportional to their high weight. This is
2971 -+ * especially true with NCQ-enabled drives, which enqueue
2972 -+ * several requests in advance and further reorder
2973 -+ * internally-queued requests.
2974 -+ *
2975 -+ * Mistreating non-weight-raised queues in the above-described
2976 -+ * way, when there are busy weight-raised queues, seems to
2977 -+ * mitigate starvation problems in the presence of heavy write
2978 -+ * workloads and NCQ, and hence to guarantee a higher
2979 -+ * application and system responsiveness in these hostile
2980 -+ * scenarios.
2981 -+ */
2982 -+ bfqd->raised_busy_queues > 0)
2983 -+ )
2984 -+ )
2985 -+ );
2986 -+}
2987 -+
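Stripped of the comments, the condition above reduces to: idle only for sync queues that are either weight-raised, or that have the idle window set and are not on an NCQ-capable device that is non-rotational or that currently has weight-raised busy queues. An equivalent standalone restatement, for illustration only; the function and parameter names below are made up:

#include <stdbool.h>
#include <stdio.h>

static bool must_not_expire(bool sync, int raising_coeff, bool idle_window,
                            bool hw_tag, bool nonrot, int raised_busy_queues)
{
    return sync &&
           (raising_coeff > 1 ||
            (idle_window &&
             !(hw_tag && (nonrot || raised_busy_queues > 0))));
}

int main(void)
{
    /* e.g., sync, not raised, idle window set, NCQ SSD -> do not idle */
    printf("%d\n", must_not_expire(true, 1, true, true, true, 0));
    return 0;
}
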
2988 -+/*
2989 -+ * If the in-service queue is empty, but it is sync and either of the following
2990 -+ * conditions holds, then: 1) the queue must remain in service and cannot be
2991 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
2992 -+ * of a new request for the queue. The conditions are:
2993 -+ * - the device is rotational and not performing NCQ, and the queue has its
2994 -+ * idle window set (in this case, waiting for a new request for the queue
2995 -+ * is likely to boost the disk throughput);
2996 -+ * - the queue is weight-raised (waiting for the request is necessary to
2997 -+ * provide the queue with fairness and latency guarantees, see [1] for
2998 -+ * details).
2999 -+ */
3000 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
3001 -+{
3002 -+ struct bfq_data *bfqd = bfqq->bfqd;
3003 -+
3004 -+ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
3005 -+ bfq_bfqq_must_not_expire(bfqq) &&
3006 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
3007 -+}
3008 -+
3009 -+/*
3010 -+ * Select a queue for service. If we have a current queue in service,
3011 -+ * check whether to continue servicing it, or retrieve and set a new one.
3012 -+ */
3013 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3014 -+{
3015 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3016 -+ struct request *next_rq;
3017 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3018 -+
3019 -+ bfqq = bfqd->in_service_queue;
3020 -+ if (bfqq == NULL)
3021 -+ goto new_queue;
3022 -+
3023 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3024 -+
3025 -+ /*
3026 -+ * If another queue has a request waiting within our mean seek
3027 -+ * distance, let it run. The expire code will check for close
3028 -+ * cooperators and put the close queue at the front of the
3029 -+ * service tree. If possible, merge the expiring queue with the
3030 -+ * new bfqq.
3031 -+ */
3032 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
3033 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
3034 -+ bfq_setup_merge(bfqq, new_bfqq);
3035 -+
3036 -+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
3037 -+ !timer_pending(&bfqd->idle_slice_timer) &&
3038 -+ !bfq_bfqq_must_idle(bfqq))
3039 -+ goto expire;
3040 -+
3041 -+ next_rq = bfqq->next_rq;
3042 -+ /*
3043 -+ * If bfqq has requests queued and it has enough budget left to
3044 -+ * serve them, keep the queue, otherwise expire it.
3045 -+ */
3046 -+ if (next_rq != NULL) {
3047 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
3048 -+ bfq_bfqq_budget_left(bfqq)) {
3049 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
3050 -+ goto expire;
3051 -+ } else {
3052 -+ /*
3053 -+ * The idle timer may be pending because we may not
3054 -+ * disable disk idling even when a new request arrives
3055 -+ */
3056 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
3057 -+ /*
3058 -+ * If we get here: 1) at least a new request
3059 -+ * has arrived but we have not disabled the
3060 -+ * timer because the request was too small,
3061 -+ * 2) then the block layer has unplugged the
3062 -+ * device, causing the dispatch to be invoked.
3063 -+ *
3064 -+ * Since the device is unplugged, now the
3065 -+ * requests are probably large enough to
3066 -+ * provide a reasonable throughput.
3067 -+ * So we disable idling.
3068 -+ */
3069 -+ bfq_clear_bfqq_wait_request(bfqq);
3070 -+ del_timer(&bfqd->idle_slice_timer);
3071 -+ }
3072 -+ if (new_bfqq == NULL)
3073 -+ goto keep_queue;
3074 -+ else
3075 -+ goto expire;
3076 -+ }
3077 -+ }
3078 -+
3079 -+ /*
3080 -+ * No requests pending. If the in-service queue has no cooperator and
3081 -+ * still has requests in flight (possibly waiting for a completion)
3082 -+ * or is idling for a new request, then keep it.
3083 -+ */
3084 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
3085 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
3086 -+ bfqq = NULL;
3087 -+ goto keep_queue;
3088 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
3089 -+ /*
3090 -+ * Expiring the queue because there is a close cooperator,
3091 -+ * cancel timer.
3092 -+ */
3093 -+ bfq_clear_bfqq_wait_request(bfqq);
3094 -+ del_timer(&bfqd->idle_slice_timer);
3095 -+ }
3096 -+
3097 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
3098 -+expire:
3099 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
3100 -+new_queue:
3101 -+ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
3102 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
3103 -+ bfqq != NULL ? bfqq->pid : 0);
3104 -+keep_queue:
3105 -+ return bfqq;
3106 -+}
3107 -+
3108 -+static void bfq_update_raising_data(struct bfq_data *bfqd,
3109 -+ struct bfq_queue *bfqq)
3110 -+{
3111 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
3112 -+ struct bfq_entity *entity = &bfqq->entity;
3113 -+
3114 -+ bfq_log_bfqq(bfqd, bfqq,
3115 -+ "raising period dur %u/%u msec, "
3116 -+ "old raising coeff %u, w %d(%d)",
3117 -+ jiffies_to_msecs(jiffies -
3118 -+ bfqq->last_rais_start_finish),
3119 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
3120 -+ bfqq->raising_coeff,
3121 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
3122 -+
3123 -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
3124 -+ entity->orig_weight * bfqq->raising_coeff);
3125 -+ if (entity->ioprio_changed)
3126 -+ bfq_log_bfqq(bfqd, bfqq,
3127 -+ "WARN: pending prio change");
3128 -+ /*
3129 -+ * If too much time has elapsed from the beginning
3130 -+ * of this weight-raising, stop it.
3131 -+ */
3132 -+ if (jiffies - bfqq->last_rais_start_finish >
3133 -+ bfqq->raising_cur_max_time) {
3134 -+ bfqq->last_rais_start_finish = jiffies;
3135 -+ bfq_log_bfqq(bfqd, bfqq,
3136 -+ "wrais ending at %llu msec,"
3137 -+ "rais_max_time %u",
3138 -+ bfqq->last_rais_start_finish,
3139 -+ jiffies_to_msecs(bfqq->
3140 -+ raising_cur_max_time));
3141 -+ bfq_bfqq_end_raising(bfqq);
3142 -+ __bfq_entity_update_weight_prio(
3143 -+ bfq_entity_service_tree(entity),
3144 -+ entity);
3145 -+ }
3146 -+ }
3147 -+}
3148 -+
3149 -+/*
3150 -+ * Dispatch one request from bfqq, moving it to the request queue
3151 -+ * dispatch list.
3152 -+ */
3153 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
3154 -+ struct bfq_queue *bfqq)
3155 -+{
3156 -+ int dispatched = 0;
3157 -+ struct request *rq;
3158 -+ unsigned long service_to_charge;
3159 -+
3160 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
3161 -+
3162 -+ /* Follow expired path, else get first next available. */
3163 -+ rq = bfq_check_fifo(bfqq);
3164 -+ if (rq == NULL)
3165 -+ rq = bfqq->next_rq;
3166 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
3167 -+
3168 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
3169 -+ /*
3170 -+ * This may happen if the next rq is chosen
3171 -+ * in fifo order instead of sector order.
3172 -+ * The budget is properly dimensioned
3173 -+ * to be always sufficient to serve the next request
3174 -+ * only if it is chosen in sector order. The reason is
3175 -+ * that it would be quite inefficient and of little use
3176 -+ * to always make sure that the budget is large enough
3177 -+ * to serve even the possible next rq in fifo order.
3178 -+ * In fact, requests are seldom served in fifo order.
3179 -+ *
3180 -+ * Expire the queue for budget exhaustion, and
3181 -+ * make sure that the next act_budget is enough
3182 -+ * to serve the next request, even if it comes
3183 -+ * from the fifo expired path.
3184 -+ */
3185 -+ bfqq->next_rq = rq;
3186 -+ /*
3187 -+ * Since this dispatch is failed, make sure that
3188 -+ * a new one will be performed
3189 -+ */
3190 -+ if (!bfqd->rq_in_driver)
3191 -+ bfq_schedule_dispatch(bfqd);
3192 -+ goto expire;
3193 -+ }
3194 -+
3195 -+ /* Finally, insert request into driver dispatch list. */
3196 -+ bfq_bfqq_served(bfqq, service_to_charge);
3197 -+ bfq_dispatch_insert(bfqd->queue, rq);
3198 -+
3199 -+ bfq_update_raising_data(bfqd, bfqq);
3200 -+
3201 -+ bfq_log_bfqq(bfqd, bfqq,
3202 -+ "dispatched %u sec req (%llu), budg left %lu",
3203 -+ blk_rq_sectors(rq),
3204 -+ (long long unsigned)blk_rq_pos(rq),
3205 -+ bfq_bfqq_budget_left(bfqq));
3206 -+
3207 -+ dispatched++;
3208 -+
3209 -+ if (bfqd->in_service_bic == NULL) {
3210 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3211 -+ bfqd->in_service_bic = RQ_BIC(rq);
3212 -+ }
3213 -+
3214 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3215 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3216 -+ bfq_class_idle(bfqq)))
3217 -+ goto expire;
3218 -+
3219 -+ return dispatched;
3220 -+
3221 -+expire:
3222 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3223 -+ return dispatched;
3224 -+}
3225 -+
3226 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3227 -+{
3228 -+ int dispatched = 0;
3229 -+
3230 -+ while (bfqq->next_rq != NULL) {
3231 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3232 -+ dispatched++;
3233 -+ }
3234 -+
3235 -+ BUG_ON(!list_empty(&bfqq->fifo));
3236 -+ return dispatched;
3237 -+}
3238 -+
3239 -+/*
3240 -+ * Drain our current requests. Used for barriers and when switching
3241 -+ * io schedulers on-the-fly.
3242 -+ */
3243 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
3244 -+{
3245 -+ struct bfq_queue *bfqq, *n;
3246 -+ struct bfq_service_tree *st;
3247 -+ int dispatched = 0;
3248 -+
3249 -+ bfqq = bfqd->in_service_queue;
3250 -+ if (bfqq != NULL)
3251 -+ __bfq_bfqq_expire(bfqd, bfqq);
3252 -+
3253 -+ /*
3254 -+ * Loop through classes, and be careful to leave the scheduler
3255 -+ * in a consistent state, as feedback mechanisms and vtime
3256 -+ * updates cannot be disabled during the process.
3257 -+ */
3258 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3259 -+ st = bfq_entity_service_tree(&bfqq->entity);
3260 -+
3261 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3262 -+ bfqq->max_budget = bfq_max_budget(bfqd);
3263 -+
3264 -+ bfq_forget_idle(st);
3265 -+ }
3266 -+
3267 -+ BUG_ON(bfqd->busy_queues != 0);
3268 -+
3269 -+ return dispatched;
3270 -+}
3271 -+
3272 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
3273 -+{
3274 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3275 -+ struct bfq_queue *bfqq;
3276 -+ int max_dispatch;
3277 -+
3278 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3279 -+ if (bfqd->busy_queues == 0)
3280 -+ return 0;
3281 -+
3282 -+ if (unlikely(force))
3283 -+ return bfq_forced_dispatch(bfqd);
3284 -+
3285 -+ bfqq = bfq_select_queue(bfqd);
3286 -+ if (bfqq == NULL)
3287 -+ return 0;
3288 -+
3289 -+ max_dispatch = bfqd->bfq_quantum;
3290 -+ if (bfq_class_idle(bfqq))
3291 -+ max_dispatch = 1;
3292 -+
3293 -+ if (!bfq_bfqq_sync(bfqq))
3294 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
3295 -+
3296 -+ if (bfqq->dispatched >= max_dispatch) {
3297 -+ if (bfqd->busy_queues > 1)
3298 -+ return 0;
3299 -+ if (bfqq->dispatched >= 4 * max_dispatch)
3300 -+ return 0;
3301 -+ }
3302 -+
3303 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3304 -+ return 0;
3305 -+
3306 -+ bfq_clear_bfqq_wait_request(bfqq);
3307 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3308 -+
3309 -+ if (!bfq_dispatch_request(bfqd, bfqq))
3310 -+ return 0;
3311 -+
3312 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
3313 -+ bfqq->pid, max_dispatch);
3314 -+
3315 -+ return 1;
3316 -+}
3317 -+
3318 -+/*
3319 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
3320 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
3321 -+ *
3322 -+ * Queue lock must be held here.
3323 -+ */
3324 -+static void bfq_put_queue(struct bfq_queue *bfqq)
3325 -+{
3326 -+ struct bfq_data *bfqd = bfqq->bfqd;
3327 -+
3328 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3329 -+
3330 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3331 -+ atomic_read(&bfqq->ref));
3332 -+ if (!atomic_dec_and_test(&bfqq->ref))
3333 -+ return;
3334 -+
3335 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3336 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3337 -+ BUG_ON(bfqq->entity.tree != NULL);
3338 -+ BUG_ON(bfq_bfqq_busy(bfqq));
3339 -+ BUG_ON(bfqd->in_service_queue == bfqq);
3340 -+
3341 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3342 -+
3343 -+ kmem_cache_free(bfq_pool, bfqq);
3344 -+}
3345 -+
3346 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
3347 -+{
3348 -+ struct bfq_queue *__bfqq, *next;
3349 -+
3350 -+ /*
3351 -+ * If this queue was scheduled to merge with another queue, be
3352 -+ * sure to drop the reference taken on that queue (and others in
3353 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3354 -+ */
3355 -+ __bfqq = bfqq->new_bfqq;
3356 -+ while (__bfqq) {
3357 -+ if (__bfqq == bfqq) {
3358 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
3359 -+ break;
3360 -+ }
3361 -+ next = __bfqq->new_bfqq;
3362 -+ bfq_put_queue(__bfqq);
3363 -+ __bfqq = next;
3364 -+ }
3365 -+}
3366 -+
3367 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3368 -+{
3369 -+ if (bfqq == bfqd->in_service_queue) {
3370 -+ __bfq_bfqq_expire(bfqd, bfqq);
3371 -+ bfq_schedule_dispatch(bfqd);
3372 -+ }
3373 -+
3374 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3375 -+ atomic_read(&bfqq->ref));
3376 -+
3377 -+ bfq_put_cooperator(bfqq);
3378 -+
3379 -+ bfq_put_queue(bfqq);
3380 -+}
3381 -+
3382 -+static void bfq_init_icq(struct io_cq *icq)
3383 -+{
3384 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3385 -+
3386 -+ bic->ttime.last_end_request = jiffies;
3387 -+}
3388 -+
3389 -+static void bfq_exit_icq(struct io_cq *icq)
3390 -+{
3391 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3392 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
3393 -+
3394 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
3395 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3396 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
3397 -+ }
3398 -+
3399 -+ if (bic->bfqq[BLK_RW_SYNC]) {
3400 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3401 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
3402 -+ }
3403 -+}
3404 -+
3405 -+/*
3406 -+ * Update the entity prio values; note that the new values will not
3407 -+ * be used until the next (re)activation.
3408 -+ */
3409 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3410 -+{
3411 -+ struct task_struct *tsk = current;
3412 -+ int ioprio_class;
3413 -+
3414 -+ if (!bfq_bfqq_prio_changed(bfqq))
3415 -+ return;
3416 -+
3417 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3418 -+ switch (ioprio_class) {
3419 -+ default:
3420 -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3421 -+ "bfq: bad prio %x\n", ioprio_class);
3422 -+ case IOPRIO_CLASS_NONE:
3423 -+ /*
3424 -+ * No prio set, inherit CPU scheduling settings.
3425 -+ */
3426 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3427 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3428 -+ break;
3429 -+ case IOPRIO_CLASS_RT:
3430 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3431 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3432 -+ break;
3433 -+ case IOPRIO_CLASS_BE:
3434 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3435 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3436 -+ break;
3437 -+ case IOPRIO_CLASS_IDLE:
3438 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3439 -+ bfqq->entity.new_ioprio = 7;
3440 -+ bfq_clear_bfqq_idle_window(bfqq);
3441 -+ break;
3442 -+ }
3443 -+
3444 -+ bfqq->entity.ioprio_changed = 1;
3445 -+
3446 -+ /*
3447 -+ * Keep track of original prio settings in case we have to temporarily
3448 -+ * elevate the priority of this queue.
3449 -+ */
3450 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3451 -+ bfq_clear_bfqq_prio_changed(bfqq);
3452 -+}
3453 -+
3454 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3455 -+{
3456 -+ struct bfq_data *bfqd;
3457 -+ struct bfq_queue *bfqq, *new_bfqq;
3458 -+ struct bfq_group *bfqg;
3459 -+ unsigned long uninitialized_var(flags);
3460 -+ int ioprio = bic->icq.ioc->ioprio;
3461 -+
3462 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3463 -+ &flags);
3464 -+ /*
3465 -+ * This condition may trigger on a newly created bic, be sure to drop
3466 -+ * the lock before returning.
3467 -+ */
3468 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3469 -+ goto out;
3470 -+
3471 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
3472 -+ if (bfqq != NULL) {
3473 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3474 -+ sched_data);
3475 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3476 -+ GFP_ATOMIC);
3477 -+ if (new_bfqq != NULL) {
3478 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3479 -+ bfq_log_bfqq(bfqd, bfqq,
3480 -+ "changed_ioprio: bfqq %p %d",
3481 -+ bfqq, atomic_read(&bfqq->ref));
3482 -+ bfq_put_queue(bfqq);
3483 -+ }
3484 -+ }
3485 -+
3486 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
3487 -+ if (bfqq != NULL)
3488 -+ bfq_mark_bfqq_prio_changed(bfqq);
3489 -+
3490 -+ bic->ioprio = ioprio;
3491 -+
3492 -+out:
3493 -+ bfq_put_bfqd_unlock(bfqd, &flags);
3494 -+}
3495 -+
3496 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3497 -+ pid_t pid, int is_sync)
3498 -+{
3499 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3500 -+ INIT_LIST_HEAD(&bfqq->fifo);
3501 -+
3502 -+ atomic_set(&bfqq->ref, 0);
3503 -+ bfqq->bfqd = bfqd;
3504 -+
3505 -+ bfq_mark_bfqq_prio_changed(bfqq);
3506 -+
3507 -+ if (is_sync) {
3508 -+ if (!bfq_class_idle(bfqq))
3509 -+ bfq_mark_bfqq_idle_window(bfqq);
3510 -+ bfq_mark_bfqq_sync(bfqq);
3511 -+ }
3512 -+
3513 -+ /* Tentative initial value to trade off between thr and lat */
3514 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3515 -+ bfqq->pid = pid;
3516 -+
3517 -+ bfqq->raising_coeff = 1;
3518 -+ bfqq->last_rais_start_finish = 0;
3519 -+ bfqq->soft_rt_next_start = -1;
3520 -+}
3521 -+
3522 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3523 -+ struct bfq_group *bfqg,
3524 -+ int is_sync,
3525 -+ struct bfq_io_cq *bic,
3526 -+ gfp_t gfp_mask)
3527 -+{
3528 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3529 -+
3530 -+retry:
3531 -+ /* bic always exists here */
3532 -+ bfqq = bic_to_bfqq(bic, is_sync);
3533 -+
3534 -+ /*
3535 -+ * Always try a new alloc if we fall back to the OOM bfqq
3536 -+ * originally, since it should just be a temporary situation.
3537 -+ */
3538 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3539 -+ bfqq = NULL;
3540 -+ if (new_bfqq != NULL) {
3541 -+ bfqq = new_bfqq;
3542 -+ new_bfqq = NULL;
3543 -+ } else if (gfp_mask & __GFP_WAIT) {
3544 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3545 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3546 -+ gfp_mask | __GFP_ZERO,
3547 -+ bfqd->queue->node);
3548 -+ spin_lock_irq(bfqd->queue->queue_lock);
3549 -+ if (new_bfqq != NULL)
3550 -+ goto retry;
3551 -+ } else {
3552 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
3553 -+ gfp_mask | __GFP_ZERO,
3554 -+ bfqd->queue->node);
3555 -+ }
3556 -+
3557 -+ if (bfqq != NULL) {
3558 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3559 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
3560 -+ } else {
3561 -+ bfqq = &bfqd->oom_bfqq;
3562 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3563 -+ }
3564 -+
3565 -+ bfq_init_prio_data(bfqq, bic);
3566 -+ bfq_init_entity(&bfqq->entity, bfqg);
3567 -+ }
3568 -+
3569 -+ if (new_bfqq != NULL)
3570 -+ kmem_cache_free(bfq_pool, new_bfqq);
3571 -+
3572 -+ return bfqq;
3573 -+}
3574 -+
3575 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3576 -+ struct bfq_group *bfqg,
3577 -+ int ioprio_class, int ioprio)
3578 -+{
3579 -+ switch (ioprio_class) {
3580 -+ case IOPRIO_CLASS_RT:
3581 -+ return &bfqg->async_bfqq[0][ioprio];
3582 -+ case IOPRIO_CLASS_NONE:
3583 -+ ioprio = IOPRIO_NORM;
3584 -+ /* fall through */
3585 -+ case IOPRIO_CLASS_BE:
3586 -+ return &bfqg->async_bfqq[1][ioprio];
3587 -+ case IOPRIO_CLASS_IDLE:
3588 -+ return &bfqg->async_idle_bfqq;
3589 -+ default:
3590 -+ BUG();
3591 -+ }
3592 -+}
3593 -+
3594 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3595 -+ struct bfq_group *bfqg, int is_sync,
3596 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
3597 -+{
3598 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3599 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3600 -+ struct bfq_queue **async_bfqq = NULL;
3601 -+ struct bfq_queue *bfqq = NULL;
3602 -+
3603 -+ if (!is_sync) {
3604 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3605 -+ ioprio);
3606 -+ bfqq = *async_bfqq;
3607 -+ }
3608 -+
3609 -+ if (bfqq == NULL)
3610 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3611 -+
3612 -+ /*
3613 -+ * Pin the queue now that it's allocated, scheduler exit will prune it.
3614 -+ */
3615 -+ if (!is_sync && *async_bfqq == NULL) {
3616 -+ atomic_inc(&bfqq->ref);
3617 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3618 -+ bfqq, atomic_read(&bfqq->ref));
3619 -+ *async_bfqq = bfqq;
3620 -+ }
3621 -+
3622 -+ atomic_inc(&bfqq->ref);
3623 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3624 -+ atomic_read(&bfqq->ref));
3625 -+ return bfqq;
3626 -+}
3627 -+
3628 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3629 -+ struct bfq_io_cq *bic)
3630 -+{
3631 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3632 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3633 -+
3634 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3635 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3636 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3637 -+ bic->ttime.ttime_samples;
3638 -+}
3639 -+
3640 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
3641 -+ struct bfq_queue *bfqq,
3642 -+ struct request *rq)
3643 -+{
3644 -+ sector_t sdist;
3645 -+ u64 total;
3646 -+
3647 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
3648 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3649 -+ else
3650 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3651 -+
3652 -+ /*
3653 -+ * Don't allow the seek distance to get too large from the
3654 -+ * odd fragment, pagein, etc.
3655 -+ */
3656 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3657 -+ sdist = 0;
3658 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
3659 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3660 -+ else
3661 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3662 -+
3663 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3664 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3665 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
3666 -+ do_div(total, bfqq->seek_samples);
3667 -+ bfqq->seek_mean = (sector_t)total;
3668 -+
3669 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3670 -+ (u64)bfqq->seek_mean);
3671 -+}
3672 -+
3673 -+/*
3674 -+ * Disable idle window if the process thinks too long or seeks so much that
3675 -+ * it doesn't matter.
3676 -+ */
3677 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
3678 -+ struct bfq_queue *bfqq,
3679 -+ struct bfq_io_cq *bic)
3680 -+{
3681 -+ int enable_idle;
3682 -+
3683 -+ /* Don't idle for async or idle io prio class. */
3684 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3685 -+ return;
3686 -+
3687 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
3688 -+
3689 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3690 -+ bfqd->bfq_slice_idle == 0 ||
3691 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3692 -+ bfqq->raising_coeff == 1))
3693 -+ enable_idle = 0;
3694 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3695 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3696 -+ bfqq->raising_coeff == 1)
3697 -+ enable_idle = 0;
3698 -+ else
3699 -+ enable_idle = 1;
3700 -+ }
3701 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3702 -+ enable_idle);
3703 -+
3704 -+ if (enable_idle)
3705 -+ bfq_mark_bfqq_idle_window(bfqq);
3706 -+ else
3707 -+ bfq_clear_bfqq_idle_window(bfqq);
3708 -+}
3709 -+
3710 -+/*
3711 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
3712 -+ * something we should do about it.
3713 -+ */
3714 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3715 -+ struct request *rq)
3716 -+{
3717 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
3718 -+
3719 -+ if (rq->cmd_flags & REQ_META)
3720 -+ bfqq->meta_pending++;
3721 -+
3722 -+ bfq_update_io_thinktime(bfqd, bic);
3723 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
3724 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3725 -+ !BFQQ_SEEKY(bfqq))
3726 -+ bfq_update_idle_window(bfqd, bfqq, bic);
3727 -+
3728 -+ bfq_log_bfqq(bfqd, bfqq,
3729 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3730 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3731 -+ (long long unsigned)bfqq->seek_mean);
3732 -+
3733 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3734 -+
3735 -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3736 -+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3737 -+ blk_rq_sectors(rq) < 32;
3738 -+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3739 -+
3740 -+ /*
3741 -+ * There is just this request queued: if the request
3742 -+ * is small and the queue is not to be expired, then
3743 -+ * just exit.
3744 -+ *
3745 -+ * In this way, if the disk is being idled to wait for
3746 -+ * a new request from the in-service queue, we avoid
3747 -+ * unplugging the device and committing the disk to serve
3748 -+ * just a small request. On the contrary, we wait for
3749 -+ * the block layer to decide when to unplug the device:
3750 -+ * hopefully, new requests will be merged to this one
3751 -+ * quickly, then the device will be unplugged and
3752 -+ * larger requests will be dispatched.
3753 -+ */
3754 -+ if (small_req && !budget_timeout)
3755 -+ return;
3756 -+
3757 -+ /*
3758 -+ * A large enough request arrived, or the queue is to
3759 -+ * be expired: in both cases disk idling is to be
3760 -+ * stopped, so clear wait_request flag and reset
3761 -+ * timer.
3762 -+ */
3763 -+ bfq_clear_bfqq_wait_request(bfqq);
3764 -+ del_timer(&bfqd->idle_slice_timer);
3765 -+
3766 -+ /*
3767 -+ * The queue is not empty, because a new request just
3768 -+ * arrived. Hence we can safely expire the queue, in
3769 -+ * case of budget timeout, without risking that the
3770 -+ * timestamps of the queue are not updated correctly.
3771 -+ * See [1] for more details.
3772 -+ */
3773 -+ if (budget_timeout)
3774 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3775 -+
3776 -+ /*
3777 -+ * Let the request rip immediately, or let a new queue be
3778 -+ * selected if bfqq has just been expired.
3779 -+ */
3780 -+ __blk_run_queue(bfqd->queue);
3781 -+ }
3782 -+}
3783 -+
3784 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
3785 -+{
3786 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3787 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3788 -+
3789 -+ assert_spin_locked(bfqd->queue->queue_lock);
3790 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3791 -+
3792 -+ bfq_add_rq_rb(rq);
3793 -+
3794 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3795 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
3796 -+
3797 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
3798 -+}
3799 -+
3800 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
3801 -+{
3802 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3803 -+ bfqd->rq_in_driver);
3804 -+
3805 -+ if (bfqd->hw_tag == 1)
3806 -+ return;
3807 -+
3808 -+ /*
3809 -+ * This sample is valid if the number of outstanding requests
3810 -+ * is large enough to allow a queueing behavior. Note that the
3811 -+ * sum is not exact, as it's not taking into account deactivated
3812 -+ * requests.
3813 -+ */
3814 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3815 -+ return;
3816 -+
3817 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3818 -+ return;
3819 -+
3820 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3821 -+ bfqd->max_rq_in_driver = 0;
3822 -+ bfqd->hw_tag_samples = 0;
3823 -+}
3824 -+
3825 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
3826 -+{
3827 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3828 -+ struct bfq_data *bfqd = bfqq->bfqd;
3829 -+ const int sync = rq_is_sync(rq);
3830 -+
3831 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3832 -+ blk_rq_sectors(rq), sync);
3833 -+
3834 -+ bfq_update_hw_tag(bfqd);
3835 -+
3836 -+ WARN_ON(!bfqd->rq_in_driver);
3837 -+ WARN_ON(!bfqq->dispatched);
3838 -+ bfqd->rq_in_driver--;
3839 -+ bfqq->dispatched--;
3840 -+
3841 -+ if (bfq_bfqq_sync(bfqq))
3842 -+ bfqd->sync_flight--;
3843 -+
3844 -+ if (sync)
3845 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3846 -+
3847 -+ /*
3848 -+ * The computation of softrt_next_start was scheduled for the next
3849 -+ * request completion: it is now time to compute it.
3850 -+ */
3851 -+ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
3852 -+ bfqq->soft_rt_next_start =
3853 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3854 -+
3855 -+ /*
3856 -+ * If this is the in-service queue, check if it needs to be expired,
3857 -+ * or if we want to idle in case it has no pending requests.
3858 -+ */
3859 -+ if (bfqd->in_service_queue == bfqq) {
3860 -+ if (bfq_bfqq_budget_new(bfqq))
3861 -+ bfq_set_budget_timeout(bfqd);
3862 -+
3863 -+ if (bfq_bfqq_must_idle(bfqq)) {
3864 -+ bfq_arm_slice_timer(bfqd);
3865 -+ goto out;
3866 -+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
3867 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3868 -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3869 -+ (bfqq->dispatched == 0 ||
3870 -+ !bfq_bfqq_must_not_expire(bfqq)))
3871 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
3872 -+ BFQ_BFQQ_NO_MORE_REQUESTS);
3873 -+ }
3874 -+
3875 -+ if (!bfqd->rq_in_driver)
3876 -+ bfq_schedule_dispatch(bfqd);
3877 -+
3878 -+out:
3879 -+ return;
3880 -+}
3881 -+
3882 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3883 -+{
3884 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3885 -+ bfq_clear_bfqq_must_alloc(bfqq);
3886 -+ return ELV_MQUEUE_MUST;
3887 -+ }
3888 -+
3889 -+ return ELV_MQUEUE_MAY;
3890 -+}
3891 -+
3892 -+static int bfq_may_queue(struct request_queue *q, int rw)
3893 -+{
3894 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3895 -+ struct task_struct *tsk = current;
3896 -+ struct bfq_io_cq *bic;
3897 -+ struct bfq_queue *bfqq;
3898 -+
3899 -+ /*
3900 -+ * Don't force setup of a queue from here, as a call to may_queue
3901 -+ * does not necessarily imply that a request actually will be queued.
3902 -+ * So just lookup a possibly existing queue, or return 'may queue'
3903 -+ * if that fails.
3904 -+ */
3905 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3906 -+ if (bic == NULL)
3907 -+ return ELV_MQUEUE_MAY;
3908 -+
3909 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3910 -+ if (bfqq != NULL) {
3911 -+ bfq_init_prio_data(bfqq, bic);
3912 -+
3913 -+ return __bfq_may_queue(bfqq);
3914 -+ }
3915 -+
3916 -+ return ELV_MQUEUE_MAY;
3917 -+}
3918 -+
3919 -+/*
3920 -+ * Queue lock held here.
3921 -+ */
3922 -+static void bfq_put_request(struct request *rq)
3923 -+{
3924 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3925 -+
3926 -+ if (bfqq != NULL) {
3927 -+ const int rw = rq_data_dir(rq);
3928 -+
3929 -+ BUG_ON(!bfqq->allocated[rw]);
3930 -+ bfqq->allocated[rw]--;
3931 -+
3932 -+ rq->elv.priv[0] = NULL;
3933 -+ rq->elv.priv[1] = NULL;
3934 -+
3935 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3936 -+ bfqq, atomic_read(&bfqq->ref));
3937 -+ bfq_put_queue(bfqq);
3938 -+ }
3939 -+}
3940 -+
3941 -+static struct bfq_queue *
3942 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3943 -+ struct bfq_queue *bfqq)
3944 -+{
3945 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3946 -+ (long unsigned)bfqq->new_bfqq->pid);
3947 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3948 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3949 -+ bfq_put_queue(bfqq);
3950 -+ return bic_to_bfqq(bic, 1);
3951 -+}
3952 -+
3953 -+/*
3954 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3955 -+ * was the last process referring to said bfqq.
3956 -+ */
3957 -+static struct bfq_queue *
3958 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3959 -+{
3960 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3961 -+ if (bfqq_process_refs(bfqq) == 1) {
3962 -+ bfqq->pid = current->pid;
3963 -+ bfq_clear_bfqq_coop(bfqq);
3964 -+ bfq_clear_bfqq_split_coop(bfqq);
3965 -+ return bfqq;
3966 -+ }
3967 -+
3968 -+ bic_set_bfqq(bic, NULL, 1);
3969 -+
3970 -+ bfq_put_cooperator(bfqq);
3971 -+
3972 -+ bfq_put_queue(bfqq);
3973 -+ return NULL;
3974 -+}
3975 -+
3976 -+/*
3977 -+ * Allocate bfq data structures associated with this request.
3978 -+ */
3979 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
3980 -+ struct bio *bio, gfp_t gfp_mask)
3981 -+{
3982 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3983 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3984 -+ const int rw = rq_data_dir(rq);
3985 -+ const int is_sync = rq_is_sync(rq);
3986 -+ struct bfq_queue *bfqq;
3987 -+ struct bfq_group *bfqg;
3988 -+ unsigned long flags;
3989 -+
3990 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
3991 -+
3992 -+ bfq_changed_ioprio(bic);
3993 -+
3994 -+ spin_lock_irqsave(q->queue_lock, flags);
3995 -+
3996 -+ if (bic == NULL)
3997 -+ goto queue_fail;
3998 -+
3999 -+ bfqg = bfq_bic_update_cgroup(bic);
4000 -+
4001 -+new_queue:
4002 -+ bfqq = bic_to_bfqq(bic, is_sync);
4003 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
4004 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
4005 -+ bic_set_bfqq(bic, bfqq, is_sync);
4006 -+ } else {
4007 -+ /*
4008 -+ * If the queue was seeky for too long, break it apart.
4009 -+ */
4010 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4011 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4012 -+ bfqq = bfq_split_bfqq(bic, bfqq);
4013 -+ if (!bfqq)
4014 -+ goto new_queue;
4015 -+ }
4016 -+
4017 -+ /*
4018 -+ * Check to see if this queue is scheduled to merge with
4019 -+ * another closely cooperating queue. The merging of queues
4020 -+ * happens here as it must be done in process context.
4021 -+ * The reference on new_bfqq was taken in merge_bfqqs.
4022 -+ */
4023 -+ if (bfqq->new_bfqq != NULL)
4024 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
4025 -+ }
4026 -+
4027 -+ bfqq->allocated[rw]++;
4028 -+ atomic_inc(&bfqq->ref);
4029 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
4030 -+ atomic_read(&bfqq->ref));
4031 -+
4032 -+ rq->elv.priv[0] = bic;
4033 -+ rq->elv.priv[1] = bfqq;
4034 -+
4035 -+ spin_unlock_irqrestore(q->queue_lock, flags);
4036 -+
4037 -+ return 0;
4038 -+
4039 -+queue_fail:
4040 -+ bfq_schedule_dispatch(bfqd);
4041 -+ spin_unlock_irqrestore(q->queue_lock, flags);
4042 -+
4043 -+ return 1;
4044 -+}
4045 -+
4046 -+static void bfq_kick_queue(struct work_struct *work)
4047 -+{
4048 -+ struct bfq_data *bfqd =
4049 -+ container_of(work, struct bfq_data, unplug_work);
4050 -+ struct request_queue *q = bfqd->queue;
4051 -+
4052 -+ spin_lock_irq(q->queue_lock);
4053 -+ __blk_run_queue(q);
4054 -+ spin_unlock_irq(q->queue_lock);
4055 -+}
4056 -+
4057 -+/*
4058 -+ * Handler of the expiration of the timer running if the in-service queue
4059 -+ * is idling inside its time slice.
4060 -+ */
4061 -+static void bfq_idle_slice_timer(unsigned long data)
4062 -+{
4063 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
4064 -+ struct bfq_queue *bfqq;
4065 -+ unsigned long flags;
4066 -+ enum bfqq_expiration reason;
4067 -+
4068 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
4069 -+
4070 -+ bfqq = bfqd->in_service_queue;
4071 -+ /*
4072 -+ * Theoretical race here: the in-service queue can be NULL or different
4073 -+ * from the queue that was idling if the timer handler spins on
4074 -+ * the queue_lock and a new request arrives for the current
4075 -+ * queue and there is a full dispatch cycle that changes the
4076 -+ * in-service queue. This can hardly happen, but in the worst case
4077 -+ * we just expire a queue too early.
4078 -+ */
4079 -+ if (bfqq != NULL) {
4080 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
4081 -+ if (bfq_bfqq_budget_timeout(bfqq))
4082 -+ /*
4083 -+ * Also here the queue can be safely expired
4084 -+ * for budget timeout without wasting
4085 -+ * guarantees
4086 -+ */
4087 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
4088 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4089 -+ /*
4090 -+ * The queue may not be empty upon timer expiration,
4091 -+ * because we may not disable the timer when the first
4092 -+ * request of the in-service queue arrives during
4093 -+ * disk idling
4094 -+ */
4095 -+ reason = BFQ_BFQQ_TOO_IDLE;
4096 -+ else
4097 -+ goto schedule_dispatch;
4098 -+
4099 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
4100 -+ }
4101 -+
4102 -+schedule_dispatch:
4103 -+ bfq_schedule_dispatch(bfqd);
4104 -+
4105 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
4106 -+}
4107 -+
4108 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
4109 -+{
4110 -+ del_timer_sync(&bfqd->idle_slice_timer);
4111 -+ cancel_work_sync(&bfqd->unplug_work);
4112 -+}
4113 -+
4114 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4115 -+ struct bfq_queue **bfqq_ptr)
4116 -+{
4117 -+ struct bfq_group *root_group = bfqd->root_group;
4118 -+ struct bfq_queue *bfqq = *bfqq_ptr;
4119 -+
4120 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4121 -+ if (bfqq != NULL) {
4122 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
4123 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4124 -+ bfqq, atomic_read(&bfqq->ref));
4125 -+ bfq_put_queue(bfqq);
4126 -+ *bfqq_ptr = NULL;
4127 -+ }
4128 -+}
4129 -+
4130 -+/*
4131 -+ * Release all the bfqg references to its async queues. If we are
4132 -+ * deallocating the group these queues may still contain requests, so
4133 -+ * we reparent them to the root cgroup (i.e., the only one that will
4134 -+ * exist for sure until all the requests on a device are gone).
4135 -+ */
4136 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4137 -+{
4138 -+ int i, j;
4139 -+
4140 -+ for (i = 0; i < 2; i++)
4141 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
4142 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4143 -+
4144 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4145 -+}
4146 -+
4147 -+static void bfq_exit_queue(struct elevator_queue *e)
4148 -+{
4149 -+ struct bfq_data *bfqd = e->elevator_data;
4150 -+ struct request_queue *q = bfqd->queue;
4151 -+ struct bfq_queue *bfqq, *n;
4152 -+
4153 -+ bfq_shutdown_timer_wq(bfqd);
4154 -+
4155 -+ spin_lock_irq(q->queue_lock);
4156 -+
4157 -+ BUG_ON(bfqd->in_service_queue != NULL);
4158 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4159 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
4160 -+
4161 -+ bfq_disconnect_groups(bfqd);
4162 -+ spin_unlock_irq(q->queue_lock);
4163 -+
4164 -+ bfq_shutdown_timer_wq(bfqd);
4165 -+
4166 -+ synchronize_rcu();
4167 -+
4168 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
4169 -+
4170 -+ bfq_free_root_group(bfqd);
4171 -+ kfree(bfqd);
4172 -+}
4173 -+
4174 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4175 -+{
4176 -+ struct bfq_group *bfqg;
4177 -+ struct bfq_data *bfqd;
4178 -+ struct elevator_queue *eq;
4179 -+
4180 -+ eq = elevator_alloc(q, e);
4181 -+ if (eq == NULL)
4182 -+ return -ENOMEM;
4183 -+
4184 -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4185 -+ if (bfqd == NULL) {
4186 -+ kobject_put(&eq->kobj);
4187 -+ return -ENOMEM;
4188 -+ }
4189 -+ eq->elevator_data = bfqd;
4190 -+
4191 -+ /*
4192 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4193 -+ * Grab a permanent reference to it, so that the normal code flow
4194 -+ * will not attempt to free it.
4195 -+ */
4196 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
4197 -+ atomic_inc(&bfqd->oom_bfqq.ref);
4198 -+
4199 -+ bfqd->queue = q;
4200 -+
4201 -+ spin_lock_irq(q->queue_lock);
4202 -+ q->elevator = eq;
4203 -+ spin_unlock_irq(q->queue_lock);
4204 -+
4205 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
4206 -+ if (bfqg == NULL) {
4207 -+ kfree(bfqd);
4208 -+ kobject_put(&eq->kobj);
4209 -+ return -ENOMEM;
4210 -+ }
4211 -+
4212 -+ bfqd->root_group = bfqg;
4213 -+
4214 -+ init_timer(&bfqd->idle_slice_timer);
4215 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4216 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4217 -+
4218 -+ bfqd->rq_pos_tree = RB_ROOT;
4219 -+
4220 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4221 -+
4222 -+ INIT_LIST_HEAD(&bfqd->active_list);
4223 -+ INIT_LIST_HEAD(&bfqd->idle_list);
4224 -+
4225 -+ bfqd->hw_tag = -1;
4226 -+
4227 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
4228 -+
4229 -+ bfqd->bfq_quantum = bfq_quantum;
4230 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4231 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4232 -+ bfqd->bfq_back_max = bfq_back_max;
4233 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
4234 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
4235 -+ bfqd->bfq_class_idle_last_service = 0;
4236 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4237 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4238 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4239 -+
4240 -+ bfqd->low_latency = true;
4241 -+
4242 -+ bfqd->bfq_raising_coeff = 20;
4243 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4244 -+ bfqd->bfq_raising_max_time = 0;
4245 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4246 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4247 -+ bfqd->bfq_raising_max_softrt_rate = 7000; /*
4248 -+ * Approximate rate required
4249 -+ * to playback or record a
4250 -+ * high-definition compressed
4251 -+ * video.
4252 -+ */
4253 -+ bfqd->raised_busy_queues = 0;
4254 -+
4255 -+ /* Initially estimate the device's peak rate as the reference rate */
4256 -+ if (blk_queue_nonrot(bfqd->queue)) {
4257 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
4258 -+ bfqd->peak_rate = R_nonrot;
4259 -+ } else {
4260 -+ bfqd->RT_prod = R_rot * T_rot;
4261 -+ bfqd->peak_rate = R_rot;
4262 -+ }
4263 -+
4264 -+ return 0;
4265 -+}
4266 -+
4267 -+static void bfq_slab_kill(void)
4268 -+{
4269 -+ if (bfq_pool != NULL)
4270 -+ kmem_cache_destroy(bfq_pool);
4271 -+}
4272 -+
4273 -+static int __init bfq_slab_setup(void)
4274 -+{
4275 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4276 -+ if (bfq_pool == NULL)
4277 -+ return -ENOMEM;
4278 -+ return 0;
4279 -+}
4280 -+
4281 -+static ssize_t bfq_var_show(unsigned int var, char *page)
4282 -+{
4283 -+ return sprintf(page, "%d\n", var);
4284 -+}
4285 -+
4286 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4287 -+{
4288 -+ unsigned long new_val;
4289 -+ int ret = kstrtoul(page, 10, &new_val);
4290 -+
4291 -+ if (ret == 0)
4292 -+ *var = new_val;
4293 -+
4294 -+ return count;
4295 -+}
4296 -+
4297 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4298 -+{
4299 -+ struct bfq_data *bfqd = e->elevator_data;
4300 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4301 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4302 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4303 -+}
4304 -+
4305 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4306 -+{
4307 -+ struct bfq_queue *bfqq;
4308 -+ struct bfq_data *bfqd = e->elevator_data;
4309 -+ ssize_t num_char = 0;
4310 -+
4311 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4312 -+ bfqd->queued);
4313 -+
4314 -+ spin_lock_irq(bfqd->queue->queue_lock);
4315 -+
4316 -+ num_char += sprintf(page + num_char, "Active:\n");
4317 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4318 -+ num_char += sprintf(page + num_char,
4319 -+ "pid%d: weight %hu, nr_queued %d %d,"
4320 -+ " dur %d/%u\n",
4321 -+ bfqq->pid,
4322 -+ bfqq->entity.weight,
4323 -+ bfqq->queued[0],
4324 -+ bfqq->queued[1],
4325 -+ jiffies_to_msecs(jiffies -
4326 -+ bfqq->last_rais_start_finish),
4327 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4328 -+ }
4329 -+
4330 -+ num_char += sprintf(page + num_char, "Idle:\n");
4331 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4332 -+ num_char += sprintf(page + num_char,
4333 -+ "pid%d: weight %hu, dur %d/%u\n",
4334 -+ bfqq->pid,
4335 -+ bfqq->entity.weight,
4336 -+ jiffies_to_msecs(jiffies -
4337 -+ bfqq->last_rais_start_finish),
4338 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4339 -+ }
4340 -+
4341 -+ spin_unlock_irq(bfqd->queue->queue_lock);
4342 -+
4343 -+ return num_char;
4344 -+}
4345 -+
4346 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4347 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4348 -+{ \
4349 -+ struct bfq_data *bfqd = e->elevator_data; \
4350 -+ unsigned int __data = __VAR; \
4351 -+ if (__CONV) \
4352 -+ __data = jiffies_to_msecs(__data); \
4353 -+ return bfq_var_show(__data, (page)); \
4354 -+}
4355 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4356 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4357 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4358 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4359 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4360 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4361 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4362 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4363 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4364 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4365 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4366 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4367 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4368 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4369 -+ 1);
4370 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4371 -+ bfqd->bfq_raising_min_inter_arr_async,
4372 -+ 1);
4373 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4374 -+ bfqd->bfq_raising_max_softrt_rate, 0);
4375 -+#undef SHOW_FUNCTION
4376 -+
4377 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4378 -+static ssize_t \
4379 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4380 -+{ \
4381 -+ struct bfq_data *bfqd = e->elevator_data; \
4382 -+ unsigned long uninitialized_var(__data); \
4383 -+ int ret = bfq_var_store(&__data, (page), count); \
4384 -+ if (__data < (MIN)) \
4385 -+ __data = (MIN); \
4386 -+ else if (__data > (MAX)) \
4387 -+ __data = (MAX); \
4388 -+ if (__CONV) \
4389 -+ *(__PTR) = msecs_to_jiffies(__data); \
4390 -+ else \
4391 -+ *(__PTR) = __data; \
4392 -+ return ret; \
4393 -+}
4394 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4395 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4396 -+ INT_MAX, 1);
4397 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4398 -+ INT_MAX, 1);
4399 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4400 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4401 -+ INT_MAX, 0);
4402 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4403 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4404 -+ 1, INT_MAX, 0);
4405 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4406 -+ INT_MAX, 1);
4407 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4408 -+ INT_MAX, 0);
4409 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4410 -+ INT_MAX, 1);
4411 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4412 -+ INT_MAX, 1);
4413 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
4414 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4415 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4416 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4417 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4418 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4419 -+#undef STORE_FUNCTION
4420 -+
4421 -+/* do nothing for the moment */
4422 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
4423 -+ const char *page, size_t count)
4424 -+{
4425 -+ return count;
4426 -+}
4427 -+
4428 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4429 -+{
4430 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4431 -+
4432 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4433 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4434 -+ else
4435 -+ return bfq_default_max_budget;
4436 -+}
4437 -+
4438 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4439 -+ const char *page, size_t count)
4440 -+{
4441 -+ struct bfq_data *bfqd = e->elevator_data;
4442 -+ unsigned long uninitialized_var(__data);
4443 -+ int ret = bfq_var_store(&__data, (page), count);
4444 -+
4445 -+ if (__data == 0)
4446 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4447 -+ else {
4448 -+ if (__data > INT_MAX)
4449 -+ __data = INT_MAX;
4450 -+ bfqd->bfq_max_budget = __data;
4451 -+ }
4452 -+
4453 -+ bfqd->bfq_user_max_budget = __data;
4454 -+
4455 -+ return ret;
4456 -+}
4457 -+
4458 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4459 -+ const char *page, size_t count)
4460 -+{
4461 -+ struct bfq_data *bfqd = e->elevator_data;
4462 -+ unsigned long uninitialized_var(__data);
4463 -+ int ret = bfq_var_store(&__data, (page), count);
4464 -+
4465 -+ if (__data < 1)
4466 -+ __data = 1;
4467 -+ else if (__data > INT_MAX)
4468 -+ __data = INT_MAX;
4469 -+
4470 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4471 -+ if (bfqd->bfq_user_max_budget == 0)
4472 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4473 -+
4474 -+ return ret;
4475 -+}
4476 -+
4477 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4478 -+ const char *page, size_t count)
4479 -+{
4480 -+ struct bfq_data *bfqd = e->elevator_data;
4481 -+ unsigned long uninitialized_var(__data);
4482 -+ int ret = bfq_var_store(&__data, (page), count);
4483 -+
4484 -+ if (__data > 1)
4485 -+ __data = 1;
4486 -+ if (__data == 0 && bfqd->low_latency != 0)
4487 -+ bfq_end_raising(bfqd);
4488 -+ bfqd->low_latency = __data;
4489 -+
4490 -+ return ret;
4491 -+}
4492 -+
4493 -+#define BFQ_ATTR(name) \
4494 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4495 -+
4496 -+static struct elv_fs_entry bfq_attrs[] = {
4497 -+ BFQ_ATTR(quantum),
4498 -+ BFQ_ATTR(fifo_expire_sync),
4499 -+ BFQ_ATTR(fifo_expire_async),
4500 -+ BFQ_ATTR(back_seek_max),
4501 -+ BFQ_ATTR(back_seek_penalty),
4502 -+ BFQ_ATTR(slice_idle),
4503 -+ BFQ_ATTR(max_budget),
4504 -+ BFQ_ATTR(max_budget_async_rq),
4505 -+ BFQ_ATTR(timeout_sync),
4506 -+ BFQ_ATTR(timeout_async),
4507 -+ BFQ_ATTR(low_latency),
4508 -+ BFQ_ATTR(raising_coeff),
4509 -+ BFQ_ATTR(raising_max_time),
4510 -+ BFQ_ATTR(raising_rt_max_time),
4511 -+ BFQ_ATTR(raising_min_idle_time),
4512 -+ BFQ_ATTR(raising_min_inter_arr_async),
4513 -+ BFQ_ATTR(raising_max_softrt_rate),
4514 -+ BFQ_ATTR(weights),
4515 -+ __ATTR_NULL
4516 -+};
4517 -+
4518 -+static struct elevator_type iosched_bfq = {
4519 -+ .ops = {
4520 -+ .elevator_merge_fn = bfq_merge,
4521 -+ .elevator_merged_fn = bfq_merged_request,
4522 -+ .elevator_merge_req_fn = bfq_merged_requests,
4523 -+ .elevator_allow_merge_fn = bfq_allow_merge,
4524 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
4525 -+ .elevator_add_req_fn = bfq_insert_request,
4526 -+ .elevator_activate_req_fn = bfq_activate_request,
4527 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
4528 -+ .elevator_completed_req_fn = bfq_completed_request,
4529 -+ .elevator_former_req_fn = elv_rb_former_request,
4530 -+ .elevator_latter_req_fn = elv_rb_latter_request,
4531 -+ .elevator_init_icq_fn = bfq_init_icq,
4532 -+ .elevator_exit_icq_fn = bfq_exit_icq,
4533 -+ .elevator_set_req_fn = bfq_set_request,
4534 -+ .elevator_put_req_fn = bfq_put_request,
4535 -+ .elevator_may_queue_fn = bfq_may_queue,
4536 -+ .elevator_init_fn = bfq_init_queue,
4537 -+ .elevator_exit_fn = bfq_exit_queue,
4538 -+ },
4539 -+ .icq_size = sizeof(struct bfq_io_cq),
4540 -+ .icq_align = __alignof__(struct bfq_io_cq),
4541 -+ .elevator_attrs = bfq_attrs,
4542 -+ .elevator_name = "bfq",
4543 -+ .elevator_owner = THIS_MODULE,
4544 -+};
4545 -+
4546 -+static int __init bfq_init(void)
4547 -+{
4548 -+ /*
4549 -+ * Can be 0 on HZ < 1000 setups.
4550 -+ */
4551 -+ if (bfq_slice_idle == 0)
4552 -+ bfq_slice_idle = 1;
4553 -+
4554 -+ if (bfq_timeout_async == 0)
4555 -+ bfq_timeout_async = 1;
4556 -+
4557 -+ if (bfq_slab_setup())
4558 -+ return -ENOMEM;
4559 -+
4560 -+ elv_register(&iosched_bfq);
4561 -+ printk(KERN_INFO "BFQ I/O-scheduler version: v7");
4562 -+
4563 -+ return 0;
4564 -+}
4565 -+
4566 -+static void __exit bfq_exit(void)
4567 -+{
4568 -+ elv_unregister(&iosched_bfq);
4569 -+ bfq_slab_kill();
4570 -+}
4571 -+
4572 -+module_init(bfq_init);
4573 -+module_exit(bfq_exit);
4574 -+
4575 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4576 -+MODULE_LICENSE("GPL");
4577 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4578 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4579 -new file mode 100644
4580 -index 0000000..30df81c
4581 ---- /dev/null
4582 -+++ b/block/bfq-sched.c
4583 -@@ -0,0 +1,1077 @@
4584 -+/*
4585 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
4586 -+ *
4587 -+ * Based on ideas and code from CFQ:
4588 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4589 -+ *
4590 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4591 -+ * Paolo Valente <paolo.valente@×××××××.it>
4592 -+ *
4593 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4594 -+ */
4595 -+
4596 -+#ifdef CONFIG_CGROUP_BFQIO
4597 -+#define for_each_entity(entity) \
4598 -+ for (; entity != NULL; entity = entity->parent)
4599 -+
4600 -+#define for_each_entity_safe(entity, parent) \
4601 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4602 -+
4603 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4604 -+ int extract,
4605 -+ struct bfq_data *bfqd);
4606 -+
4607 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4608 -+{
4609 -+ struct bfq_entity *bfqg_entity;
4610 -+ struct bfq_group *bfqg;
4611 -+ struct bfq_sched_data *group_sd;
4612 -+
4613 -+ BUG_ON(next_active == NULL);
4614 -+
4615 -+ group_sd = next_active->sched_data;
4616 -+
4617 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4618 -+ /*
4619 -+ * bfq_group's my_entity field is not NULL only if the group
4620 -+ * is not the root group. We must not touch the root entity
4621 -+ * as it must never become an active entity.
4622 -+ */
4623 -+ bfqg_entity = bfqg->my_entity;
4624 -+ if (bfqg_entity != NULL)
4625 -+ bfqg_entity->budget = next_active->budget;
4626 -+}
4627 -+
4628 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
4629 -+{
4630 -+ struct bfq_entity *next_active;
4631 -+
4632 -+ if (sd->active_entity != NULL)
4633 -+ /* will update/requeue at the end of service */
4634 -+ return 0;
4635 -+
4636 -+ /*
4637 -+ * NOTE: this can be improved in many ways, such as returning
4638 -+ * 1 (and thus propagating upwards the update) only when the
4639 -+ * budget changes, or caching the bfqq that will be scheduled
4640 -+ * next from this subtree. By now we worry more about
4641 -+ * correctness than about performance...
4642 -+ */
4643 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4644 -+ sd->next_active = next_active;
4645 -+
4646 -+ if (next_active != NULL)
4647 -+ bfq_update_budget(next_active);
4648 -+
4649 -+ return 1;
4650 -+}
4651 -+
4652 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4653 -+ struct bfq_entity *entity)
4654 -+{
4655 -+ BUG_ON(sd->next_active != entity);
4656 -+}
4657 -+#else
4658 -+#define for_each_entity(entity) \
4659 -+ for (; entity != NULL; entity = NULL)
4660 -+
4661 -+#define for_each_entity_safe(entity, parent) \
4662 -+ for (parent = NULL; entity != NULL; entity = parent)
4663 -+
4664 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4665 -+{
4666 -+ return 0;
4667 -+}
4668 -+
4669 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4670 -+ struct bfq_entity *entity)
4671 -+{
4672 -+}
4673 -+
4674 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4675 -+{
4676 -+}
4677 -+#endif
4678 -+
4679 -+/*
4680 -+ * Shift for timestamp calculations. This actually limits the maximum
4681 -+ * service allowed in one timestamp delta (small shift values increase it),
4682 -+ * the maximum total weight that can be used for the queues in the system
4683 -+ * (big shift values increase it), and the period of virtual time wraparounds.
4684 -+ */
4685 -+#define WFQ_SERVICE_SHIFT 22
4686 -+
4687 -+/**
4688 -+ * bfq_gt - compare two timestamps.
4689 -+ * @a: first ts.
4690 -+ * @b: second ts.
4691 -+ *
4692 -+ * Return @a > @b, dealing with wrapping correctly.
4693 -+ */
4694 -+static inline int bfq_gt(u64 a, u64 b)
4695 -+{
4696 -+ return (s64)(a - b) > 0;
4697 -+}
4698 -+
4699 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4700 -+{
4701 -+ struct bfq_queue *bfqq = NULL;
4702 -+
4703 -+ BUG_ON(entity == NULL);
4704 -+
4705 -+ if (entity->my_sched_data == NULL)
4706 -+ bfqq = container_of(entity, struct bfq_queue, entity);
4707 -+
4708 -+ return bfqq;
4709 -+}
4710 -+
4711 -+
4712 -+/**
4713 -+ * bfq_delta - map service into the virtual time domain.
4714 -+ * @service: amount of service.
4715 -+ * @weight: scale factor (weight of an entity or weight sum).
4716 -+ */
4717 -+static inline u64 bfq_delta(unsigned long service,
4718 -+ unsigned long weight)
4719 -+{
4720 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4721 -+
4722 -+ do_div(d, weight);
4723 -+ return d;
4724 -+}
4725 -+
4726 -+/**
4727 -+ * bfq_calc_finish - assign the finish time to an entity.
4728 -+ * @entity: the entity to act upon.
4729 -+ * @service: the service to be charged to the entity.
4730 -+ */
4731 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
4732 -+ unsigned long service)
4733 -+{
4734 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4735 -+
4736 -+ BUG_ON(entity->weight == 0);
4737 -+
4738 -+ entity->finish = entity->start +
4739 -+ bfq_delta(service, entity->weight);
4740 -+
4741 -+ if (bfqq != NULL) {
4742 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4743 -+ "calc_finish: serv %lu, w %d",
4744 -+ service, entity->weight);
4745 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4746 -+ "calc_finish: start %llu, finish %llu, delta %llu",
4747 -+ entity->start, entity->finish,
4748 -+ bfq_delta(service, entity->weight));
4749 -+ }
4750 -+}
4751 -+
4752 -+/**
4753 -+ * bfq_entity_of - get an entity from a node.
4754 -+ * @node: the node field of the entity.
4755 -+ *
4756 -+ * Convert a node pointer to the relative entity. This is used only
4757 -+ * to simplify the logic of some functions and not as the generic
4758 -+ * conversion mechanism because, e.g., in the tree walking functions,
4759 -+ * the check for a %NULL value would be redundant.
4760 -+ */
4761 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4762 -+{
4763 -+ struct bfq_entity *entity = NULL;
4764 -+
4765 -+ if (node != NULL)
4766 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
4767 -+
4768 -+ return entity;
4769 -+}
4770 -+
4771 -+/**
4772 -+ * bfq_extract - remove an entity from a tree.
4773 -+ * @root: the tree root.
4774 -+ * @entity: the entity to remove.
4775 -+ */
4776 -+static inline void bfq_extract(struct rb_root *root,
4777 -+ struct bfq_entity *entity)
4778 -+{
4779 -+ BUG_ON(entity->tree != root);
4780 -+
4781 -+ entity->tree = NULL;
4782 -+ rb_erase(&entity->rb_node, root);
4783 -+}
4784 -+
4785 -+/**
4786 -+ * bfq_idle_extract - extract an entity from the idle tree.
4787 -+ * @st: the service tree of the owning @entity.
4788 -+ * @entity: the entity being removed.
4789 -+ */
4790 -+static void bfq_idle_extract(struct bfq_service_tree *st,
4791 -+ struct bfq_entity *entity)
4792 -+{
4793 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4794 -+ struct rb_node *next;
4795 -+
4796 -+ BUG_ON(entity->tree != &st->idle);
4797 -+
4798 -+ if (entity == st->first_idle) {
4799 -+ next = rb_next(&entity->rb_node);
4800 -+ st->first_idle = bfq_entity_of(next);
4801 -+ }
4802 -+
4803 -+ if (entity == st->last_idle) {
4804 -+ next = rb_prev(&entity->rb_node);
4805 -+ st->last_idle = bfq_entity_of(next);
4806 -+ }
4807 -+
4808 -+ bfq_extract(&st->idle, entity);
4809 -+
4810 -+ if (bfqq != NULL)
4811 -+ list_del(&bfqq->bfqq_list);
4812 -+}
4813 -+
4814 -+/**
4815 -+ * bfq_insert - generic tree insertion.
4816 -+ * @root: tree root.
4817 -+ * @entity: entity to insert.
4818 -+ *
4819 -+ * This is used for the idle and the active tree, since they are both
4820 -+ * ordered by finish time.
4821 -+ */
4822 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4823 -+{
4824 -+ struct bfq_entity *entry;
4825 -+ struct rb_node **node = &root->rb_node;
4826 -+ struct rb_node *parent = NULL;
4827 -+
4828 -+ BUG_ON(entity->tree != NULL);
4829 -+
4830 -+ while (*node != NULL) {
4831 -+ parent = *node;
4832 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
4833 -+
4834 -+ if (bfq_gt(entry->finish, entity->finish))
4835 -+ node = &parent->rb_left;
4836 -+ else
4837 -+ node = &parent->rb_right;
4838 -+ }
4839 -+
4840 -+ rb_link_node(&entity->rb_node, parent, node);
4841 -+ rb_insert_color(&entity->rb_node, root);
4842 -+
4843 -+ entity->tree = root;
4844 -+}
4845 -+
4846 -+/**
4847 -+ * bfq_update_min - update the min_start field of an entity.
4848 -+ * @entity: the entity to update.
4849 -+ * @node: one of its children.
4850 -+ *
4851 -+ * This function is called when @entity may store an invalid value for
4852 -+ * min_start due to updates to the active tree. The function assumes
4853 -+ * that the subtree rooted at @node (which may be its left or its right
4854 -+ * child) has a valid min_start value.
4855 -+ */
4856 -+static inline void bfq_update_min(struct bfq_entity *entity,
4857 -+ struct rb_node *node)
4858 -+{
4859 -+ struct bfq_entity *child;
4860 -+
4861 -+ if (node != NULL) {
4862 -+ child = rb_entry(node, struct bfq_entity, rb_node);
4863 -+ if (bfq_gt(entity->min_start, child->min_start))
4864 -+ entity->min_start = child->min_start;
4865 -+ }
4866 -+}
4867 -+
4868 -+/**
4869 -+ * bfq_update_active_node - recalculate min_start.
4870 -+ * @node: the node to update.
4871 -+ *
4872 -+ * @node may have changed position or one of its children may have moved,
4873 -+ * this function updates its min_start value. The left and right subtrees
4874 -+ * are assumed to hold a correct min_start value.
4875 -+ */
4876 -+static inline void bfq_update_active_node(struct rb_node *node)
4877 -+{
4878 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4879 -+
4880 -+ entity->min_start = entity->start;
4881 -+ bfq_update_min(entity, node->rb_right);
4882 -+ bfq_update_min(entity, node->rb_left);
4883 -+}
4884 -+
4885 -+/**
4886 -+ * bfq_update_active_tree - update min_start for the whole active tree.
4887 -+ * @node: the starting node.
4888 -+ *
4889 -+ * @node must be the deepest modified node after an update. This function
4890 -+ * updates its min_start using the values held by its children, assuming
4891 -+ * that they did not change, and then updates all the nodes that may have
4892 -+ * changed in the path to the root. The only nodes that may have changed
4893 -+ * are the ones in the path or their siblings.
4894 -+ */
4895 -+static void bfq_update_active_tree(struct rb_node *node)
4896 -+{
4897 -+ struct rb_node *parent;
4898 -+
4899 -+up:
4900 -+ bfq_update_active_node(node);
4901 -+
4902 -+ parent = rb_parent(node);
4903 -+ if (parent == NULL)
4904 -+ return;
4905 -+
4906 -+ if (node == parent->rb_left && parent->rb_right != NULL)
4907 -+ bfq_update_active_node(parent->rb_right);
4908 -+ else if (parent->rb_left != NULL)
4909 -+ bfq_update_active_node(parent->rb_left);
4910 -+
4911 -+ node = parent;
4912 -+ goto up;
4913 -+}
4914 -+
4915 -+/**
4916 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
4917 -+ * @st: the service tree of the entity.
4918 -+ * @entity: the entity being inserted.
4919 -+ *
4920 -+ * The active tree is ordered by finish time, but an extra key is kept
4921 -+ * in each node, containing the minimum value for the start times of
4922 -+ * its children (and the node itself), so it's possible to search for
4923 -+ * the eligible node with the lowest finish time in logarithmic time.
4924 -+ */
4925 -+static void bfq_active_insert(struct bfq_service_tree *st,
4926 -+ struct bfq_entity *entity)
4927 -+{
4928 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4929 -+ struct rb_node *node = &entity->rb_node;
4930 -+
4931 -+ bfq_insert(&st->active, entity);
4932 -+
4933 -+ if (node->rb_left != NULL)
4934 -+ node = node->rb_left;
4935 -+ else if (node->rb_right != NULL)
4936 -+ node = node->rb_right;
4937 -+
4938 -+ bfq_update_active_tree(node);
4939 -+
4940 -+ if (bfqq != NULL)
4941 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4942 -+}
4943 -+
4944 -+/**
4945 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4946 -+ * @ioprio: the ioprio value to convert.
4947 -+ */
4948 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
4949 -+{
4950 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4951 -+ return IOPRIO_BE_NR - ioprio;
4952 -+}
4953 -+
4954 -+/**
4955 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4956 -+ * @weight: the weight value to convert.
4957 -+ *
4958 -+ * To preserve as much as possible the old only-ioprio user interface,
4959 -+ * 0 is used as an escape ioprio value for weights (numerically) equal
4960 -+ * to or larger than IOPRIO_BE_NR.
4961 -+ */
4962 -+static unsigned short bfq_weight_to_ioprio(int weight)
4963 -+{
4964 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4965 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4966 -+}
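A quick standalone sketch (not part of the patch) of how the two mappings above behave, assuming the kernel's IOPRIO_BE_NR value of 8; the function names are local to the example:

#include <stdio.h>

#define IOPRIO_BE_NR 8 /* number of best-effort ioprio levels */

static int ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

static int weight_to_ioprio(int weight)
{
	/* Weights at or above IOPRIO_BE_NR map to the escape ioprio 0. */
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %d -> ioprio %d\n", ioprio,
		       ioprio_to_weight(ioprio),
		       weight_to_ioprio(ioprio_to_weight(ioprio)));

	/* A cgroup-style weight of 100 has no ioprio equivalent. */
	printf("weight 100 -> ioprio %d\n", weight_to_ioprio(100));
	return 0;
}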
4967 -+
4968 -+static inline void bfq_get_entity(struct bfq_entity *entity)
4969 -+{
4970 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4971 -+ struct bfq_sched_data *sd;
4972 -+
4973 -+ if (bfqq != NULL) {
4974 -+ sd = entity->sched_data;
4975 -+ atomic_inc(&bfqq->ref);
4976 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4977 -+ bfqq, atomic_read(&bfqq->ref));
4978 -+ }
4979 -+}
4980 -+
4981 -+/**
4982 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
4983 -+ * @node: the node being removed.
4984 -+ *
4985 -+ * Do the first step of an extraction in an rb tree, looking for the
4986 -+ * node that will replace @node, and returning the deepest node that
4987 -+ * the following modifications to the tree can touch. If @node is the
4988 -+ * last node in the tree return %NULL.
4989 -+ */
4990 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
4991 -+{
4992 -+ struct rb_node *deepest;
4993 -+
4994 -+ if (node->rb_right == NULL && node->rb_left == NULL)
4995 -+ deepest = rb_parent(node);
4996 -+ else if (node->rb_right == NULL)
4997 -+ deepest = node->rb_left;
4998 -+ else if (node->rb_left == NULL)
4999 -+ deepest = node->rb_right;
5000 -+ else {
5001 -+ deepest = rb_next(node);
5002 -+ if (deepest->rb_right != NULL)
5003 -+ deepest = deepest->rb_right;
5004 -+ else if (rb_parent(deepest) != node)
5005 -+ deepest = rb_parent(deepest);
5006 -+ }
5007 -+
5008 -+ return deepest;
5009 -+}
5010 -+
5011 -+/**
5012 -+ * bfq_active_extract - remove an entity from the active tree.
5013 -+ * @st: the service_tree containing the tree.
5014 -+ * @entity: the entity being removed.
5015 -+ */
5016 -+static void bfq_active_extract(struct bfq_service_tree *st,
5017 -+ struct bfq_entity *entity)
5018 -+{
5019 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5020 -+ struct rb_node *node;
5021 -+
5022 -+ node = bfq_find_deepest(&entity->rb_node);
5023 -+ bfq_extract(&st->active, entity);
5024 -+
5025 -+ if (node != NULL)
5026 -+ bfq_update_active_tree(node);
5027 -+
5028 -+ if (bfqq != NULL)
5029 -+ list_del(&bfqq->bfqq_list);
5030 -+}
5031 -+
5032 -+/**
5033 -+ * bfq_idle_insert - insert an entity into the idle tree.
5034 -+ * @st: the service tree containing the tree.
5035 -+ * @entity: the entity to insert.
5036 -+ */
5037 -+static void bfq_idle_insert(struct bfq_service_tree *st,
5038 -+ struct bfq_entity *entity)
5039 -+{
5040 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5041 -+ struct bfq_entity *first_idle = st->first_idle;
5042 -+ struct bfq_entity *last_idle = st->last_idle;
5043 -+
5044 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
5045 -+ st->first_idle = entity;
5046 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
5047 -+ st->last_idle = entity;
5048 -+
5049 -+ bfq_insert(&st->idle, entity);
5050 -+
5051 -+ if (bfqq != NULL)
5052 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
5053 -+}
5054 -+
5055 -+/**
5056 -+ * bfq_forget_entity - remove an entity from the wfq trees.
5057 -+ * @st: the service tree.
5058 -+ * @entity: the entity being removed.
5059 -+ *
5060 -+ * Update the device status and forget everything about @entity, putting
5061 -+ * the device reference to it, if it is a queue. Entities belonging to
5062 -+ * groups are not refcounted.
5063 -+ */
5064 -+static void bfq_forget_entity(struct bfq_service_tree *st,
5065 -+ struct bfq_entity *entity)
5066 -+{
5067 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5068 -+ struct bfq_sched_data *sd;
5069 -+
5070 -+ BUG_ON(!entity->on_st);
5071 -+
5072 -+ entity->on_st = 0;
5073 -+ st->wsum -= entity->weight;
5074 -+ if (bfqq != NULL) {
5075 -+ sd = entity->sched_data;
5076 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
5077 -+ bfqq, atomic_read(&bfqq->ref));
5078 -+ bfq_put_queue(bfqq);
5079 -+ }
5080 -+}
5081 -+
5082 -+/**
5083 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
5084 -+ * @st: service tree for the entity.
5085 -+ * @entity: the entity being released.
5086 -+ */
5087 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
5088 -+ struct bfq_entity *entity)
5089 -+{
5090 -+ bfq_idle_extract(st, entity);
5091 -+ bfq_forget_entity(st, entity);
5092 -+}
5093 -+
5094 -+/**
5095 -+ * bfq_forget_idle - update the idle tree if necessary.
5096 -+ * @st: the service tree to act upon.
5097 -+ *
5098 -+ * To preserve the global O(log N) complexity we only remove one entry here;
5099 -+ * as the idle tree will not grow indefinitely this can be done safely.
5100 -+ */
5101 -+static void bfq_forget_idle(struct bfq_service_tree *st)
5102 -+{
5103 -+ struct bfq_entity *first_idle = st->first_idle;
5104 -+ struct bfq_entity *last_idle = st->last_idle;
5105 -+
5106 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
5107 -+ !bfq_gt(last_idle->finish, st->vtime)) {
5108 -+ /*
5109 -+ * Forget the whole idle tree, increasing the vtime past
5110 -+ * the last finish time of idle entities.
5111 -+ */
5112 -+ st->vtime = last_idle->finish;
5113 -+ }
5114 -+
5115 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
5116 -+ bfq_put_idle_entity(st, first_idle);
5117 -+}
5118 -+
5119 -+static struct bfq_service_tree *
5120 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
5121 -+ struct bfq_entity *entity)
5122 -+{
5123 -+ struct bfq_service_tree *new_st = old_st;
5124 -+
5125 -+ if (entity->ioprio_changed) {
5126 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5127 -+
5128 -+ BUG_ON(old_st->wsum < entity->weight);
5129 -+ old_st->wsum -= entity->weight;
5130 -+
5131 -+ if (entity->new_weight != entity->orig_weight) {
5132 -+ entity->orig_weight = entity->new_weight;
5133 -+ entity->ioprio =
5134 -+ bfq_weight_to_ioprio(entity->orig_weight);
5135 -+ } else if (entity->new_ioprio != entity->ioprio) {
5136 -+ entity->ioprio = entity->new_ioprio;
5137 -+ entity->orig_weight =
5138 -+ bfq_ioprio_to_weight(entity->ioprio);
5139 -+ } else
5140 -+ entity->new_weight = entity->orig_weight =
5141 -+ bfq_ioprio_to_weight(entity->ioprio);
5142 -+
5143 -+ entity->ioprio_class = entity->new_ioprio_class;
5144 -+ entity->ioprio_changed = 0;
5145 -+
5146 -+ /*
5147 -+ * NOTE: here we may be changing the weight too early,
5148 -+ * this will cause unfairness. The correct approach
5149 -+ * would have required additional complexity to defer
5150 -+ * weight changes to the proper time instants (i.e.,
5151 -+ * when entity->finish <= old_st->vtime).
5152 -+ */
5153 -+ new_st = bfq_entity_service_tree(entity);
5154 -+ entity->weight = entity->orig_weight *
5155 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
5156 -+ new_st->wsum += entity->weight;
5157 -+
5158 -+ if (new_st != old_st)
5159 -+ entity->start = new_st->vtime;
5160 -+ }
5161 -+
5162 -+ return new_st;
5163 -+}
5164 -+
5165 -+/**
5166 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
5167 -+ * @bfqq: the queue being served.
5168 -+ * @served: bytes to transfer.
5169 -+ *
5170 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
5171 -+ * are synchronized every time a new bfqq is selected for service. By now,
5172 -+ * we keep it to better check consistency.
5173 -+ */
5174 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
5175 -+{
5176 -+ struct bfq_entity *entity = &bfqq->entity;
5177 -+ struct bfq_service_tree *st;
5178 -+
5179 -+ for_each_entity(entity) {
5180 -+ st = bfq_entity_service_tree(entity);
5181 -+
5182 -+ entity->service += served;
5183 -+ BUG_ON(entity->service > entity->budget);
5184 -+ BUG_ON(st->wsum == 0);
5185 -+
5186 -+ st->vtime += bfq_delta(served, st->wsum);
5187 -+ bfq_forget_idle(st);
5188 -+ }
5189 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
5190 -+}
5191 -+
5192 -+/**
5193 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
5194 -+ * @bfqq: the queue that needs a service update.
5195 -+ *
5196 -+ * When it's not possible to be fair in the service domain, because
5197 -+ * a queue is not consuming its budget fast enough (the meaning of
5198 -+ * fast depends on the timeout parameter), we charge it a full
5199 -+ * budget. In this way we should obtain a sort of time-domain
5200 -+ * fairness among all the seeky/slow queues.
5201 -+ */
5202 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5203 -+{
5204 -+ struct bfq_entity *entity = &bfqq->entity;
5205 -+
5206 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5207 -+
5208 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5209 -+}
5210 -+
5211 -+/**
5212 -+ * __bfq_activate_entity - activate an entity.
5213 -+ * @entity: the entity being activated.
5214 -+ *
5215 -+ * Called whenever an entity is activated, i.e., it is not active and one
5216 -+ * of its children receives a new request, or has to be reactivated due to
5217 -+ * budget exhaustion. It uses the current budget of the entity (and the
5218 -+ * service received if @entity is active) of the queue to calculate its
5219 -+ * timestamps.
5220 -+ */
5221 -+static void __bfq_activate_entity(struct bfq_entity *entity)
5222 -+{
5223 -+ struct bfq_sched_data *sd = entity->sched_data;
5224 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5225 -+
5226 -+ if (entity == sd->active_entity) {
5227 -+ BUG_ON(entity->tree != NULL);
5228 -+ /*
5229 -+ * If we are requeueing the current entity we have
5230 -+ * to take care of not charging to it service it has
5231 -+ * not received.
5232 -+ */
5233 -+ bfq_calc_finish(entity, entity->service);
5234 -+ entity->start = entity->finish;
5235 -+ sd->active_entity = NULL;
5236 -+ } else if (entity->tree == &st->active) {
5237 -+ /*
5238 -+ * Requeueing an entity due to a change of some
5239 -+ * next_active entity below it. We reuse the old
5240 -+ * start time.
5241 -+ */
5242 -+ bfq_active_extract(st, entity);
5243 -+ } else if (entity->tree == &st->idle) {
5244 -+ /*
5245 -+ * Must be on the idle tree, bfq_idle_extract() will
5246 -+ * check for that.
5247 -+ */
5248 -+ bfq_idle_extract(st, entity);
5249 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
5250 -+ st->vtime : entity->finish;
5251 -+ } else {
5252 -+ /*
5253 -+ * The finish time of the entity may be invalid, and
5254 -+ * it is in the past for sure, otherwise the queue
5255 -+ * would have been on the idle tree.
5256 -+ */
5257 -+ entity->start = st->vtime;
5258 -+ st->wsum += entity->weight;
5259 -+ bfq_get_entity(entity);
5260 -+
5261 -+ BUG_ON(entity->on_st);
5262 -+ entity->on_st = 1;
5263 -+ }
5264 -+
5265 -+ st = __bfq_entity_update_weight_prio(st, entity);
5266 -+ bfq_calc_finish(entity, entity->budget);
5267 -+ bfq_active_insert(st, entity);
5268 -+}
5269 -+
5270 -+/**
5271 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5272 -+ * @entity: the entity to activate.
5273 -+ *
5274 -+ * Activate @entity and all the entities on the path from it to the root.
5275 -+ */
5276 -+static void bfq_activate_entity(struct bfq_entity *entity)
5277 -+{
5278 -+ struct bfq_sched_data *sd;
5279 -+
5280 -+ for_each_entity(entity) {
5281 -+ __bfq_activate_entity(entity);
5282 -+
5283 -+ sd = entity->sched_data;
5284 -+ if (!bfq_update_next_active(sd))
5285 -+ /*
5286 -+ * No need to propagate the activation to the
5287 -+ * upper entities, as they will be updated when
5288 -+ * the active entity is rescheduled.
5289 -+ */
5290 -+ break;
5291 -+ }
5292 -+}
5293 -+
5294 -+/**
5295 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5296 -+ * @entity: the entity to deactivate.
5297 -+ * @requeue: if false, the entity will not be put into the idle tree.
5298 -+ *
5299 -+ * Deactivate an entity, independently from its previous state. If the
5300 -+ * entity was not on a service tree just return, otherwise if it is on
5301 -+ * any scheduler tree, extract it from that tree, and if necessary
5302 -+ * and if the caller did not specify @requeue, put it on the idle tree.
5303 -+ *
5304 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
5305 -+ * if the entity was under service or if it was the next_active for
5306 -+ * its sched_data; return %0 otherwise.
5307 -+ */
5308 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5309 -+{
5310 -+ struct bfq_sched_data *sd = entity->sched_data;
5311 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5312 -+ int was_active = entity == sd->active_entity;
5313 -+ int ret = 0;
5314 -+
5315 -+ if (!entity->on_st)
5316 -+ return 0;
5317 -+
5318 -+ BUG_ON(was_active && entity->tree != NULL);
5319 -+
5320 -+ if (was_active) {
5321 -+ bfq_calc_finish(entity, entity->service);
5322 -+ sd->active_entity = NULL;
5323 -+ } else if (entity->tree == &st->active)
5324 -+ bfq_active_extract(st, entity);
5325 -+ else if (entity->tree == &st->idle)
5326 -+ bfq_idle_extract(st, entity);
5327 -+ else if (entity->tree != NULL)
5328 -+ BUG();
5329 -+
5330 -+ if (was_active || sd->next_active == entity)
5331 -+ ret = bfq_update_next_active(sd);
5332 -+
5333 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5334 -+ bfq_forget_entity(st, entity);
5335 -+ else
5336 -+ bfq_idle_insert(st, entity);
5337 -+
5338 -+ BUG_ON(sd->active_entity == entity);
5339 -+ BUG_ON(sd->next_active == entity);
5340 -+
5341 -+ return ret;
5342 -+}
5343 -+
5344 -+/**
5345 -+ * bfq_deactivate_entity - deactivate an entity.
5346 -+ * @entity: the entity to deactivate.
5347 -+ * @requeue: true if the entity can be put on the idle tree
5348 -+ */
5349 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5350 -+{
5351 -+ struct bfq_sched_data *sd;
5352 -+ struct bfq_entity *parent;
5353 -+
5354 -+ for_each_entity_safe(entity, parent) {
5355 -+ sd = entity->sched_data;
5356 -+
5357 -+ if (!__bfq_deactivate_entity(entity, requeue))
5358 -+ /*
5359 -+ * The parent entity is still backlogged, and
5360 -+ * we don't need to update it as it is still
5361 -+ * under service.
5362 -+ */
5363 -+ break;
5364 -+
5365 -+ if (sd->next_active != NULL)
5366 -+ /*
5367 -+ * The parent entity is still backlogged and
5368 -+ * the budgets on the path towards the root
5369 -+ * need to be updated.
5370 -+ */
5371 -+ goto update;
5372 -+
5373 -+ /*
5374 -+ * If we reach this point, the parent is no longer backlogged and
5375 -+ * we want to propagate the dequeue upwards.
5376 -+ */
5377 -+ requeue = 1;
5378 -+ }
5379 -+
5380 -+ return;
5381 -+
5382 -+update:
5383 -+ entity = parent;
5384 -+ for_each_entity(entity) {
5385 -+ __bfq_activate_entity(entity);
5386 -+
5387 -+ sd = entity->sched_data;
5388 -+ if (!bfq_update_next_active(sd))
5389 -+ break;
5390 -+ }
5391 -+}
5392 -+
5393 -+/**
5394 -+ * bfq_update_vtime - update vtime if necessary.
5395 -+ * @st: the service tree to act upon.
5396 -+ *
5397 -+ * If necessary update the service tree vtime to have at least one
5398 -+ * eligible entity, skipping to its start time. Assumes that the
5399 -+ * active tree of the device is not empty.
5400 -+ *
5401 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
5402 -+ * we may end up with reactivated tasks getting timestamps after a
5403 -+ * vtime skip done because we needed a ->first_active entity on some
5404 -+ * intermediate node.
5405 -+ */
5406 -+static void bfq_update_vtime(struct bfq_service_tree *st)
5407 -+{
5408 -+ struct bfq_entity *entry;
5409 -+ struct rb_node *node = st->active.rb_node;
5410 -+
5411 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5412 -+ if (bfq_gt(entry->min_start, st->vtime)) {
5413 -+ st->vtime = entry->min_start;
5414 -+ bfq_forget_idle(st);
5415 -+ }
5416 -+}
5417 -+
5418 -+/**
5419 -+ * bfq_first_active - find the eligible entity with the smallest finish time
5420 -+ * @st: the service tree to select from.
5421 -+ *
5422 -+ * This function searches the first schedulable entity, starting from the
5423 -+ * root of the tree and going on the left every time on this side there is
5424 -+ * a subtree with at least one eligible (start >= vtime) entity. The path
5425 -+ * on the right is followed only if a) the left subtree contains no eligible
5426 -+ * entities and b) no eligible entity has been found yet.
5427 -+ */
5428 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5429 -+{
5430 -+ struct bfq_entity *entry, *first = NULL;
5431 -+ struct rb_node *node = st->active.rb_node;
5432 -+
5433 -+ while (node != NULL) {
5434 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5435 -+left:
5436 -+ if (!bfq_gt(entry->start, st->vtime))
5437 -+ first = entry;
5438 -+
5439 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5440 -+
5441 -+ if (node->rb_left != NULL) {
5442 -+ entry = rb_entry(node->rb_left,
5443 -+ struct bfq_entity, rb_node);
5444 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
5445 -+ node = node->rb_left;
5446 -+ goto left;
5447 -+ }
5448 -+ }
5449 -+ if (first != NULL)
5450 -+ break;
5451 -+ node = node->rb_right;
5452 -+ }
5453 -+
5454 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5455 -+ return first;
5456 -+}
5457 -+
5458 -+/**
5459 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5460 -+ * @st: the service tree.
5461 -+ *
5462 -+ * Update the virtual time in @st and return the first eligible entity
5463 -+ * it contains.
5464 -+ */
5465 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5466 -+ bool force)
5467 -+{
5468 -+ struct bfq_entity *entity, *new_next_active = NULL;
5469 -+
5470 -+ if (RB_EMPTY_ROOT(&st->active))
5471 -+ return NULL;
5472 -+
5473 -+ bfq_update_vtime(st);
5474 -+ entity = bfq_first_active_entity(st);
5475 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
5476 -+
5477 -+ /*
5478 -+ * If the chosen entity does not match with the sched_data's
5479 -+ * next_active and we are forcedly serving the IDLE priority
5480 -+ * class tree, bubble up budget update.
5481 -+ */
5482 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
5483 -+ new_next_active = entity;
5484 -+ for_each_entity(new_next_active)
5485 -+ bfq_update_budget(new_next_active);
5486 -+ }
5487 -+
5488 -+ return entity;
5489 -+}
5490 -+
5491 -+/**
5492 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5493 -+ * @sd: the sched_data.
5494 -+ * @extract: if true the returned entity will be also extracted from @sd.
5495 -+ *
5496 -+ * NOTE: since we cache the next_active entity at each level of the
5497 -+ * hierarchy, the complexity of the lookup can be decreased with
5498 -+ * absolutely no effort just returning the cached next_active value;
5499 -+ * we prefer to do full lookups to test the consistency of the data
5500 -+ * structures.
5501 -+ */
5502 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5503 -+ int extract,
5504 -+ struct bfq_data *bfqd)
5505 -+{
5506 -+ struct bfq_service_tree *st = sd->service_tree;
5507 -+ struct bfq_entity *entity;
5508 -+ int i = 0;
5509 -+
5510 -+ BUG_ON(sd->active_entity != NULL);
5511 -+
5512 -+ if (bfqd != NULL &&
5513 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5514 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
5515 -+ true);
5516 -+ if (entity != NULL) {
5517 -+ i = BFQ_IOPRIO_CLASSES - 1;
5518 -+ bfqd->bfq_class_idle_last_service = jiffies;
5519 -+ sd->next_active = entity;
5520 -+ }
5521 -+ }
5522 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5523 -+ entity = __bfq_lookup_next_entity(st + i, false);
5524 -+ if (entity != NULL) {
5525 -+ if (extract) {
5526 -+ bfq_check_next_active(sd, entity);
5527 -+ bfq_active_extract(st + i, entity);
5528 -+ sd->active_entity = entity;
5529 -+ sd->next_active = NULL;
5530 -+ }
5531 -+ break;
5532 -+ }
5533 -+ }
5534 -+
5535 -+ return entity;
5536 -+}
5537 -+
5538 -+/*
5539 -+ * Get next queue for service.
5540 -+ */
5541 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5542 -+{
5543 -+ struct bfq_entity *entity = NULL;
5544 -+ struct bfq_sched_data *sd;
5545 -+ struct bfq_queue *bfqq;
5546 -+
5547 -+ BUG_ON(bfqd->in_service_queue != NULL);
5548 -+
5549 -+ if (bfqd->busy_queues == 0)
5550 -+ return NULL;
5551 -+
5552 -+ sd = &bfqd->root_group->sched_data;
5553 -+ for (; sd != NULL; sd = entity->my_sched_data) {
5554 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5555 -+ BUG_ON(entity == NULL);
5556 -+ entity->service = 0;
5557 -+ }
5558 -+
5559 -+ bfqq = bfq_entity_to_bfqq(entity);
5560 -+ BUG_ON(bfqq == NULL);
5561 -+
5562 -+ return bfqq;
5563 -+}
5564 -+
5565 -+/*
5566 -+ * Forced extraction of the given queue.
5567 -+ */
5568 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5569 -+ struct bfq_queue *bfqq)
5570 -+{
5571 -+ struct bfq_entity *entity;
5572 -+ struct bfq_sched_data *sd;
5573 -+
5574 -+ BUG_ON(bfqd->in_service_queue != NULL);
5575 -+
5576 -+ entity = &bfqq->entity;
5577 -+ /*
5578 -+ * Bubble up extraction/update from the leaf to the root.
5579 -+ */
5580 -+ for_each_entity(entity) {
5581 -+ sd = entity->sched_data;
5582 -+ bfq_update_budget(entity);
5583 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
5584 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5585 -+ sd->active_entity = entity;
5586 -+ sd->next_active = NULL;
5587 -+ entity->service = 0;
5588 -+ }
5589 -+
5590 -+ return;
5591 -+}
5592 -+
5593 -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
5594 -+{
5595 -+ if (bfqd->in_service_bic != NULL) {
5596 -+ put_io_context(bfqd->in_service_bic->icq.ioc);
5597 -+ bfqd->in_service_bic = NULL;
5598 -+ }
5599 -+
5600 -+ bfqd->in_service_queue = NULL;
5601 -+ del_timer(&bfqd->idle_slice_timer);
5602 -+}
5603 -+
5604 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5605 -+ int requeue)
5606 -+{
5607 -+ struct bfq_entity *entity = &bfqq->entity;
5608 -+
5609 -+ if (bfqq == bfqd->in_service_queue)
5610 -+ __bfq_bfqd_reset_in_service(bfqd);
5611 -+
5612 -+ bfq_deactivate_entity(entity, requeue);
5613 -+}
5614 -+
5615 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5616 -+{
5617 -+ struct bfq_entity *entity = &bfqq->entity;
5618 -+
5619 -+ bfq_activate_entity(entity);
5620 -+}
5621 -+
5622 -+/*
5623 -+ * Called when the bfqq no longer has requests pending, remove it from
5624 -+ * the service tree.
5625 -+ */
5626 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5627 -+ int requeue)
5628 -+{
5629 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
5630 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5631 -+
5632 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5633 -+
5634 -+ bfq_clear_bfqq_busy(bfqq);
5635 -+
5636 -+ BUG_ON(bfqd->busy_queues == 0);
5637 -+ bfqd->busy_queues--;
5638 -+ if (bfqq->raising_coeff > 1)
5639 -+ bfqd->raised_busy_queues--;
5640 -+
5641 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5642 -+}
5643 -+
5644 -+/*
5645 -+ * Called when an inactive queue receives a new request.
5646 -+ */
5647 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5648 -+{
5649 -+ BUG_ON(bfq_bfqq_busy(bfqq));
5650 -+ BUG_ON(bfqq == bfqd->in_service_queue);
5651 -+
5652 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5653 -+
5654 -+ bfq_activate_bfqq(bfqd, bfqq);
5655 -+
5656 -+ bfq_mark_bfqq_busy(bfqq);
5657 -+ bfqd->busy_queues++;
5658 -+ if (bfqq->raising_coeff > 1)
5659 -+ bfqd->raised_busy_queues++;
5660 -+}
5661 -diff --git a/block/bfq.h b/block/bfq.h
5662 -new file mode 100644
5663 -index 0000000..68b28e3
5664 ---- /dev/null
5665 -+++ b/block/bfq.h
5666 -@@ -0,0 +1,614 @@
5667 -+/*
5668 -+ * BFQ-v7 for 3.13.0: data structures and common functions prototypes.
5669 -+ *
5670 -+ * Based on ideas and code from CFQ:
5671 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5672 -+ *
5673 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5674 -+ * Paolo Valente <paolo.valente@×××××××.it>
5675 -+ *
5676 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5677 -+ */
5678 -+
5679 -+#ifndef _BFQ_H
5680 -+#define _BFQ_H
5681 -+
5682 -+#include <linux/blktrace_api.h>
5683 -+#include <linux/hrtimer.h>
5684 -+#include <linux/ioprio.h>
5685 -+#include <linux/rbtree.h>
5686 -+
5687 -+#define BFQ_IOPRIO_CLASSES 3
5688 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5689 -+
5690 -+#define BFQ_MIN_WEIGHT 1
5691 -+#define BFQ_MAX_WEIGHT 1000
5692 -+
5693 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
5694 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
5695 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5696 -+
5697 -+struct bfq_entity;
5698 -+
5699 -+/**
5700 -+ * struct bfq_service_tree - per ioprio_class service tree.
5701 -+ * @active: tree for active entities (i.e., those backlogged).
5702 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5703 -+ * @first_idle: idle entity with minimum F_i.
5704 -+ * @last_idle: idle entity with maximum F_i.
5705 -+ * @vtime: scheduler virtual time.
5706 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5707 -+ *
5708 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5709 -+ * ioprio_class has its own independent scheduler, and so its own
5710 -+ * bfq_service_tree. All the fields are protected by the queue lock
5711 -+ * of the containing bfqd.
5712 -+ */
5713 -+struct bfq_service_tree {
5714 -+ struct rb_root active;
5715 -+ struct rb_root idle;
5716 -+
5717 -+ struct bfq_entity *first_idle;
5718 -+ struct bfq_entity *last_idle;
5719 -+
5720 -+ u64 vtime;
5721 -+ unsigned long wsum;
5722 -+};
5723 -+
5724 -+/**
5725 -+ * struct bfq_sched_data - multi-class scheduler.
5726 -+ * @active_entity: entity under service.
5727 -+ * @next_active: head-of-the-line entity in the scheduler.
5728 -+ * @service_tree: array of service trees, one per ioprio_class.
5729 -+ *
5730 -+ * bfq_sched_data is the basic scheduler queue. It supports three
5731 -+ * ioprio_classes, and can be used either as a toplevel queue or as
5732 -+ * an intermediate queue on a hierarchical setup.
5733 -+ * @next_active points to the active entity of the sched_data service
5734 -+ * trees that will be scheduled next.
5735 -+ *
5736 -+ * The supported ioprio_classes are the same as in CFQ, in descending
5737 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5738 -+ * Requests from higher priority queues are served before all the
5739 -+ * requests from lower priority queues; among requests of the same
5740 -+ * queue requests are served according to B-WF2Q+.
5741 -+ * All the fields are protected by the queue lock of the containing bfqd.
5742 -+ */
5743 -+struct bfq_sched_data {
5744 -+ struct bfq_entity *active_entity;
5745 -+ struct bfq_entity *next_active;
5746 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5747 -+};
5748 -+
5749 -+/**
5750 -+ * struct bfq_entity - schedulable entity.
5751 -+ * @rb_node: service_tree member.
5752 -+ * @on_st: flag, true if the entity is on a tree (either the active or
5753 -+ * the idle one of its service_tree).
5754 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5755 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
5756 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5757 -+ * @min_start: minimum start time of the (active) subtree rooted at
5758 -+ * this entity; used for O(log N) lookups into active trees.
5759 -+ * @service: service received during the last round of service.
5760 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5761 -+ * @weight: weight of the queue
5762 -+ * @parent: parent entity, for hierarchical scheduling.
5763 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5764 -+ * associated scheduler queue, %NULL on leaf nodes.
5765 -+ * @sched_data: the scheduler queue this entity belongs to.
5766 -+ * @ioprio: the ioprio in use.
5767 -+ * @new_weight: when a weight change is requested, the new weight value.
5768 -+ * @orig_weight: original weight, used to implement weight boosting
5769 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5770 -+ * @ioprio_class: the ioprio_class in use.
5771 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
5772 -+ * ioprio_class value.
5773 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5774 -+ * ioprio_class change.
5775 -+ *
5776 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5777 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5778 -+ * entity belongs to the sched_data of the parent group in the cgroup
5779 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
5780 -+ * in @my_sched_data.
5781 -+ *
5782 -+ * Each entity stores independently its priority values; this would
5783 -+ * allow different weights on different devices, but this
5784 -+ * functionality is not exported to userspace by now. Priorities and
5785 -+ * weights are updated lazily, first storing the new values into the
5786 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
5787 -+ * there is a transition in the entity state that allows the priority
5788 -+ * update to take place the effective and the requested priority
5789 -+ * values are synchronized.
5790 -+ *
5791 -+ * Unless cgroups are used, the weight value is calculated from the
5792 -+ * ioprio to export the same interface as CFQ. When dealing with
5793 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
5794 -+ * time to consume their budget and have true sequential behavior, and
5795 -+ * when there are no external factors breaking anticipation) the
5796 -+ * relative weights at each level of the cgroups hierarchy should be
5797 -+ * guaranteed. All the fields are protected by the queue lock of the
5798 -+ * containing bfqd.
5799 -+ */
5800 -+struct bfq_entity {
5801 -+ struct rb_node rb_node;
5802 -+
5803 -+ int on_st;
5804 -+
5805 -+ u64 finish;
5806 -+ u64 start;
5807 -+
5808 -+ struct rb_root *tree;
5809 -+
5810 -+ u64 min_start;
5811 -+
5812 -+ unsigned long service, budget;
5813 -+ unsigned short weight, new_weight;
5814 -+ unsigned short orig_weight;
5815 -+
5816 -+ struct bfq_entity *parent;
5817 -+
5818 -+ struct bfq_sched_data *my_sched_data;
5819 -+ struct bfq_sched_data *sched_data;
5820 -+
5821 -+ unsigned short ioprio, new_ioprio;
5822 -+ unsigned short ioprio_class, new_ioprio_class;
5823 -+
5824 -+ int ioprio_changed;
5825 -+};
5826 -+
5827 -+struct bfq_group;
5828 -+
5829 -+/**
5830 -+ * struct bfq_queue - leaf schedulable entity.
5831 -+ * @ref: reference counter.
5832 -+ * @bfqd: parent bfq_data.
5833 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
5834 -+ * one or more other queues.
5835 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5836 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5837 -+ * @sort_list: sorted list of pending requests.
5838 -+ * @next_rq: if fifo isn't expired, next request to serve.
5839 -+ * @queued: nr of requests queued in @sort_list.
5840 -+ * @allocated: currently allocated requests.
5841 -+ * @meta_pending: pending metadata requests.
5842 -+ * @fifo: fifo list of requests in sort_list.
5843 -+ * @entity: entity representing this queue in the scheduler.
5844 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
5845 -+ * @budget_timeout: budget expiration (in jiffies).
5846 -+ * @dispatched: number of requests on the dispatch list or inside driver.
5847 -+ * @org_ioprio: saved ioprio during boosted periods.
5848 -+ * @flags: status flags.
5849 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5850 -+ * @seek_samples: number of seeks sampled
5851 -+ * @seek_total: sum of the distances of the seeks sampled
5852 -+ * @seek_mean: mean seek distance
5853 -+ * @last_request_pos: position of the last request enqueued
5854 -+ * @pid: pid of the process owning the queue, used for logging purposes.
5855 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
5856 -+ * @raising_cur_max_time: current max raising time for this queue
5857 -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
5858 -+ * idle to backlogged
5859 -+ * @service_from_backlogged: cumulative service received from the @bfq_queue
5860 -+ * since the last transition from idle to backlogged
5861 -+ *
5862 -+ * A bfq_queue is a leaf request queue; it can be associated to an io_context
5863 -+ * or more (if it is an async one). @cgroup holds a reference to the
5864 -+ * cgroup, to be sure that it does not disappear while a bfqq still
5865 -+ * references it (mostly to avoid races between request issuing and task
5866 -+ * migration followed by cgroup destruction).
5867 -+ * All the fields are protected by the queue lock of the containing bfqd.
5868 -+ */
5869 -+struct bfq_queue {
5870 -+ atomic_t ref;
5871 -+ struct bfq_data *bfqd;
5872 -+
5873 -+ /* fields for cooperating queues handling */
5874 -+ struct bfq_queue *new_bfqq;
5875 -+ struct rb_node pos_node;
5876 -+ struct rb_root *pos_root;
5877 -+
5878 -+ struct rb_root sort_list;
5879 -+ struct request *next_rq;
5880 -+ int queued[2];
5881 -+ int allocated[2];
5882 -+ int meta_pending;
5883 -+ struct list_head fifo;
5884 -+
5885 -+ struct bfq_entity entity;
5886 -+
5887 -+ unsigned long max_budget;
5888 -+ unsigned long budget_timeout;
5889 -+
5890 -+ int dispatched;
5891 -+
5892 -+ unsigned short org_ioprio;
5893 -+
5894 -+ unsigned int flags;
5895 -+
5896 -+ struct list_head bfqq_list;
5897 -+
5898 -+ unsigned int seek_samples;
5899 -+ u64 seek_total;
5900 -+ sector_t seek_mean;
5901 -+ sector_t last_request_pos;
5902 -+
5903 -+ pid_t pid;
5904 -+
5905 -+ /* weight-raising fields */
5906 -+ unsigned int raising_cur_max_time;
5907 -+ unsigned long soft_rt_next_start;
5908 -+ u64 last_rais_start_finish;
5909 -+ unsigned int raising_coeff;
5910 -+ u64 last_idle_bklogged;
5911 -+ unsigned long service_from_backlogged;
5912 -+};
5913 -+
5914 -+/**
5915 -+ * struct bfq_ttime - per process thinktime stats.
5916 -+ * @ttime_total: total process thinktime
5917 -+ * @ttime_samples: number of thinktime samples
5918 -+ * @ttime_mean: average process thinktime
5919 -+ */
5920 -+struct bfq_ttime {
5921 -+ unsigned long last_end_request;
5922 -+
5923 -+ unsigned long ttime_total;
5924 -+ unsigned long ttime_samples;
5925 -+ unsigned long ttime_mean;
5926 -+};
5927 -+
5928 -+/**
5929 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
5930 -+ * @icq: associated io_cq structure
5931 -+ * @bfqq: array of two process queues, the sync and the async
5932 -+ * @ttime: associated @bfq_ttime struct
5933 -+ */
5934 -+struct bfq_io_cq {
5935 -+ struct io_cq icq; /* must be the first member */
5936 -+ struct bfq_queue *bfqq[2];
5937 -+ struct bfq_ttime ttime;
5938 -+ int ioprio;
5939 -+};
5940 -+
5941 -+/**
5942 -+ * struct bfq_data - per device data structure.
5943 -+ * @queue: request queue for the managed device.
5944 -+ * @root_group: root bfq_group for the device.
5945 -+ * @rq_pos_tree: rbtree sorted by next_request position,
5946 -+ * used when determining if two or more queues
5947 -+ * have interleaving requests (see bfq_close_cooperator).
5948 -+ * @busy_queues: number of bfq_queues containing requests (including the
5949 -+ * queue under service, even if it is idling).
5950 -+ * @raised_busy_queues: number of weight-raised busy bfq_queues.
5951 -+ * @queued: number of queued requests.
5952 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
5953 -+ * @sync_flight: number of sync requests in the driver.
5954 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5955 -+ * completed requests.
5956 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5957 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5958 -+ * @budgets_assigned: number of budgets assigned.
5959 -+ * @idle_slice_timer: timer set when idling for the next sequential request
5960 -+ * from the queue under service.
5961 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
5962 -+ * @in_service_queue: bfq_queue under service.
5963 -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
5964 -+ * @last_position: on-disk position of the last served request.
5965 -+ * @last_budget_start: beginning of the last budget.
5966 -+ * @last_idling_start: beginning of the last idle slice.
5967 -+ * @peak_rate: peak transfer rate observed for a budget.
5968 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5969 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5970 -+ * @group_list: list of all the bfq_groups active on the device.
5971 -+ * @active_list: list of all the bfq_queues active on the device.
5972 -+ * @idle_list: list of all the bfq_queues idle on the device.
5973 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
5974 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5975 -+ * requests are served in fifo order.
5976 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5977 -+ * @bfq_back_max: maximum allowed backward seek.
5978 -+ * @bfq_slice_idle: maximum idling time.
5979 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5980 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5981 -+ * async queues.
5982 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5983 -+ * prevent seeky queues from imposing long latencies on well
5984 -+ * behaved ones (this also implies that seeky queues cannot
5985 -+ * receive guarantees in the service domain; after a timeout
5986 -+ * they are charged for the whole allocated budget, to try
5987 -+ * to preserve a behavior reasonably fair among them, but
5988 -+ * without service-domain guarantees).
5989 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5990 -+ * queue is multiplied
5991 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5992 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5993 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5994 -+ * may be reactivated for a queue (in jiffies)
5995 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5996 -+ * after which weight-raising may be
5997 -+ * reactivated for an already busy queue
5998 -+ * (in jiffies)
5999 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
6000 -+ * sectors per second
6001 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
6002 -+ * duration of the weight raising automatically
6003 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
6004 -+ *
6005 -+ * All the fields are protected by the @queue lock.
6006 -+ */
6007 -+struct bfq_data {
6008 -+ struct request_queue *queue;
6009 -+
6010 -+ struct bfq_group *root_group;
6011 -+
6012 -+ struct rb_root rq_pos_tree;
6013 -+
6014 -+ int busy_queues;
6015 -+ int raised_busy_queues;
6016 -+ int queued;
6017 -+ int rq_in_driver;
6018 -+ int sync_flight;
6019 -+
6020 -+ int max_rq_in_driver;
6021 -+ int hw_tag_samples;
6022 -+ int hw_tag;
6023 -+
6024 -+ int budgets_assigned;
6025 -+
6026 -+ struct timer_list idle_slice_timer;
6027 -+ struct work_struct unplug_work;
6028 -+
6029 -+ struct bfq_queue *in_service_queue;
6030 -+ struct bfq_io_cq *in_service_bic;
6031 -+
6032 -+ sector_t last_position;
6033 -+
6034 -+ ktime_t last_budget_start;
6035 -+ ktime_t last_idling_start;
6036 -+ int peak_rate_samples;
6037 -+ u64 peak_rate;
6038 -+ unsigned long bfq_max_budget;
6039 -+
6040 -+ struct hlist_head group_list;
6041 -+ struct list_head active_list;
6042 -+ struct list_head idle_list;
6043 -+
6044 -+ unsigned int bfq_quantum;
6045 -+ unsigned int bfq_fifo_expire[2];
6046 -+ unsigned int bfq_back_penalty;
6047 -+ unsigned int bfq_back_max;
6048 -+ unsigned int bfq_slice_idle;
6049 -+ u64 bfq_class_idle_last_service;
6050 -+
6051 -+ unsigned int bfq_user_max_budget;
6052 -+ unsigned int bfq_max_budget_async_rq;
6053 -+ unsigned int bfq_timeout[2];
6054 -+
6055 -+ bool low_latency;
6056 -+
6057 -+ /* parameters of the low_latency heuristics */
6058 -+ unsigned int bfq_raising_coeff;
6059 -+ unsigned int bfq_raising_max_time;
6060 -+ unsigned int bfq_raising_rt_max_time;
6061 -+ unsigned int bfq_raising_min_idle_time;
6062 -+ unsigned long bfq_raising_min_inter_arr_async;
6063 -+ unsigned int bfq_raising_max_softrt_rate;
6064 -+ u64 RT_prod;
6065 -+
6066 -+ struct bfq_queue oom_bfqq;
6067 -+};
6068 -+
6069 -+enum bfqq_state_flags {
6070 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
6071 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
6072 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
6073 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
6074 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
6075 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
6076 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
6077 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
6078 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6079 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
6080 -+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
6081 -+};
6082 -+
6083 -+#define BFQ_BFQQ_FNS(name) \
6084 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
6085 -+{ \
6086 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
6087 -+} \
6088 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
6089 -+{ \
6090 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
6091 -+} \
6092 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
6093 -+{ \
6094 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
6095 -+}
6096 -+
6097 -+BFQ_BFQQ_FNS(busy);
6098 -+BFQ_BFQQ_FNS(wait_request);
6099 -+BFQ_BFQQ_FNS(must_alloc);
6100 -+BFQ_BFQQ_FNS(fifo_expire);
6101 -+BFQ_BFQQ_FNS(idle_window);
6102 -+BFQ_BFQQ_FNS(prio_changed);
6103 -+BFQ_BFQQ_FNS(sync);
6104 -+BFQ_BFQQ_FNS(budget_new);
6105 -+BFQ_BFQQ_FNS(coop);
6106 -+BFQ_BFQQ_FNS(split_coop);
6107 -+BFQ_BFQQ_FNS(softrt_update);
6108 -+#undef BFQ_BFQQ_FNS
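For readability, this is what a single invocation such as BFQ_BFQQ_FNS(busy) expands to, hand-expanded here purely as an illustration (it is not additional patch content):

/* Expansion of BFQ_BFQQ_FNS(busy): three helpers setting, clearing and
 * testing one bit of bfqq->flags. */
static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}

/* Typical call sites, as in bfq_add_bfqq_busy()/bfq_del_bfqq_busy() earlier
 * in this patch:
 *	bfq_mark_bfqq_busy(bfqq);
 *	if (bfq_bfqq_busy(bfqq))
 *		bfq_clear_bfqq_busy(bfqq);
 */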
6109 -+
6110 -+/* Logging facilities. */
6111 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
6112 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
6113 -+
6114 -+#define bfq_log(bfqd, fmt, args...) \
6115 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
6116 -+
6117 -+/* Expiration reasons. */
6118 -+enum bfqq_expiration {
6119 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
6120 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
6121 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
6122 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
6123 -+};
6124 -+
6125 -+#ifdef CONFIG_CGROUP_BFQIO
6126 -+/**
6127 -+ * struct bfq_group - per (device, cgroup) data structure.
6128 -+ * @entity: schedulable entity to insert into the parent group sched_data.
6129 -+ * @sched_data: own sched_data, to contain child entities (they may be
6130 -+ * both bfq_queues and bfq_groups).
6131 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
6132 -+ * list of the containing cgroup's bfqio_cgroup.
6133 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
6134 -+ * of the groups active on the same device; used for cleanup.
6135 -+ * @bfqd: the bfq_data for the device this group acts upon.
6136 -+ * @async_bfqq: array of async queues for all the tasks belonging to
6137 -+ * the group, one queue per ioprio value per ioprio_class,
6138 -+ * except for the idle class that has only one queue.
6139 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
6140 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
6141 -+ * to avoid too many special cases during group creation/migration.
6142 -+ *
6143 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
6144 -+ * there is a set of bfq_groups, each one collecting the lower-level
6145 -+ * entities belonging to the group that are acting on the same device.
6146 -+ *
6147 -+ * Locking works as follows:
6148 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
6149 -+ * via RCU from its readers.
6150 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
6151 -+ * from the readers.
6152 -+ * o All the other fields are protected by the @bfqd queue lock.
6153 -+ */
6154 -+struct bfq_group {
6155 -+ struct bfq_entity entity;
6156 -+ struct bfq_sched_data sched_data;
6157 -+
6158 -+ struct hlist_node group_node;
6159 -+ struct hlist_node bfqd_node;
6160 -+
6161 -+ void *bfqd;
6162 -+
6163 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6164 -+ struct bfq_queue *async_idle_bfqq;
6165 -+
6166 -+ struct bfq_entity *my_entity;
6167 -+};
6168 -+
6169 -+/**
6170 -+ * struct bfqio_cgroup - bfq cgroup data structure.
6171 -+ * @css: subsystem state for bfq in the containing cgroup.
6172 -+ * @online: flag marked when the subsystem is inserted.
6173 -+ * @weight: cgroup weight.
6174 -+ * @ioprio: cgroup ioprio.
6175 -+ * @ioprio_class: cgroup ioprio_class.
6176 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
6177 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
6178 -+ *
6179 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
6180 -+ * @ioprio and @ioprio_class are protected by @lock.
6181 -+ */
6182 -+struct bfqio_cgroup {
6183 -+ struct cgroup_subsys_state css;
6184 -+ bool online;
6185 -+
6186 -+ unsigned short weight, ioprio, ioprio_class;
6187 -+
6188 -+ spinlock_t lock;
6189 -+ struct hlist_head group_data;
6190 -+};
6191 -+#else
6192 -+struct bfq_group {
6193 -+ struct bfq_sched_data sched_data;
6194 -+
6195 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6196 -+ struct bfq_queue *async_idle_bfqq;
6197 -+};
6198 -+#endif
6199 -+
6200 -+static inline struct bfq_service_tree *
6201 -+bfq_entity_service_tree(struct bfq_entity *entity)
6202 -+{
6203 -+ struct bfq_sched_data *sched_data = entity->sched_data;
6204 -+ unsigned int idx = entity->ioprio_class - 1;
6205 -+
6206 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
6207 -+ BUG_ON(sched_data == NULL);
6208 -+
6209 -+ return sched_data->service_tree + idx;
6210 -+}
6211 -+
6212 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
6213 -+ int is_sync)
6214 -+{
6215 -+ return bic->bfqq[!!is_sync];
6216 -+}
6217 -+
6218 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
6219 -+ struct bfq_queue *bfqq, int is_sync)
6220 -+{
6221 -+ bic->bfqq[!!is_sync] = bfqq;
6222 -+}
6223 -+
6224 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
6225 -+{
6226 -+ return bic->icq.q->elevator->elevator_data;
6227 -+}
6228 -+
6229 -+/**
6230 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
6231 -+ * @ptr: a pointer to a bfqd.
6232 -+ * @flags: storage for the flags to be saved.
6233 -+ *
6234 -+ * This function allows bfqg->bfqd to be protected by the
6235 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
6236 -+ * under RCU, so the storage for bfqd is assured to be safe as long
6237 -+ * as the RCU read side critical section does not end. After the
6238 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
6239 -+ * sure that no other writer accessed it. If we raced with a writer,
6240 -+ * the function returns NULL, with the queue unlocked, otherwise it
6241 -+ * returns the dereferenced pointer, with the queue locked.
6242 -+ */
6243 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
6244 -+ unsigned long *flags)
6245 -+{
6246 -+ struct bfq_data *bfqd;
6247 -+
6248 -+ rcu_read_lock();
6249 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
6250 -+
6251 -+ if (bfqd != NULL) {
6252 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
6253 -+ if (*ptr == bfqd)
6254 -+ goto out;
6255 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6256 -+ }
6257 -+
6258 -+ bfqd = NULL;
6259 -+out:
6260 -+ rcu_read_unlock();
6261 -+ return bfqd;
6262 -+}
6263 -+
6264 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6265 -+ unsigned long *flags)
6266 -+{
6267 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6268 -+}
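A hypothetical caller sketch (not part of the patch) showing how the two helpers above are meant to be paired: dereference the RCU-protected bfqg->bfqd pointer, do the work under the queue lock, then release it.

/* Illustration only; the function name is made up. */
static void example_touch_bfqd(struct bfq_group *bfqg)
{
	unsigned long flags;
	struct bfq_data *bfqd;

	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
	if (bfqd == NULL)
		return; /* raced with a writer; the queue lock is not held */

	/* ... safely access bfqd-protected state here ... */

	bfq_put_bfqd_unlock(bfqd, &flags);
}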
6269 -+
6270 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6271 -+static void bfq_put_queue(struct bfq_queue *bfqq);
6272 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6273 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6274 -+ struct bfq_group *bfqg, int is_sync,
6275 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
6276 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6277 -+ struct bfq_group *bfqg);
6278 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6279 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6280 -+#endif
6281 ---
6282 -1.8.5.2
6283 -
6284
6285 Added: genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
6286 ===================================================================
6287 --- genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 (rev 0)
6288 +++ genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666)
6289 @@ -0,0 +1,6040 @@
6290 +From be5107dc591f7ae692ca7cceecbba72e4c174c37 Mon Sep 17 00:00:00 2001
6291 +From: Paolo Valente <paolo.valente@×××××××.it>
6292 +Date: Thu, 9 May 2013 19:10:02 +0200
6293 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r1 I/O sched for 3.13
6294 +
6295 +Add the BFQ-v7r1 I/O scheduler to 3.13.
6296 +The general structure is borrowed from CFQ, as much of the code for
6297 +handling I/O contexts. Over time, several useful features have been
6298 +ported from CFQ as well (details in the changelog in README.BFQ). A
6299 +(bfq_)queue is associated to each task doing I/O on a device, and each
6300 +time a scheduling decision has to be made a queue is selected and served
6301 +until it expires.
6302 +
6303 + - Slices are given in the service domain: tasks are assigned
6304 + budgets, measured in number of sectors. Once granted the disk, a task
6305 + must nevertheless consume its assigned budget within a configurable
6306 + maximum time (by default, the maximum possible value of the
6307 + budgets is automatically computed to comply with this timeout).
6308 + This allows the desired latency vs "throughput boosting" tradeoff
6309 + to be set.
6310 +
6311 + - Budgets are scheduled according to a variant of WF2Q+, implemented
6312 + using an augmented rb-tree to take eligibility into account while
6313 + preserving an O(log N) overall complexity.
6314 +
6315 + - A low-latency tunable is provided; if enabled, both interactive
6316 + and soft real-time applications are guaranteed a very low latency.
6317 +
6318 + - Latency guarantees are preserved also in the presence of NCQ.
6319 +
6320 + - Also with flash-based devices, a high throughput is achieved
6321 + while still preserving latency guarantees.
6322 +
6323 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
6324 + cooperating-queue-merging and the preemption mechanisms present
6325 + in CFQ. EQM is in fact a unified mechanism that tries to get a
6326 + sequential read pattern, and hence a high throughput, with any
6327 + set of processes performing interleaved I/O over a contiguous
6328 + sequence of sectors.
6329 +
6330 + - BFQ supports full hierarchical scheduling, exporting a cgroups
6331 + interface. Since each node has a full scheduler, each group can
6332 + be assigned its own weight.
6333 +
6334 + - If the cgroups interface is not used, only I/O priorities can be
6335 + assigned to processes, with ioprio values mapped to weights
6336 + with the relation weight = IOPRIO_BE_NR - ioprio.
6337 +
6338 + - ioprio classes are served in strict priority order, i.e., lower
6339 + priority queues are not served as long as there are higher
6340 + priority queues. Among queues in the same class the bandwidth is
6341 + distributed in proportion to the weight of each queue. A very
6342 + thin extra bandwidth is however guaranteed to the Idle class, to
6343 + prevent it from starving.
6344 +
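A worked example of the ioprio-to-weight mapping above (illustrative; it assumes the mainline value IOPRIO_BE_NR == 8): ioprio 0 maps to weight 8, ioprio 4 to weight 4 and ioprio 7 to weight 1, so numerically lower (i.e., higher-priority) ioprio values receive proportionally more bandwidth within their class.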
6345 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
6346 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6347 +---
6348 + block/bfq-cgroup.c | 911 ++++++++++++++
6349 + block/bfq-ioc.c | 36 +
6350 + block/bfq-iosched.c | 3298 +++++++++++++++++++++++++++++++++++++++++++++++++++
6351 + block/bfq-sched.c | 1078 +++++++++++++++++
6352 + block/bfq.h | 614 ++++++++++
6353 + 5 files changed, 5937 insertions(+)
6354 + create mode 100644 block/bfq-cgroup.c
6355 + create mode 100644 block/bfq-ioc.c
6356 + create mode 100644 block/bfq-iosched.c
6357 + create mode 100644 block/bfq-sched.c
6358 + create mode 100644 block/bfq.h
6359 +
6360 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
6361 +new file mode 100644
6362 +index 0000000..79a288a
6363 +--- /dev/null
6364 ++++ b/block/bfq-cgroup.c
6365 +@@ -0,0 +1,911 @@
6366 ++/*
6367 ++ * BFQ: CGROUPS support.
6368 ++ *
6369 ++ * Based on ideas and code from CFQ:
6370 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6371 ++ *
6372 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6373 ++ * Paolo Valente <paolo.valente@×××××××.it>
6374 ++ *
6375 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6376 ++ *
6377 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
6378 ++ */
6379 ++
6380 ++#ifdef CONFIG_CGROUP_BFQIO
6381 ++
6382 ++static DEFINE_MUTEX(bfqio_mutex);
6383 ++
6384 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
6385 ++{
6386 ++ return bgrp ? !bgrp->online : false;
6387 ++}
6388 ++
6389 ++static struct bfqio_cgroup bfqio_root_cgroup = {
6390 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
6391 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
6392 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
6393 ++};
6394 ++
6395 ++static inline void bfq_init_entity(struct bfq_entity *entity,
6396 ++ struct bfq_group *bfqg)
6397 ++{
6398 ++ entity->weight = entity->new_weight;
6399 ++ entity->orig_weight = entity->new_weight;
6400 ++ entity->ioprio = entity->new_ioprio;
6401 ++ entity->ioprio_class = entity->new_ioprio_class;
6402 ++ entity->parent = bfqg->my_entity;
6403 ++ entity->sched_data = &bfqg->sched_data;
6404 ++}
6405 ++
6406 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
6407 ++{
6408 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
6409 ++}
6410 ++
6411 ++/*
6412 ++ * Search the hash table (for now only a list) of bgrp for the bfq_group
6413 ++ * associated with bfqd. Must be called under rcu_read_lock().
6414 ++ */
6415 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
6416 ++ struct bfq_data *bfqd)
6417 ++{
6418 ++ struct bfq_group *bfqg;
6419 ++ void *key;
6420 ++
6421 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
6422 ++ key = rcu_dereference(bfqg->bfqd);
6423 ++ if (key == bfqd)
6424 ++ return bfqg;
6425 ++ }
6426 ++
6427 ++ return NULL;
6428 ++}
6429 ++
6430 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
6431 ++ struct bfq_group *bfqg)
6432 ++{
6433 ++ struct bfq_entity *entity = &bfqg->entity;
6434 ++
6435 ++ /*
6436 ++ * If the weight of the entity has never been set via the sysfs
6437 ++ * interface, then bgrp->weight == 0. In this case we initialize
6438 ++ * the weight from the current ioprio value. Otherwise, the group
6439 ++ * weight, if set, has priority over the ioprio value.
6440 ++ */
6441 ++ if (bgrp->weight == 0) {
6442 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
6443 ++ entity->new_ioprio = bgrp->ioprio;
6444 ++ } else {
6445 ++ entity->new_weight = bgrp->weight;
6446 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
6447 ++ }
6448 ++ entity->orig_weight = entity->weight = entity->new_weight;
6449 ++ entity->ioprio = entity->new_ioprio;
6450 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
6451 ++ entity->my_sched_data = &bfqg->sched_data;
6452 ++}
6453 ++
6454 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
6455 ++ struct bfq_group *parent)
6456 ++{
6457 ++ struct bfq_entity *entity;
6458 ++
6459 ++ BUG_ON(parent == NULL);
6460 ++ BUG_ON(bfqg == NULL);
6461 ++
6462 ++ entity = &bfqg->entity;
6463 ++ entity->parent = parent->my_entity;
6464 ++ entity->sched_data = &parent->sched_data;
6465 ++}
6466 ++
6467 ++/**
6468 ++ * bfq_group_chain_alloc - allocate a chain of groups.
6469 ++ * @bfqd: queue descriptor.
6470 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
6471 ++ *
6472 ++ * Allocate a chain of groups starting from the one belonging to
6473 ++ * @css up to the root cgroup. Stop if a cgroup on the chain
6474 ++ * to the root already has an allocated group on @bfqd.
6475 ++ */
6476 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
6477 ++ struct cgroup_subsys_state *css)
6478 ++{
6479 ++ struct bfqio_cgroup *bgrp;
6480 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
6481 ++
6482 ++ for (; css != NULL; css = css->parent) {
6483 ++ bgrp = css_to_bfqio(css);
6484 ++
6485 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6486 ++ if (bfqg != NULL) {
6487 ++ /*
6488 ++ * All the cgroups in the path from there to the
6489 ++ * root must have a bfq_group for bfqd, so we don't
6490 ++ * need any more allocations.
6491 ++ */
6492 ++ break;
6493 ++ }
6494 ++
6495 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
6496 ++ if (bfqg == NULL)
6497 ++ goto cleanup;
6498 ++
6499 ++ bfq_group_init_entity(bgrp, bfqg);
6500 ++ bfqg->my_entity = &bfqg->entity;
6501 ++
6502 ++ if (leaf == NULL) {
6503 ++ leaf = bfqg;
6504 ++ prev = leaf;
6505 ++ } else {
6506 ++ bfq_group_set_parent(prev, bfqg);
6507 ++ /*
6508 ++ * Build a list of allocated nodes using the bfqd
6509 ++			 * field, which is still unused and will be initialized
6510 ++			 * only after the node is connected.
6511 ++ */
6512 ++ prev->bfqd = bfqg;
6513 ++ prev = bfqg;
6514 ++ }
6515 ++ }
6516 ++
6517 ++ return leaf;
6518 ++
6519 ++cleanup:
6520 ++ while (leaf != NULL) {
6521 ++ prev = leaf;
6522 ++ leaf = leaf->bfqd;
6523 ++ kfree(prev);
6524 ++ }
6525 ++
6526 ++ return NULL;
6527 ++}
6528 ++
6529 ++/**
6530 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
6531 ++ * @bfqd: the queue descriptor.
6532 ++ * @css: the leaf cgroup_subsys_state to start from.
6533 ++ * @leaf: the leaf group (to be associated to @css).
6534 ++ *
6535 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
6536 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
6537 ++ * hierarchy that already has a group associated to @bfqd all the nodes
6538 ++ * in the path to the root cgroup have one too.
6539 ++ *
6540 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
6541 ++ * per device) while the bfqio_cgroup lock protects the list of groups
6542 ++ * belonging to the same cgroup.
6543 ++ */
6544 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
6545 ++ struct cgroup_subsys_state *css,
6546 ++ struct bfq_group *leaf)
6547 ++{
6548 ++ struct bfqio_cgroup *bgrp;
6549 ++ struct bfq_group *bfqg, *next, *prev = NULL;
6550 ++ unsigned long flags;
6551 ++
6552 ++ assert_spin_locked(bfqd->queue->queue_lock);
6553 ++
6554 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
6555 ++ bgrp = css_to_bfqio(css);
6556 ++ next = leaf->bfqd;
6557 ++
6558 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6559 ++ BUG_ON(bfqg != NULL);
6560 ++
6561 ++ spin_lock_irqsave(&bgrp->lock, flags);
6562 ++
6563 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
6564 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
6565 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
6566 ++
6567 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
6568 ++
6569 ++ prev = leaf;
6570 ++ leaf = next;
6571 ++ }
6572 ++
6573 ++ BUG_ON(css == NULL && leaf != NULL);
6574 ++ if (css != NULL && prev != NULL) {
6575 ++ bgrp = css_to_bfqio(css);
6576 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6577 ++ bfq_group_set_parent(prev, bfqg);
6578 ++ }
6579 ++}
6580 ++
6581 ++/**
6582 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @css.
6583 ++ * @bfqd: queue descriptor.
6584 ++ * @css: the cgroup_subsys_state being searched for.
6585 ++ *
6586 ++ * Return a group associated to @bfqd in @css, allocating one if
6587 ++ * necessary. When a group is returned all the cgroups in the path
6588 ++ * to the root have a group associated to @bfqd.
6589 ++ *
6590 ++ * If the allocation fails, return the root group: this breaks guarantees
6591 ++ * but is a safe fallback. If this loss becomes a problem it can be
6592 ++ * mitigated using the equivalent weight (given by the product of the
6593 ++ * weights of the groups in the path from @group to the root) in the
6594 ++ * root scheduler.
6595 ++ *
6596 ++ * We allocate all the missing nodes in the path from the leaf cgroup
6597 ++ * to the root and we connect the nodes only after all the allocations
6598 ++ * have been successful.
6599 ++ */
6600 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
6601 ++ struct cgroup_subsys_state *css)
6602 ++{
6603 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
6604 ++ struct bfq_group *bfqg;
6605 ++
6606 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
6607 ++ if (bfqg != NULL)
6608 ++ return bfqg;
6609 ++
6610 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
6611 ++ if (bfqg != NULL)
6612 ++ bfq_group_chain_link(bfqd, css, bfqg);
6613 ++ else
6614 ++ bfqg = bfqd->root_group;
6615 ++
6616 ++ return bfqg;
6617 ++}
6618 ++
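/*
 * Recap of the two-phase scheme implemented above (summary only, no new
 * code): bfq_find_alloc_group() first lets bfq_group_chain_alloc() allocate
 * every missing group from the leaf css up to the root, temporarily chaining
 * the new nodes through the still-unused bfqg->bfqd field; only if all
 * allocations succeed does bfq_group_chain_link() attach the chain to the
 * cgroup hierarchy, otherwise the partial chain is freed and the root group
 * is used as a fallback.
 */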
6619 ++/**
6620 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
6621 ++ * @bfqd: queue descriptor.
6622 ++ * @bfqq: the queue to move.
6623 ++ * @entity: @bfqq's entity.
6624 ++ * @bfqg: the group to move to.
6625 ++ *
6626 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
6627 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
6628 ++ *
6629 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
6630 ++ * not disappear (for now this just means that we are called under
6631 ++ * rcu_read_lock()).
6632 ++ */
6633 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6634 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
6635 ++{
6636 ++ int busy, resume;
6637 ++
6638 ++ busy = bfq_bfqq_busy(bfqq);
6639 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
6640 ++
6641 ++ BUG_ON(resume && !entity->on_st);
6642 ++ BUG_ON(busy && !resume && entity->on_st &&
6643 ++ bfqq != bfqd->in_service_queue);
6644 ++
6645 ++ if (busy) {
6646 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
6647 ++
6648 ++ if (!resume)
6649 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
6650 ++ else
6651 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
6652 ++ } else if (entity->on_st)
6653 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
6654 ++
6655 ++ /*
6656 ++ * Here we use a reference to bfqg. We don't need a refcounter
6657 ++ * as the cgroup reference will not be dropped, so that its
6658 ++ * destroy() callback will not be invoked.
6659 ++ */
6660 ++ entity->parent = bfqg->my_entity;
6661 ++ entity->sched_data = &bfqg->sched_data;
6662 ++
6663 ++ if (busy && resume)
6664 ++ bfq_activate_bfqq(bfqd, bfqq);
6665 ++
6666 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
6667 ++ bfq_schedule_dispatch(bfqd);
6668 ++}
6669 ++
6670 ++/**
6671 ++ * __bfq_bic_change_cgroup - move @bic to @css.
6672 ++ * @bfqd: the queue descriptor.
6673 ++ * @bic: the bic to move.
6674 ++ * @css: the cgroup_subsys_state to move to.
6675 ++ *
6676 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
6677 ++ * has to make sure that the reference to cgroup is valid across the call.
6678 ++ *
6679 ++ * NOTE: an alternative approach might have been to store the current
6680 ++ * cgroup in bfqq and get a reference to it, reducing the lookup
6681 ++ * time here, at the price of slightly more complex code.
6682 ++ */
6683 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
6684 ++ struct bfq_io_cq *bic,
6685 ++ struct cgroup_subsys_state *css)
6686 ++{
6687 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
6688 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
6689 ++ struct bfq_entity *entity;
6690 ++ struct bfq_group *bfqg;
6691 ++ struct bfqio_cgroup *bgrp;
6692 ++
6693 ++ bgrp = css_to_bfqio(css);
6694 ++
6695 ++ bfqg = bfq_find_alloc_group(bfqd, css);
6696 ++ if (async_bfqq != NULL) {
6697 ++ entity = &async_bfqq->entity;
6698 ++
6699 ++ if (entity->sched_data != &bfqg->sched_data) {
6700 ++ bic_set_bfqq(bic, NULL, 0);
6701 ++ bfq_log_bfqq(bfqd, async_bfqq,
6702 ++ "bic_change_group: %p %d",
6703 ++ async_bfqq, atomic_read(&async_bfqq->ref));
6704 ++ bfq_put_queue(async_bfqq);
6705 ++ }
6706 ++ }
6707 ++
6708 ++ if (sync_bfqq != NULL) {
6709 ++ entity = &sync_bfqq->entity;
6710 ++ if (entity->sched_data != &bfqg->sched_data)
6711 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
6712 ++ }
6713 ++
6714 ++ return bfqg;
6715 ++}
6716 ++
6717 ++/**
6718 ++ * bfq_bic_change_cgroup - move @bic to @css.
6719 ++ * @bic: the bic being migrated.
6720 ++ * @css: the destination cgroup_subsys_state.
6721 ++ *
6722 ++ * When the task owning @bic is moved to @css, @bic is immediately
6723 ++ * moved into its new parent group.
6724 ++ */
6725 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
6726 ++ struct cgroup_subsys_state *css)
6727 ++{
6728 ++ struct bfq_data *bfqd;
6729 ++ unsigned long uninitialized_var(flags);
6730 ++
6731 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
6732 ++ &flags);
6733 ++ if (bfqd != NULL) {
6734 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
6735 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6736 ++ }
6737 ++}
6738 ++
6739 ++/**
6740 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
6741 ++ * @bic: the @bic to update.
6742 ++ *
6743 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
6744 ++ * We need this in addition to moving bics during the cgroup attach
6745 ++ * phase because the task owning @bic could be at its first disk
6746 ++ * access or we may end up in the root cgroup as the result of a
6747 ++ * memory allocation failure and here we try to move to the right
6748 ++ * group.
6749 ++ *
6750 ++ * Must be called under the queue lock. It is safe to use the returned
6751 ++ * value even after the rcu_read_unlock() as the migration/destruction
6752 ++ * paths act under the queue lock too. IOW it is impossible to race with
6753 ++ * group migration/destruction and end up with an invalid group as:
6754 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
6755 ++ * has started execution, as current holds a reference to it,
6756 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
6757 ++ * migrated to a different cgroup] its attach() callback will have
6758 ++ * taken care of removing all the references to the old cgroup data.
6759 ++ */
6760 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
6761 ++{
6762 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
6763 ++ struct bfq_group *bfqg;
6764 ++ struct cgroup_subsys_state *css;
6765 ++
6766 ++ BUG_ON(bfqd == NULL);
6767 ++
6768 ++ rcu_read_lock();
6769 ++ css = task_css(current, bfqio_subsys_id);
6770 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
6771 ++ rcu_read_unlock();
6772 ++
6773 ++ return bfqg;
6774 ++}
6775 ++
6776 ++/**
6777 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
6778 ++ * @st: the service tree being flushed.
6779 ++ */
6780 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
6781 ++{
6782 ++ struct bfq_entity *entity = st->first_idle;
6783 ++
6784 ++ for (; entity != NULL; entity = st->first_idle)
6785 ++ __bfq_deactivate_entity(entity, 0);
6786 ++}
6787 ++
6788 ++/**
6789 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
6790 ++ * @bfqd: the device data structure with the root group.
6791 ++ * @entity: the entity to move.
6792 ++ */
6793 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
6794 ++ struct bfq_entity *entity)
6795 ++{
6796 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
6797 ++
6798 ++ BUG_ON(bfqq == NULL);
6799 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
6800 ++ return;
6801 ++}
6802 ++
6803 ++/**
6804 ++ * bfq_reparent_active_entities - move to the root group all active entities.
6805 ++ * @bfqd: the device data structure with the root group.
6806 ++ * @bfqg: the group to move from.
6807 ++ * @st: the service tree with the entities.
6808 ++ *
6809 ++ * Needs queue_lock to be taken and reference to be valid over the call.
6810 ++ */
6811 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
6812 ++ struct bfq_group *bfqg,
6813 ++ struct bfq_service_tree *st)
6814 ++{
6815 ++ struct rb_root *active = &st->active;
6816 ++ struct bfq_entity *entity = NULL;
6817 ++
6818 ++ if (!RB_EMPTY_ROOT(&st->active))
6819 ++ entity = bfq_entity_of(rb_first(active));
6820 ++
6821 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
6822 ++ bfq_reparent_leaf_entity(bfqd, entity);
6823 ++
6824 ++ if (bfqg->sched_data.in_service_entity != NULL)
6825 ++ bfq_reparent_leaf_entity(bfqd,
6826 ++ bfqg->sched_data.in_service_entity);
6827 ++
6828 ++ return;
6829 ++}
6830 ++
6831 ++/**
6832 ++ * bfq_destroy_group - destroy @bfqg.
6833 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
6834 ++ * @bfqg: the group being destroyed.
6835 ++ *
6836 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
6837 ++ */
6838 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
6839 ++{
6840 ++ struct bfq_data *bfqd;
6841 ++ struct bfq_service_tree *st;
6842 ++ struct bfq_entity *entity = bfqg->my_entity;
6843 ++ unsigned long uninitialized_var(flags);
6844 ++ int i;
6845 ++
6846 ++ hlist_del(&bfqg->group_node);
6847 ++
6848 ++ /*
6849 ++ * Empty all service_trees belonging to this group before deactivating
6850 ++ * the group itself.
6851 ++ */
6852 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
6853 ++ st = bfqg->sched_data.service_tree + i;
6854 ++
6855 ++ /*
6856 ++ * The idle tree may still contain bfq_queues belonging
6857 ++		 * to exited tasks because they never migrated to a different
6858 ++		 * cgroup from the one being destroyed now. No one else
6859 ++ * can access them so it's safe to act without any lock.
6860 ++ */
6861 ++ bfq_flush_idle_tree(st);
6862 ++
6863 ++ /*
6864 ++ * It may happen that some queues are still active
6865 ++ * (busy) upon group destruction (if the corresponding
6866 ++ * processes have been forced to terminate). We move
6867 ++ * all the leaf entities corresponding to these queues
6868 ++ * to the root_group.
6869 ++ * Also, it may happen that the group has an entity
6870 ++ * under service, which is disconnected from the active
6871 ++ * tree: it must be moved, too.
6872 ++ * There is no need to put the sync queues, as the
6873 ++ * scheduler has taken no reference.
6874 ++ */
6875 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6876 ++ if (bfqd != NULL) {
6877 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
6878 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6879 ++ }
6880 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
6881 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
6882 ++ }
6883 ++ BUG_ON(bfqg->sched_data.next_in_service != NULL);
6884 ++ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
6885 ++
6886 ++ /*
6887 ++ * We may race with device destruction, take extra care when
6888 ++ * dereferencing bfqg->bfqd.
6889 ++ */
6890 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6891 ++ if (bfqd != NULL) {
6892 ++ hlist_del(&bfqg->bfqd_node);
6893 ++ __bfq_deactivate_entity(entity, 0);
6894 ++ bfq_put_async_queues(bfqd, bfqg);
6895 ++ bfq_put_bfqd_unlock(bfqd, &flags);
6896 ++ }
6897 ++ BUG_ON(entity->tree != NULL);
6898 ++
6899 ++ /*
6900 ++ * No need to defer the kfree() to the end of the RCU grace
6901 ++ * period: we are called from the destroy() callback of our
6902 ++	 * cgroup, so we can be sure that no one is a) still using
6903 ++ * this cgroup or b) doing lookups in it.
6904 ++ */
6905 ++ kfree(bfqg);
6906 ++}
6907 ++
6908 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
6909 ++{
6910 ++ struct hlist_node *tmp;
6911 ++ struct bfq_group *bfqg;
6912 ++
6913 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
6914 ++ bfq_end_raising_async_queues(bfqd, bfqg);
6915 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
6916 ++}
6917 ++
6918 ++/**
6919 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
6920 ++ * @bfqd: the device descriptor being exited.
6921 ++ *
6922 ++ * When the device exits we just make sure that no lookup can return
6923 ++ * the now unused group structures. They will be deallocated on cgroup
6924 ++ * destruction.
6925 ++ */
6926 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
6927 ++{
6928 ++ struct hlist_node *tmp;
6929 ++ struct bfq_group *bfqg;
6930 ++
6931 ++ bfq_log(bfqd, "disconnect_groups beginning");
6932 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
6933 ++ hlist_del(&bfqg->bfqd_node);
6934 ++
6935 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
6936 ++
6937 ++ /*
6938 ++ * Don't remove from the group hash, just set an
6939 ++ * invalid key. No lookups can race with the
6940 ++ * assignment as bfqd is being destroyed; this
6941 ++ * implies also that new elements cannot be added
6942 ++ * to the list.
6943 ++ */
6944 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
6945 ++
6946 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
6947 ++ bfqg);
6948 ++ bfq_put_async_queues(bfqd, bfqg);
6949 ++ }
6950 ++}
6951 ++
6952 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
6953 ++{
6954 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
6955 ++ struct bfq_group *bfqg = bfqd->root_group;
6956 ++
6957 ++ bfq_put_async_queues(bfqd, bfqg);
6958 ++
6959 ++ spin_lock_irq(&bgrp->lock);
6960 ++ hlist_del_rcu(&bfqg->group_node);
6961 ++ spin_unlock_irq(&bgrp->lock);
6962 ++
6963 ++ /*
6964 ++ * No need to synchronize_rcu() here: since the device is gone
6965 ++ * there cannot be any read-side access to its root_group.
6966 ++ */
6967 ++ kfree(bfqg);
6968 ++}
6969 ++
6970 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
6971 ++{
6972 ++ struct bfq_group *bfqg;
6973 ++ struct bfqio_cgroup *bgrp;
6974 ++ int i;
6975 ++
6976 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
6977 ++ if (bfqg == NULL)
6978 ++ return NULL;
6979 ++
6980 ++ bfqg->entity.parent = NULL;
6981 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6982 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6983 ++
6984 ++ bgrp = &bfqio_root_cgroup;
6985 ++ spin_lock_irq(&bgrp->lock);
6986 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
6987 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
6988 ++ spin_unlock_irq(&bgrp->lock);
6989 ++
6990 ++ return bfqg;
6991 ++}
6992 ++
6993 ++#define SHOW_FUNCTION(__VAR) \
6994 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
6995 ++ struct cftype *cftype) \
6996 ++{ \
6997 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
6998 ++ u64 ret = -ENODEV; \
6999 ++ \
7000 ++ mutex_lock(&bfqio_mutex); \
7001 ++ if (bfqio_is_removed(bgrp)) \
7002 ++ goto out_unlock; \
7003 ++ \
7004 ++ spin_lock_irq(&bgrp->lock); \
7005 ++ ret = bgrp->__VAR; \
7006 ++ spin_unlock_irq(&bgrp->lock); \
7007 ++ \
7008 ++out_unlock: \
7009 ++ mutex_unlock(&bfqio_mutex); \
7010 ++ return ret; \
7011 ++}
7012 ++
7013 ++SHOW_FUNCTION(weight);
7014 ++SHOW_FUNCTION(ioprio);
7015 ++SHOW_FUNCTION(ioprio_class);
7016 ++#undef SHOW_FUNCTION
7017 ++
7018 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
7019 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
7020 ++ struct cftype *cftype, \
7021 ++ u64 val) \
7022 ++{ \
7023 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
7024 ++ struct bfq_group *bfqg; \
7025 ++ int ret = -EINVAL; \
7026 ++ \
7027 ++ if (val < (__MIN) || val > (__MAX)) \
7028 ++ return ret; \
7029 ++ \
7030 ++ ret = -ENODEV; \
7031 ++ mutex_lock(&bfqio_mutex); \
7032 ++ if (bfqio_is_removed(bgrp)) \
7033 ++ goto out_unlock; \
7034 ++ ret = 0; \
7035 ++ \
7036 ++ spin_lock_irq(&bgrp->lock); \
7037 ++ bgrp->__VAR = (unsigned short)val; \
7038 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
7039 ++ /* \
7040 ++ * Setting the ioprio_changed flag of the entity \
7041 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
7042 ++ * the value of the weight to its ioprio mapping. \
7043 ++ * Set the flag only if necessary. \
7044 ++ */ \
7045 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
7046 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
7047 ++ smp_wmb(); \
7048 ++ bfqg->entity.ioprio_changed = 1; \
7049 ++ } \
7050 ++ } \
7051 ++ spin_unlock_irq(&bgrp->lock); \
7052 ++ \
7053 ++out_unlock: \
7054 ++ mutex_unlock(&bfqio_mutex); \
7055 ++ return ret; \
7056 ++}
7057 ++
7058 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
7059 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
7060 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
7061 ++#undef STORE_FUNCTION
7062 ++
7063 ++static struct cftype bfqio_files[] = {
7064 ++ {
7065 ++ .name = "weight",
7066 ++ .read_u64 = bfqio_cgroup_weight_read,
7067 ++ .write_u64 = bfqio_cgroup_weight_write,
7068 ++ },
7069 ++ {
7070 ++ .name = "ioprio",
7071 ++ .read_u64 = bfqio_cgroup_ioprio_read,
7072 ++ .write_u64 = bfqio_cgroup_ioprio_write,
7073 ++ },
7074 ++ {
7075 ++ .name = "ioprio_class",
7076 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
7077 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
7078 ++ },
7079 ++ { }, /* terminate */
7080 ++};
7081 ++
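/*
 * Illustration (an assumption based on the usual cgroup naming convention,
 * not something stated by this patch): with the subsystem registered as
 * "bfqio" below, the cftypes above are expected to appear in the cgroup
 * filesystem as bfqio.weight, bfqio.ioprio and bfqio.ioprio_class; writes
 * outside the [__MIN, __MAX] range passed to STORE_FUNCTION() return -EINVAL.
 */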
7082 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
7083 ++ *parent_css)
7084 ++{
7085 ++ struct bfqio_cgroup *bgrp;
7086 ++
7087 ++ if (parent_css != NULL) {
7088 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
7089 ++ if (bgrp == NULL)
7090 ++ return ERR_PTR(-ENOMEM);
7091 ++ } else
7092 ++ bgrp = &bfqio_root_cgroup;
7093 ++
7094 ++ spin_lock_init(&bgrp->lock);
7095 ++ INIT_HLIST_HEAD(&bgrp->group_data);
7096 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
7097 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
7098 ++
7099 ++ return &bgrp->css;
7100 ++}
7101 ++
7102 ++/*
7103 ++ * We cannot support shared io contexts, as we have no means to support
7104 ++ * two tasks with the same ioc in two different groups without major rework
7105 ++ * of the main bic/bfqq data structures. For now we allow a task to change
7106 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
7107 ++ * behavior is that a group containing a task that forked using CLONE_IO
7108 ++ * will not be destroyed until the tasks sharing the ioc die.
7109 ++ */
7110 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
7111 ++ struct cgroup_taskset *tset)
7112 ++{
7113 ++ struct task_struct *task;
7114 ++ struct io_context *ioc;
7115 ++ int ret = 0;
7116 ++
7117 ++ cgroup_taskset_for_each(task, css, tset) {
7118 ++ /*
7119 ++ * task_lock() is needed to avoid races with
7120 ++ * exit_io_context()
7121 ++ */
7122 ++ task_lock(task);
7123 ++ ioc = task->io_context;
7124 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
7125 ++ /*
7126 ++ * ioc == NULL means that the task is either too young
7127 ++			 * or exiting: if it still has no ioc the ioc can't be
7128 ++ * shared, if the task is exiting the attach will fail
7129 ++ * anyway, no matter what we return here.
7130 ++ */
7131 ++ ret = -EINVAL;
7132 ++ task_unlock(task);
7133 ++ if (ret)
7134 ++ break;
7135 ++ }
7136 ++
7137 ++ return ret;
7138 ++}
7139 ++
7140 ++static void bfqio_attach(struct cgroup_subsys_state *css,
7141 ++ struct cgroup_taskset *tset)
7142 ++{
7143 ++ struct task_struct *task;
7144 ++ struct io_context *ioc;
7145 ++ struct io_cq *icq;
7146 ++
7147 ++ /*
7148 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
7149 ++ * new group has not yet been tested.
7150 ++ */
7151 ++ cgroup_taskset_for_each(task, css, tset) {
7152 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
7153 ++ if (ioc) {
7154 ++ /*
7155 ++ * Handle cgroup change here.
7156 ++ */
7157 ++ rcu_read_lock();
7158 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
7159 ++ if (!strncmp(
7160 ++ icq->q->elevator->type->elevator_name,
7161 ++ "bfq", ELV_NAME_MAX))
7162 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
7163 ++ css);
7164 ++ rcu_read_unlock();
7165 ++ put_io_context(ioc);
7166 ++ }
7167 ++ }
7168 ++}
7169 ++
7170 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
7171 ++{
7172 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7173 ++ struct hlist_node *tmp;
7174 ++ struct bfq_group *bfqg;
7175 ++
7176 ++ /*
7177 ++ * Since we are destroying the cgroup, there are no more tasks
7178 ++ * referencing it, and all the RCU grace periods that may have
7179 ++ * referenced it are ended (as the destruction of the parent
7180 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
7181 ++ * anything else and we don't need any synchronization.
7182 ++ */
7183 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
7184 ++ bfq_destroy_group(bgrp, bfqg);
7185 ++
7186 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
7187 ++
7188 ++ kfree(bgrp);
7189 ++}
7190 ++
7191 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
7192 ++{
7193 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7194 ++
7195 ++ mutex_lock(&bfqio_mutex);
7196 ++ bgrp->online = true;
7197 ++ mutex_unlock(&bfqio_mutex);
7198 ++
7199 ++ return 0;
7200 ++}
7201 ++
7202 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
7203 ++{
7204 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7205 ++
7206 ++ mutex_lock(&bfqio_mutex);
7207 ++ bgrp->online = false;
7208 ++ mutex_unlock(&bfqio_mutex);
7209 ++}
7210 ++
7211 ++struct cgroup_subsys bfqio_subsys = {
7212 ++ .name = "bfqio",
7213 ++ .css_alloc = bfqio_create,
7214 ++ .css_online = bfqio_css_online,
7215 ++ .css_offline = bfqio_css_offline,
7216 ++ .can_attach = bfqio_can_attach,
7217 ++ .attach = bfqio_attach,
7218 ++ .css_free = bfqio_destroy,
7219 ++ .subsys_id = bfqio_subsys_id,
7220 ++ .base_cftypes = bfqio_files,
7221 ++};
7222 ++#else
7223 ++static inline void bfq_init_entity(struct bfq_entity *entity,
7224 ++ struct bfq_group *bfqg)
7225 ++{
7226 ++ entity->weight = entity->new_weight;
7227 ++ entity->orig_weight = entity->new_weight;
7228 ++ entity->ioprio = entity->new_ioprio;
7229 ++ entity->ioprio_class = entity->new_ioprio_class;
7230 ++ entity->sched_data = &bfqg->sched_data;
7231 ++}
7232 ++
7233 ++static inline struct bfq_group *
7234 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7235 ++{
7236 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
7237 ++ return bfqd->root_group;
7238 ++}
7239 ++
7240 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
7241 ++ struct bfq_queue *bfqq,
7242 ++ struct bfq_entity *entity,
7243 ++ struct bfq_group *bfqg)
7244 ++{
7245 ++}
7246 ++
7247 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
7248 ++{
7249 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
7250 ++}
7251 ++
7252 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
7253 ++{
7254 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
7255 ++}
7256 ++
7257 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
7258 ++{
7259 ++ kfree(bfqd->root_group);
7260 ++}
7261 ++
7262 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7263 ++{
7264 ++ struct bfq_group *bfqg;
7265 ++ int i;
7266 ++
7267 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7268 ++ if (bfqg == NULL)
7269 ++ return NULL;
7270 ++
7271 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7272 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7273 ++
7274 ++ return bfqg;
7275 ++}
7276 ++#endif
7277 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
7278 +new file mode 100644
7279 +index 0000000..7f6b000
7280 +--- /dev/null
7281 ++++ b/block/bfq-ioc.c
7282 +@@ -0,0 +1,36 @@
7283 ++/*
7284 ++ * BFQ: I/O context handling.
7285 ++ *
7286 ++ * Based on ideas and code from CFQ:
7287 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7288 ++ *
7289 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7290 ++ * Paolo Valente <paolo.valente@×××××××.it>
7291 ++ *
7292 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7293 ++ */
7294 ++
7295 ++/**
7296 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
7297 ++ * @icq: the iocontext queue.
7298 ++ */
7299 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
7300 ++{
7301 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
7302 ++ return container_of(icq, struct bfq_io_cq, icq);
7303 ++}
7304 ++
7305 ++/**
7306 ++ * bfq_bic_lookup - search @ioc for a bic associated to @bfqd.
7307 ++ * @bfqd: the lookup key.
7308 ++ * @ioc: the io_context of the process doing I/O.
7309 ++ *
7310 ++ * Queue lock must be held.
7311 ++ */
7312 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
7313 ++ struct io_context *ioc)
7314 ++{
7315 ++ if (ioc)
7316 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
7317 ++ return NULL;
7318 ++}
7319 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7320 +new file mode 100644
7321 +index 0000000..eb760de
7322 +--- /dev/null
7323 ++++ b/block/bfq-iosched.c
7324 +@@ -0,0 +1,3298 @@
7325 ++/*
7326 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
7327 ++ *
7328 ++ * Based on ideas and code from CFQ:
7329 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7330 ++ *
7331 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7332 ++ * Paolo Valente <paolo.valente@×××××××.it>
7333 ++ *
7334 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7335 ++ *
7336 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7337 ++ *
7338 ++ * BFQ is a proportional share disk scheduling algorithm based on the
7339 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
7340 ++ * number of sectors, to tasks instead of time slices. The disk is not granted
7341 ++ * to the in-service task for a given time slice, but until it has exhausted
7342 ++ * its assigned budget. This change from the time to the service domain allows
7343 ++ * BFQ to distribute the disk bandwidth among tasks as desired, without any
7344 ++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
7345 ++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
7346 ++ * their budgets (more precisely BFQ schedules queues associated to tasks).
7347 ++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
7348 ++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
7349 ++ * latencies to interactive and soft real-time applications.
7350 ++ *
7351 ++ * BFQ is described in [1], which also contains a reference to the initial,
7352 ++ * more theoretical paper on BFQ. The interested reader can find in
7353 ++ * the latter paper full details on the main algorithm as well as formulas of
7354 ++ * the guarantees, plus formal proofs of all the properties. With respect to
7355 ++ * the version of BFQ presented in these papers, this implementation adds a
7356 ++ * few more heuristics, such as the one that guarantees a low latency to soft
7357 ++ * real-time applications, and a hierarchical extension based on H-WF2Q+.
7358 ++ *
7359 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
7360 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
7361 ++ * complexity derives from the one introduced with EEVDF in [3].
7362 ++ *
7363 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
7364 ++ * with the BFQ Disk I/O Scheduler'',
7365 ++ * Proceedings of the 5th Annual International Systems and Storage
7366 ++ * Conference (SYSTOR '12), June 2012.
7367 ++ *
7368 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
7369 ++ *
7370 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
7371 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
7372 ++ * Oct 1997.
7373 ++ *
7374 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
7375 ++ *
7376 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
7377 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
7378 ++ * Resource Allocation,'' technical report.
7379 ++ *
7380 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
7381 ++ */
7382 ++#include <linux/module.h>
7383 ++#include <linux/slab.h>
7384 ++#include <linux/blkdev.h>
7385 ++#include <linux/cgroup.h>
7386 ++#include <linux/elevator.h>
7387 ++#include <linux/jiffies.h>
7388 ++#include <linux/rbtree.h>
7389 ++#include <linux/ioprio.h>
7390 ++#include "bfq.h"
7391 ++#include "blk.h"
7392 ++
7393 ++/* Max number of dispatches in one round of service. */
7394 ++static const int bfq_quantum = 4;
7395 ++
7396 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
7397 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
7398 ++
7399 ++/* Maximum backwards seek, in KiB. */
7400 ++static const int bfq_back_max = 16 * 1024;
7401 ++
7402 ++/* Penalty of a backwards seek, in number of sectors. */
7403 ++static const int bfq_back_penalty = 2;
7404 ++
7405 ++/* Idling period duration, in jiffies. */
7406 ++static int bfq_slice_idle = HZ / 125;
7407 ++
7408 ++/* Default maximum budget values, in sectors and number of requests. */
7409 ++static const int bfq_default_max_budget = 16 * 1024;
7410 ++static const int bfq_max_budget_async_rq = 4;
7411 ++
7412 ++/*
7413 ++ * Async to sync throughput distribution is controlled as follows:
7414 ++ * when an async request is served, the entity is charged the number
7415 ++ * of sectors of the request, multiplied by the factor below
7416 ++ */
7417 ++static const int bfq_async_charge_factor = 10;
7418 ++
7419 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
7420 ++static const int bfq_timeout_sync = HZ / 8;
7421 ++static int bfq_timeout_async = HZ / 25;
7422 ++
7423 ++struct kmem_cache *bfq_pool;
7424 ++
7425 ++/* Below this threshold (in ms), we consider thinktime immediate. */
7426 ++#define BFQ_MIN_TT 2
7427 ++
7428 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
7429 ++#define BFQ_HW_QUEUE_THRESHOLD 4
7430 ++#define BFQ_HW_QUEUE_SAMPLES 32
7431 ++
7432 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
7433 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
7434 ++
7435 ++/* Min samples used for peak rate estimation (for autotuning). */
7436 ++#define BFQ_PEAK_RATE_SAMPLES 32
7437 ++
7438 ++/* Shift used for peak rate fixed precision calculations. */
7439 ++#define BFQ_RATE_SHIFT 16
7440 ++
7441 ++/*
7442 ++ * The duration of the weight raising for interactive applications is
7443 ++ * computed automatically (as default behaviour), using the following
7444 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
7445 ++ * disk, and R and T are two reference parameters. In particular, R is
7446 ++ * the peak rate of a reference disk, and T is about the maximum time
7447 ++ * for starting popular large applications on that disk, under BFQ and
7448 ++ * while reading two files in parallel. Finally, BFQ uses two
7449 ++ * different pairs (R, T) depending on whether the disk is rotational
7450 ++ * or non-rotational.
7451 ++ */
7452 ++#define T_rot (msecs_to_jiffies(5500))
7453 ++#define T_nonrot (msecs_to_jiffies(2000))
7454 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
7455 ++#define R_rot 17415
7456 ++#define R_nonrot 34791
7457 ++
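/*
 * Hedged sketch of the formula above (illustrative only; the function name
 * and parameters below are made up and are not part of the patch): a device
 * whose measured peak rate equals the reference rate is assigned the full
 * reference duration (about 5500 ms rotational, 2000 ms non-rotational),
 * while a device twice as fast is assigned half of it.
 */
static inline u64 example_wr_duration(u32 peak_rate, int rotational)
{
	u64 dur = rotational ? (u64)R_rot * T_rot : (u64)R_nonrot * T_nonrot;

	do_div(dur, peak_rate);	/* duration = (R * T) / r, in jiffies */
	return dur;
}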
7458 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
7459 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
7460 ++
7461 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
7462 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
7463 ++
7464 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
7465 ++
7466 ++#include "bfq-ioc.c"
7467 ++#include "bfq-sched.c"
7468 ++#include "bfq-cgroup.c"
7469 ++
7470 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
7471 ++ IOPRIO_CLASS_IDLE)
7472 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
7473 ++ IOPRIO_CLASS_RT)
7474 ++
7475 ++#define bfq_sample_valid(samples) ((samples) > 80)
7476 ++
7477 ++/*
7478 ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit
7479 ++ * set (in which case it could also be a direct WRITE).
7480 ++ */
7481 ++static inline int bfq_bio_sync(struct bio *bio)
7482 ++{
7483 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
7484 ++ return 1;
7485 ++
7486 ++ return 0;
7487 ++}
7488 ++
7489 ++/*
7490 ++ * Scheduler run of queue, if there are requests pending and no one in the
7491 ++ * driver that will restart queueing.
7492 ++ */
7493 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
7494 ++{
7495 ++ if (bfqd->queued != 0) {
7496 ++ bfq_log(bfqd, "schedule dispatch");
7497 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
7498 ++ }
7499 ++}
7500 ++
7501 ++/*
7502 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
7503 ++ * We choose the request that is closest to the head right now. Distance
7504 ++ * behind the head is penalized and only allowed to a certain extent.
7505 ++ */
7506 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
7507 ++ struct request *rq1,
7508 ++ struct request *rq2,
7509 ++ sector_t last)
7510 ++{
7511 ++ sector_t s1, s2, d1 = 0, d2 = 0;
7512 ++ unsigned long back_max;
7513 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
7514 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
7515 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
7516 ++
7517 ++ if (rq1 == NULL || rq1 == rq2)
7518 ++ return rq2;
7519 ++ if (rq2 == NULL)
7520 ++ return rq1;
7521 ++
7522 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
7523 ++ return rq1;
7524 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
7525 ++ return rq2;
7526 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
7527 ++ return rq1;
7528 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
7529 ++ return rq2;
7530 ++
7531 ++ s1 = blk_rq_pos(rq1);
7532 ++ s2 = blk_rq_pos(rq2);
7533 ++
7534 ++ /*
7535 ++ * By definition, 1KiB is 2 sectors.
7536 ++ */
7537 ++ back_max = bfqd->bfq_back_max * 2;
7538 ++
7539 ++ /*
7540 ++ * Strict one way elevator _except_ in the case where we allow
7541 ++ * short backward seeks which are biased as twice the cost of a
7542 ++ * similar forward seek.
7543 ++ */
7544 ++ if (s1 >= last)
7545 ++ d1 = s1 - last;
7546 ++ else if (s1 + back_max >= last)
7547 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
7548 ++ else
7549 ++ wrap |= BFQ_RQ1_WRAP;
7550 ++
7551 ++ if (s2 >= last)
7552 ++ d2 = s2 - last;
7553 ++ else if (s2 + back_max >= last)
7554 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
7555 ++ else
7556 ++ wrap |= BFQ_RQ2_WRAP;
7557 ++
7558 ++ /* Found required data */
7559 ++
7560 ++ /*
7561 ++ * By doing switch() on the bit mask "wrap" we avoid having to
7562 ++ * check two variables for all permutations: --> faster!
7563 ++ */
7564 ++ switch (wrap) {
7565 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
7566 ++ if (d1 < d2)
7567 ++ return rq1;
7568 ++ else if (d2 < d1)
7569 ++ return rq2;
7570 ++ else {
7571 ++ if (s1 >= s2)
7572 ++ return rq1;
7573 ++ else
7574 ++ return rq2;
7575 ++ }
7576 ++
7577 ++ case BFQ_RQ2_WRAP:
7578 ++ return rq1;
7579 ++ case BFQ_RQ1_WRAP:
7580 ++ return rq2;
7581 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
7582 ++ default:
7583 ++ /*
7584 ++ * Since both rqs are wrapped,
7585 ++ * start with the one that's further behind head
7586 ++ * (--> only *one* back seek required),
7587 ++ * since back seek takes more time than forward.
7588 ++ */
7589 ++ if (s1 <= s2)
7590 ++ return rq1;
7591 ++ else
7592 ++ return rq2;
7593 ++ }
7594 ++}
7595 ++
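/*
 * Worked example for the backward-seek handling above (numbers follow from
 * the defaults defined earlier in this file): with bfq_back_max = 16384 KiB,
 * back_max = 32768 sectors, so a request 100 sectors behind the head is
 * still eligible but at a penalized distance of 100 * bfq_back_penalty = 200
 * sectors, while a request more than 32768 sectors behind is treated as
 * wrapped and only chosen when the alternative wraps too.
 */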
7596 ++static struct bfq_queue *
7597 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
7598 ++ sector_t sector, struct rb_node **ret_parent,
7599 ++ struct rb_node ***rb_link)
7600 ++{
7601 ++ struct rb_node **p, *parent;
7602 ++ struct bfq_queue *bfqq = NULL;
7603 ++
7604 ++ parent = NULL;
7605 ++ p = &root->rb_node;
7606 ++ while (*p) {
7607 ++ struct rb_node **n;
7608 ++
7609 ++ parent = *p;
7610 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7611 ++
7612 ++ /*
7613 ++ * Sort strictly based on sector. Smallest to the left,
7614 ++ * largest to the right.
7615 ++ */
7616 ++ if (sector > blk_rq_pos(bfqq->next_rq))
7617 ++ n = &(*p)->rb_right;
7618 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
7619 ++ n = &(*p)->rb_left;
7620 ++ else
7621 ++ break;
7622 ++ p = n;
7623 ++ bfqq = NULL;
7624 ++ }
7625 ++
7626 ++ *ret_parent = parent;
7627 ++ if (rb_link)
7628 ++ *rb_link = p;
7629 ++
7630 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
7631 ++ (long long unsigned)sector,
7632 ++ bfqq != NULL ? bfqq->pid : 0);
7633 ++
7634 ++ return bfqq;
7635 ++}
7636 ++
7637 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7638 ++{
7639 ++ struct rb_node **p, *parent;
7640 ++ struct bfq_queue *__bfqq;
7641 ++
7642 ++ if (bfqq->pos_root != NULL) {
7643 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7644 ++ bfqq->pos_root = NULL;
7645 ++ }
7646 ++
7647 ++ if (bfq_class_idle(bfqq))
7648 ++ return;
7649 ++ if (!bfqq->next_rq)
7650 ++ return;
7651 ++
7652 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
7653 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
7654 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
7655 ++ if (__bfqq == NULL) {
7656 ++ rb_link_node(&bfqq->pos_node, parent, p);
7657 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
7658 ++ } else
7659 ++ bfqq->pos_root = NULL;
7660 ++}
7661 ++
7662 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
7663 ++ struct bfq_queue *bfqq,
7664 ++ struct request *last)
7665 ++{
7666 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
7667 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
7668 ++ struct request *next = NULL, *prev = NULL;
7669 ++
7670 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
7671 ++
7672 ++ if (rbprev != NULL)
7673 ++ prev = rb_entry_rq(rbprev);
7674 ++
7675 ++ if (rbnext != NULL)
7676 ++ next = rb_entry_rq(rbnext);
7677 ++ else {
7678 ++ rbnext = rb_first(&bfqq->sort_list);
7679 ++ if (rbnext && rbnext != &last->rb_node)
7680 ++ next = rb_entry_rq(rbnext);
7681 ++ }
7682 ++
7683 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
7684 ++}
7685 ++
7686 ++static void bfq_del_rq_rb(struct request *rq)
7687 ++{
7688 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7689 ++ struct bfq_data *bfqd = bfqq->bfqd;
7690 ++ const int sync = rq_is_sync(rq);
7691 ++
7692 ++ BUG_ON(bfqq->queued[sync] == 0);
7693 ++ bfqq->queued[sync]--;
7694 ++ bfqd->queued--;
7695 ++
7696 ++ elv_rb_del(&bfqq->sort_list, rq);
7697 ++
7698 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
7699 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
7700 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
7701 ++ /*
7702 ++ * Remove queue from request-position tree as it is empty.
7703 ++ */
7704 ++ if (bfqq->pos_root != NULL) {
7705 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7706 ++ bfqq->pos_root = NULL;
7707 ++ }
7708 ++ }
7709 ++}
7710 ++
7711 ++/* see the definition of bfq_async_charge_factor for details */
7712 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
7713 ++ struct bfq_queue *bfqq)
7714 ++{
7715 ++ return blk_rq_sectors(rq) *
7716 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
7717 ++ bfq_async_charge_factor));
7718 ++}
7719 ++
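/*
 * Worked example for the charging rule above (derived directly from the
 * expression in bfq_serv_to_charge()): a sync request is charged exactly its
 * size in sectors, while an async request from a queue that is not being
 * weight-raised (raising_coeff == 1) is charged (1 + bfq_async_charge_factor)
 * = 11 times its size, which is how the async-to-sync throughput distribution
 * described near the top of this file is enforced.
 */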
7720 ++/**
7721 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
7722 ++ * @bfqd: the device data the queue belongs to.
7723 ++ * @bfqq: the queue to update.
7724 ++ *
7725 ++ * If the first request of a queue changes we make sure that the queue
7726 ++ * has enough budget to serve at least its first request (if the
7727 ++ * request has grown). We do this because if the queue does not have enough
7728 ++ * budget for its first request, it has to go through two dispatch
7729 ++ * rounds to actually get it dispatched.
7730 ++ */
7731 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
7732 ++ struct bfq_queue *bfqq)
7733 ++{
7734 ++ struct bfq_entity *entity = &bfqq->entity;
7735 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
7736 ++ struct request *next_rq = bfqq->next_rq;
7737 ++ unsigned long new_budget;
7738 ++
7739 ++ if (next_rq == NULL)
7740 ++ return;
7741 ++
7742 ++ if (bfqq == bfqd->in_service_queue)
7743 ++ /*
7744 ++ * In order not to break guarantees, budgets cannot be
7745 ++ * changed after an entity has been selected.
7746 ++ */
7747 ++ return;
7748 ++
7749 ++ BUG_ON(entity->tree != &st->active);
7750 ++ BUG_ON(entity == entity->sched_data->in_service_entity);
7751 ++
7752 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
7753 ++ bfq_serv_to_charge(next_rq, bfqq));
7754 ++ entity->budget = new_budget;
7755 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
7756 ++ bfq_activate_bfqq(bfqd, bfqq);
7757 ++}
7758 ++
7759 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
7760 ++{
7761 ++ u64 dur;
7762 ++
7763 ++ if (bfqd->bfq_raising_max_time > 0)
7764 ++ return bfqd->bfq_raising_max_time;
7765 ++
7766 ++ dur = bfqd->RT_prod;
7767 ++ do_div(dur, bfqd->peak_rate);
7768 ++
7769 ++ return dur;
7770 ++}
7771 ++
7772 ++static void bfq_add_rq_rb(struct request *rq)
7773 ++{
7774 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7775 ++ struct bfq_entity *entity = &bfqq->entity;
7776 ++ struct bfq_data *bfqd = bfqq->bfqd;
7777 ++ struct request *next_rq, *prev;
7778 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
7779 ++ int idle_for_long_time = 0;
7780 ++
7781 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
7782 ++ bfqq->queued[rq_is_sync(rq)]++;
7783 ++ bfqd->queued++;
7784 ++
7785 ++ elv_rb_add(&bfqq->sort_list, rq);
7786 ++
7787 ++ /*
7788 ++ * Check if this request is a better next-serve candidate.
7789 ++ */
7790 ++ prev = bfqq->next_rq;
7791 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
7792 ++ BUG_ON(next_rq == NULL);
7793 ++ bfqq->next_rq = next_rq;
7794 ++
7795 ++ /*
7796 ++ * Adjust priority tree position, if next_rq changes.
7797 ++ */
7798 ++ if (prev != bfqq->next_rq)
7799 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
7800 ++
7801 ++ if (!bfq_bfqq_busy(bfqq)) {
7802 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
7803 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
7804 ++ idle_for_long_time = time_is_before_jiffies(
7805 ++ bfqq->budget_timeout +
7806 ++ bfqd->bfq_raising_min_idle_time);
7807 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
7808 ++ bfq_serv_to_charge(next_rq, bfqq));
7809 ++
7810 ++ if (!bfqd->low_latency)
7811 ++ goto add_bfqq_busy;
7812 ++
7813 ++ /*
7814 ++ * If the queue is not being boosted and has been idle
7815 ++ * for enough time, start a weight-raising period
7816 ++ */
7817 ++ if (old_raising_coeff == 1 &&
7818 ++ (idle_for_long_time || soft_rt)) {
7819 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7820 ++ if (idle_for_long_time)
7821 ++ bfqq->raising_cur_max_time =
7822 ++ bfq_wrais_duration(bfqd);
7823 ++ else
7824 ++ bfqq->raising_cur_max_time =
7825 ++ bfqd->bfq_raising_rt_max_time;
7826 ++ bfq_log_bfqq(bfqd, bfqq,
7827 ++ "wrais starting at %lu, "
7828 ++ "rais_max_time %u",
7829 ++ jiffies,
7830 ++ jiffies_to_msecs(bfqq->
7831 ++ raising_cur_max_time));
7832 ++ } else if (old_raising_coeff > 1) {
7833 ++ if (idle_for_long_time)
7834 ++ bfqq->raising_cur_max_time =
7835 ++ bfq_wrais_duration(bfqd);
7836 ++ else if (bfqq->raising_cur_max_time ==
7837 ++ bfqd->bfq_raising_rt_max_time &&
7838 ++ !soft_rt) {
7839 ++ bfqq->raising_coeff = 1;
7840 ++ bfq_log_bfqq(bfqd, bfqq,
7841 ++ "wrais ending at %lu, "
7842 ++ "rais_max_time %u",
7843 ++ jiffies,
7844 ++ jiffies_to_msecs(bfqq->
7845 ++ raising_cur_max_time));
7846 ++ } else if (time_before(
7847 ++ bfqq->last_rais_start_finish +
7848 ++ bfqq->raising_cur_max_time,
7849 ++ jiffies +
7850 ++ bfqd->bfq_raising_rt_max_time) &&
7851 ++ soft_rt) {
7852 ++ /*
7853 ++ *
7854 ++ * The remaining weight-raising time is lower
7855 ++ * than bfqd->bfq_raising_rt_max_time, which
7856 ++ * means that the application is enjoying
7857 ++ * weight raising either because deemed soft rt
7858 ++ * in the near past, or because deemed
7859 ++				 * interactive long ago. In both cases,
7860 ++ * resetting now the current remaining weight-
7861 ++ * raising time for the application to the
7862 ++ * weight-raising duration for soft rt
7863 ++ * applications would not cause any latency
7864 ++ * increase for the application (as the new
7865 ++ * duration would be higher than the remaining
7866 ++ * time).
7867 ++ *
7868 ++ * In addition, the application is now meeting
7869 ++ * the requirements for being deemed soft rt.
7870 ++ * In the end we can correctly and safely
7871 ++ * (re)charge the weight-raising duration for
7872 ++ * the application with the weight-raising
7873 ++ * duration for soft rt applications.
7874 ++ *
7875 ++ * In particular, doing this recharge now, i.e.,
7876 ++ * before the weight-raising period for the
7877 ++ * application finishes, reduces the probability
7878 ++ * of the following negative scenario:
7879 ++ * 1) the weight of a soft rt application is
7880 ++ * raised at startup (as for any newly
7881 ++ * created application),
7882 ++ * 2) since the application is not interactive,
7883 ++ * at a certain time weight-raising is
7884 ++ * stopped for the application,
7885 ++ * 3) at that time the application happens to
7886 ++ * still have pending requests, and hence
7887 ++ * is destined to not have a chance to be
7888 ++ * deemed soft rt before these requests are
7889 ++ * completed (see the comments to the
7890 ++ * function bfq_bfqq_softrt_next_start()
7891 ++ * for details on soft rt detection),
7892 ++ * 4) these pending requests experience a high
7893 ++ * latency because the application is not
7894 ++ * weight-raised while they are pending.
7895 ++ */
7896 ++ bfqq->last_rais_start_finish = jiffies;
7897 ++ bfqq->raising_cur_max_time =
7898 ++ bfqd->bfq_raising_rt_max_time;
7899 ++ }
7900 ++ }
7901 ++ if (old_raising_coeff != bfqq->raising_coeff)
7902 ++ entity->ioprio_changed = 1;
7903 ++add_bfqq_busy:
7904 ++ bfqq->last_idle_bklogged = jiffies;
7905 ++ bfqq->service_from_backlogged = 0;
7906 ++ bfq_clear_bfqq_softrt_update(bfqq);
7907 ++ bfq_add_bfqq_busy(bfqd, bfqq);
7908 ++ } else {
7909 ++ if (bfqd->low_latency && old_raising_coeff == 1 &&
7910 ++ !rq_is_sync(rq) &&
7911 ++ time_is_before_jiffies(
7912 ++ bfqq->last_rais_start_finish +
7913 ++ bfqd->bfq_raising_min_inter_arr_async)) {
7914 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7915 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
7916 ++
7917 ++ bfqd->raised_busy_queues++;
7918 ++ entity->ioprio_changed = 1;
7919 ++ bfq_log_bfqq(bfqd, bfqq,
7920 ++ "non-idle wrais starting at %lu, "
7921 ++ "rais_max_time %u",
7922 ++ jiffies,
7923 ++ jiffies_to_msecs(bfqq->
7924 ++ raising_cur_max_time));
7925 ++ }
7926 ++ bfq_updated_next_req(bfqd, bfqq);
7927 ++ }
7928 ++
7929 ++ if (bfqd->low_latency &&
7930 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
7931 ++ idle_for_long_time))
7932 ++ bfqq->last_rais_start_finish = jiffies;
7933 ++}
7934 ++
7935 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
7936 ++{
7937 ++ elv_rb_del(&bfqq->sort_list, rq);
7938 ++ bfqq->queued[rq_is_sync(rq)]--;
7939 ++ bfqq->bfqd->queued--;
7940 ++ bfq_add_rq_rb(rq);
7941 ++}
7942 ++
7943 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
7944 ++ struct bio *bio)
7945 ++{
7946 ++ struct task_struct *tsk = current;
7947 ++ struct bfq_io_cq *bic;
7948 ++ struct bfq_queue *bfqq;
7949 ++
7950 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
7951 ++ if (bic == NULL)
7952 ++ return NULL;
7953 ++
7954 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7955 ++ if (bfqq != NULL)
7956 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
7957 ++
7958 ++ return NULL;
7959 ++}
7960 ++
7961 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
7962 ++{
7963 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7964 ++
7965 ++ bfqd->rq_in_driver++;
7966 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
7967 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
7968 ++ (long long unsigned)bfqd->last_position);
7969 ++}
7970 ++
7971 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
7972 ++{
7973 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
7974 ++
7975 ++ WARN_ON(bfqd->rq_in_driver == 0);
7976 ++ bfqd->rq_in_driver--;
7977 ++}
7978 ++
7979 ++static void bfq_remove_request(struct request *rq)
7980 ++{
7981 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7982 ++ struct bfq_data *bfqd = bfqq->bfqd;
7983 ++
7984 ++ if (bfqq->next_rq == rq) {
7985 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
7986 ++ bfq_updated_next_req(bfqd, bfqq);
7987 ++ }
7988 ++
7989 ++ list_del_init(&rq->queuelist);
7990 ++ bfq_del_rq_rb(rq);
7991 ++
7992 ++ if (rq->cmd_flags & REQ_META) {
7993 ++ WARN_ON(bfqq->meta_pending == 0);
7994 ++ bfqq->meta_pending--;
7995 ++ }
7996 ++}
7997 ++
7998 ++static int bfq_merge(struct request_queue *q, struct request **req,
7999 ++ struct bio *bio)
8000 ++{
8001 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8002 ++ struct request *__rq;
8003 ++
8004 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
8005 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
8006 ++ *req = __rq;
8007 ++ return ELEVATOR_FRONT_MERGE;
8008 ++ }
8009 ++
8010 ++ return ELEVATOR_NO_MERGE;
8011 ++}
8012 ++
8013 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
8014 ++ int type)
8015 ++{
8016 ++ if (type == ELEVATOR_FRONT_MERGE) {
8017 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
8018 ++
8019 ++ bfq_reposition_rq_rb(bfqq, req);
8020 ++ }
8021 ++}
8022 ++
8023 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
8024 ++ struct request *next)
8025 ++{
8026 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8027 ++
8028 ++ /*
8029 ++ * Reposition in fifo if next is older than rq.
8030 ++ */
8031 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
8032 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
8033 ++ list_move(&rq->queuelist, &next->queuelist);
8034 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
8035 ++ }
8036 ++
8037 ++ if (bfqq->next_rq == next)
8038 ++ bfqq->next_rq = rq;
8039 ++
8040 ++ bfq_remove_request(next);
8041 ++}
8042 ++
8043 ++/* Must be called with bfqq != NULL */
8044 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
8045 ++{
8046 ++ BUG_ON(bfqq == NULL);
8047 ++ if (bfq_bfqq_busy(bfqq))
8048 ++ bfqq->bfqd->raised_busy_queues--;
8049 ++ bfqq->raising_coeff = 1;
8050 ++ bfqq->raising_cur_max_time = 0;
8051 ++ /* Trigger a weight change on the next activation of the queue */
8052 ++ bfqq->entity.ioprio_changed = 1;
8053 ++}
8054 ++
8055 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
8056 ++ struct bfq_group *bfqg)
8057 ++{
8058 ++ int i, j;
8059 ++
8060 ++ for (i = 0; i < 2; i++)
8061 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
8062 ++ if (bfqg->async_bfqq[i][j] != NULL)
8063 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
8064 ++ if (bfqg->async_idle_bfqq != NULL)
8065 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
8066 ++}
8067 ++
8068 ++static void bfq_end_raising(struct bfq_data *bfqd)
8069 ++{
8070 ++ struct bfq_queue *bfqq;
8071 ++
8072 ++ spin_lock_irq(bfqd->queue->queue_lock);
8073 ++
8074 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
8075 ++ bfq_bfqq_end_raising(bfqq);
8076 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
8077 ++ bfq_bfqq_end_raising(bfqq);
8078 ++ bfq_end_raising_async(bfqd);
8079 ++
8080 ++ spin_unlock_irq(bfqd->queue->queue_lock);
8081 ++}
8082 ++
8083 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
8084 ++ struct bio *bio)
8085 ++{
8086 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8087 ++ struct bfq_io_cq *bic;
8088 ++ struct bfq_queue *bfqq;
8089 ++
8090 ++ /*
8091 ++ * Disallow merge of a sync bio into an async request.
8092 ++ */
8093 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
8094 ++ return 0;
8095 ++
8096 ++ /*
8097 ++ * Lookup the bfqq that this bio will be queued with. Allow
8098 ++ * merge only if rq is queued there.
8099 ++ * Queue lock is held here.
8100 ++ */
8101 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
8102 ++ if (bic == NULL)
8103 ++ return 0;
8104 ++
8105 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8106 ++ return bfqq == RQ_BFQQ(rq);
8107 ++}
8108 ++
8109 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
8110 ++ struct bfq_queue *bfqq)
8111 ++{
8112 ++ if (bfqq != NULL) {
8113 ++ bfq_mark_bfqq_must_alloc(bfqq);
8114 ++ bfq_mark_bfqq_budget_new(bfqq);
8115 ++ bfq_clear_bfqq_fifo_expire(bfqq);
8116 ++
8117 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
8118 ++
8119 ++ bfq_log_bfqq(bfqd, bfqq,
8120 ++ "set_in_service_queue, cur-budget = %lu",
8121 ++ bfqq->entity.budget);
8122 ++ }
8123 ++
8124 ++ bfqd->in_service_queue = bfqq;
8125 ++}
8126 ++
8127 ++/*
8128 ++ * Get and set a new queue for service.
8129 ++ */
8130 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
8131 ++ struct bfq_queue *bfqq)
8132 ++{
8133 ++ if (!bfqq)
8134 ++ bfqq = bfq_get_next_queue(bfqd);
8135 ++ else
8136 ++ bfq_get_next_queue_forced(bfqd, bfqq);
8137 ++
8138 ++ __bfq_set_in_service_queue(bfqd, bfqq);
8139 ++ return bfqq;
8140 ++}
8141 ++
8142 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
8143 ++ struct request *rq)
8144 ++{
8145 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
8146 ++ return blk_rq_pos(rq) - bfqd->last_position;
8147 ++ else
8148 ++ return bfqd->last_position - blk_rq_pos(rq);
8149 ++}
8150 ++
8151 ++/*
8152 ++ * Return true if bfqq has no request pending and rq is close enough to
8153 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
8154 ++ * bfqq->next_rq
8155 ++ */
8156 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
8157 ++{
8158 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
8159 ++}
8160 ++
8161 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
8162 ++{
8163 ++ struct rb_root *root = &bfqd->rq_pos_tree;
8164 ++ struct rb_node *parent, *node;
8165 ++ struct bfq_queue *__bfqq;
8166 ++ sector_t sector = bfqd->last_position;
8167 ++
8168 ++ if (RB_EMPTY_ROOT(root))
8169 ++ return NULL;
8170 ++
8171 ++ /*
8172 ++ * First, if we find a request starting at the end of the last
8173 ++ * request, choose it.
8174 ++ */
8175 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8176 ++ if (__bfqq != NULL)
8177 ++ return __bfqq;
8178 ++
8179 ++ /*
8180 ++ * If the exact sector wasn't found, the parent of the NULL leaf
8181 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
8182 ++ * position).
8183 ++ */
8184 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8185 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8186 ++ return __bfqq;
8187 ++
8188 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
8189 ++ node = rb_next(&__bfqq->pos_node);
8190 ++ else
8191 ++ node = rb_prev(&__bfqq->pos_node);
8192 ++ if (node == NULL)
8193 ++ return NULL;
8194 ++
8195 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8196 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8197 ++ return __bfqq;
8198 ++
8199 ++ return NULL;
8200 ++}
8201 ++
8202 ++/*
8203 ++ * bfqd - obvious
8204 ++ * cur_bfqq - passed in so that we don't decide that the current queue
8205 ++ * is closely cooperating with itself.
8206 ++ *
8207 ++ * We are assuming that cur_bfqq has dispatched at least one request,
8208 ++ * and that bfqd->last_position reflects a position on the disk associated
8209 ++ * with the I/O issued by cur_bfqq.
8210 ++ */
8211 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
8212 ++ struct bfq_queue *cur_bfqq)
8213 ++{
8214 ++ struct bfq_queue *bfqq;
8215 ++
8216 ++ if (bfq_class_idle(cur_bfqq))
8217 ++ return NULL;
8218 ++ if (!bfq_bfqq_sync(cur_bfqq))
8219 ++ return NULL;
8220 ++ if (BFQQ_SEEKY(cur_bfqq))
8221 ++ return NULL;
8222 ++
8223 ++ /* If device has only one backlogged bfq_queue, don't search. */
8224 ++ if (bfqd->busy_queues == 1)
8225 ++ return NULL;
8226 ++
8227 ++ /*
8228 ++ * We should notice if some of the queues are cooperating, e.g.
8229 ++ * working closely on the same area of the disk. In that case,
8230 ++ * we can group them together and not waste time idling.
8231 ++ */
8232 ++ bfqq = bfqq_close(bfqd);
8233 ++ if (bfqq == NULL || bfqq == cur_bfqq)
8234 ++ return NULL;
8235 ++
8236 ++ /*
8237 ++ * Do not merge queues from different bfq_groups.
8238 ++ */
8239 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
8240 ++ return NULL;
8241 ++
8242 ++ /*
8243 ++ * It only makes sense to merge sync queues.
8244 ++ */
8245 ++ if (!bfq_bfqq_sync(bfqq))
8246 ++ return NULL;
8247 ++ if (BFQQ_SEEKY(bfqq))
8248 ++ return NULL;
8249 ++
8250 ++ /*
8251 ++ * Do not merge queues of different priority classes.
8252 ++ */
8253 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
8254 ++ return NULL;
8255 ++
8256 ++ return bfqq;
8257 ++}
8258 ++
8259 ++/*
8260 ++ * If enough samples have been computed, return the current max budget
8261 ++ * stored in bfqd, which is dynamically updated according to the
8262 ++ * estimated disk peak rate; otherwise return the default max budget
8263 ++ */
8264 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
8265 ++{
8266 ++ if (bfqd->budgets_assigned < 194)
8267 ++ return bfq_default_max_budget;
8268 ++ else
8269 ++ return bfqd->bfq_max_budget;
8270 ++}
8271 ++
8272 ++/*
8273 ++ * Return min budget, which is a fraction of the current or default
8274 ++ * max budget (trying with 1/32)
8275 ++ */
8276 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
8277 ++{
8278 ++ if (bfqd->budgets_assigned < 194)
8279 ++ return bfq_default_max_budget / 32;
8280 ++ else
8281 ++ return bfqd->bfq_max_budget / 32;
8282 ++}
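
The two helpers above can be exercised in isolation. A minimal user-space sketch, assuming the same 194-sample threshold and 1/32 fraction as the patch; the struct, helper names and the default value below are illustrative stand-ins, not the patch's own definitions.

#include <stdio.h>

/* Hypothetical stand-ins for the scheduler state (illustration only). */
static const unsigned long default_max_budget = 16 * 1024; /* assumed value */

struct fake_bfqd {
        int budgets_assigned;          /* how many budgets have been handed out */
        unsigned long bfq_max_budget;  /* autotuned estimate */
};

/* Until ~194 budgets have been assigned, fall back to the default. */
static unsigned long max_budget(const struct fake_bfqd *d)
{
        return d->budgets_assigned < 194 ? default_max_budget
                                         : d->bfq_max_budget;
}

/* The minimum budget is 1/32 of whichever maximum is in effect. */
static unsigned long min_budget(const struct fake_bfqd *d)
{
        return max_budget(d) / 32;
}

int main(void)
{
        struct fake_bfqd d = { .budgets_assigned = 200, .bfq_max_budget = 32768 };

        printf("max=%lu min=%lu\n", max_budget(&d), min_budget(&d));
        return 0;
}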
8283 ++
8284 ++/*
8285 ++ * Decides whether idling should be done for given device and
8286 ++ * given in-service queue.
8287 ++ */
8288 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
8289 ++ struct bfq_queue *in_service_bfqq)
8290 ++{
8291 ++ if (in_service_bfqq == NULL)
8292 ++ return false;
8293 ++ /*
8294 ++ * If the device is non-rotational, and hence has no seek penalty,
8295 ++ * disable idling; but do so only if:
8296 ++ * - device does not support queuing, otherwise we still have
8297 ++ * a problem with sync vs async workloads;
8298 ++ * - the queue is not weight-raised, to preserve guarantees.
8299 ++ */
8300 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
8301 ++ in_service_bfqq->raising_coeff == 1);
8302 ++}
8303 ++
8304 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
8305 ++{
8306 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
8307 ++ struct bfq_io_cq *bic;
8308 ++ unsigned long sl;
8309 ++
8310 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
8311 ++
8312 ++ /* Tasks have exited, don't wait. */
8313 ++ bic = bfqd->in_service_bic;
8314 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
8315 ++ return;
8316 ++
8317 ++ bfq_mark_bfqq_wait_request(bfqq);
8318 ++
8319 ++ /*
8320 ++ * We don't want to idle for seeks, but we do want to allow
8321 ++ * fair distribution of slice time for a process doing back-to-back
8322 ++ * seeks. So allow a little bit of time for it to submit a new rq.
8323 ++ *
8324 ++ * To prevent processes with (partly) seeky workloads from
8325 ++ * being too ill-treated, grant them a small fraction of the
8326 ++ * assigned budget before reducing the waiting time to
8327 ++ * BFQ_MIN_TT. This happened to help reduce latency.
8328 ++ */
8329 ++ sl = bfqd->bfq_slice_idle;
8330 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
8331 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
8332 ++ bfqq->raising_coeff == 1)
8333 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
8334 ++ else if (bfqq->raising_coeff > 1)
8335 ++ sl = sl * 3;
8336 ++ bfqd->last_idling_start = ktime_get();
8337 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
8338 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
8339 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
8340 ++}
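
The slice-length adjustment in bfq_arm_slice_timer() boils down to a small piece of arithmetic. A sketch in milliseconds, assuming BFQ_MIN_TT corresponds to 2 ms; all parameter names below are hypothetical.

#include <stdio.h>

#define BFQ_MIN_TT_MS 2 /* assumed to match BFQ_MIN_TT in the patch */

/*
 * Millisecond version of the idle-slice computation: seeky, non-raised
 * queues that already consumed more than 1/8 of the max budget only get
 * the minimum think time, while weight-raised queues get a slice three
 * times as long.
 */
static unsigned int idle_slice_ms(unsigned int slice_idle_ms,
                                  int seeky, unsigned long service,
                                  unsigned long max_budget,
                                  unsigned int raising_coeff)
{
        unsigned int sl = slice_idle_ms;

        if (seeky && service > max_budget / 8 && raising_coeff == 1)
                sl = sl < BFQ_MIN_TT_MS ? sl : BFQ_MIN_TT_MS;
        else if (raising_coeff > 1)
                sl = sl * 3;
        return sl;
}

int main(void)
{
        printf("%u\n", idle_slice_ms(8, 1, 4096, 16384, 1));  /* -> 2  */
        printf("%u\n", idle_slice_ms(8, 0, 4096, 16384, 10)); /* -> 24 */
        return 0;
}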
8341 ++
8342 ++/*
8343 ++ * Set the maximum time for the in-service queue to consume its
8344 ++ * budget. This prevents seeky processes from lowering the disk
8345 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
8346 ++ */
8347 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
8348 ++{
8349 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
8350 ++ unsigned int timeout_coeff;
8351 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
8352 ++ timeout_coeff = 1;
8353 ++ else
8354 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
8355 ++
8356 ++ bfqd->last_budget_start = ktime_get();
8357 ++
8358 ++ bfq_clear_bfqq_budget_new(bfqq);
8359 ++ bfqq->budget_timeout = jiffies +
8360 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
8361 ++
8362 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
8363 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
8364 ++ timeout_coeff));
8365 ++}
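
A small worked example of the timeout scaling above: outside the soft-rt raising period, the budget timeout is stretched by weight / orig_weight (i.e. by the raising coefficient). Values and names are purely illustrative.

#include <stdio.h>

/*
 * Illustrative recomputation of the budget timeout set in
 * bfq_set_budget_timeout(): sync timeout (in jiffies) multiplied by
 * weight / orig_weight unless the queue is in its soft-rt raising period.
 */
static unsigned long budget_timeout(unsigned long now_jiffies,
                                    unsigned long sync_timeout_jiffies,
                                    unsigned long weight,
                                    unsigned long orig_weight,
                                    int in_soft_rt_raising)
{
        unsigned long coeff = in_soft_rt_raising ? 1 : weight / orig_weight;

        return now_jiffies + sync_timeout_jiffies * coeff;
}

int main(void)
{
        /* e.g. a raising coefficient of 20 -> weight is 20x the original */
        printf("%lu\n", budget_timeout(1000, 125, 2000, 100, 0)); /* 3500 */
        printf("%lu\n", budget_timeout(1000, 125, 2000, 100, 1)); /* 1125 */
        return 0;
}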
8366 ++
8367 ++/*
8368 ++ * Move request from internal lists to the request queue dispatch list.
8369 ++ */
8370 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
8371 ++{
8372 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8373 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8374 ++
8375 ++ bfq_remove_request(rq);
8376 ++ bfqq->dispatched++;
8377 ++ elv_dispatch_sort(q, rq);
8378 ++
8379 ++ if (bfq_bfqq_sync(bfqq))
8380 ++ bfqd->sync_flight++;
8381 ++}
8382 ++
8383 ++/*
8384 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
8385 ++ */
8386 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
8387 ++{
8388 ++ struct request *rq = NULL;
8389 ++
8390 ++ if (bfq_bfqq_fifo_expire(bfqq))
8391 ++ return NULL;
8392 ++
8393 ++ bfq_mark_bfqq_fifo_expire(bfqq);
8394 ++
8395 ++ if (list_empty(&bfqq->fifo))
8396 ++ return NULL;
8397 ++
8398 ++ rq = rq_entry_fifo(bfqq->fifo.next);
8399 ++
8400 ++ if (time_before(jiffies, rq_fifo_time(rq)))
8401 ++ return NULL;
8402 ++
8403 ++ return rq;
8404 ++}
8405 ++
8406 ++/*
8407 ++ * Must be called with the queue_lock held.
8408 ++ */
8409 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
8410 ++{
8411 ++ int process_refs, io_refs;
8412 ++
8413 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
8414 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
8415 ++ BUG_ON(process_refs < 0);
8416 ++ return process_refs;
8417 ++}
8418 ++
8419 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
8420 ++{
8421 ++ int process_refs, new_process_refs;
8422 ++ struct bfq_queue *__bfqq;
8423 ++
8424 ++ /*
8425 ++ * If there are no process references on the new_bfqq, then it is
8426 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
8427 ++ * may have dropped their last reference (not just their last process
8428 ++ * reference).
8429 ++ */
8430 ++ if (!bfqq_process_refs(new_bfqq))
8431 ++ return;
8432 ++
8433 ++ /* Avoid a circular list and skip interim queue merges. */
8434 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
8435 ++ if (__bfqq == bfqq)
8436 ++ return;
8437 ++ new_bfqq = __bfqq;
8438 ++ }
8439 ++
8440 ++ process_refs = bfqq_process_refs(bfqq);
8441 ++ new_process_refs = bfqq_process_refs(new_bfqq);
8442 ++ /*
8443 ++ * If the process for the bfqq has gone away, there is no
8444 ++ * sense in merging the queues.
8445 ++ */
8446 ++ if (process_refs == 0 || new_process_refs == 0)
8447 ++ return;
8448 ++
8449 ++ /*
8450 ++ * Merge in the direction of the lesser amount of work.
8451 ++ */
8452 ++ if (new_process_refs >= process_refs) {
8453 ++ bfqq->new_bfqq = new_bfqq;
8454 ++ atomic_add(process_refs, &new_bfqq->ref);
8455 ++ } else {
8456 ++ new_bfqq->new_bfqq = bfqq;
8457 ++ atomic_add(new_process_refs, &bfqq->ref);
8458 ++ }
8459 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
8460 ++ new_bfqq->pid);
8461 ++}
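
The merge-direction rule above ("merge in the direction of the lesser amount of work") can be sketched with a toy queue type; the struct below is a hypothetical stand-in for the reference counting of struct bfq_queue.

#include <stdio.h>

struct fake_queue {
        int process_refs;              /* refs held by processes only */
        struct fake_queue *new_queue;  /* merge target, as bfqq->new_bfqq */
};

/* Point the queue with fewer process references at the other one. */
static void setup_merge(struct fake_queue *a, struct fake_queue *b)
{
        if (a->process_refs == 0 || b->process_refs == 0)
                return; /* one side has gone away: merging makes no sense */

        if (b->process_refs >= a->process_refs)
                a->new_queue = b;
        else
                b->new_queue = a;
}

int main(void)
{
        struct fake_queue a = { 1, NULL }, b = { 3, NULL };

        setup_merge(&a, &b);
        printf("a merges into b: %s\n", a.new_queue == &b ? "yes" : "no");
        return 0;
}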
8462 ++
8463 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
8464 ++{
8465 ++ struct bfq_entity *entity = &bfqq->entity;
8466 ++ return entity->budget - entity->service;
8467 ++}
8468 ++
8469 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8470 ++{
8471 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8472 ++
8473 ++ __bfq_bfqd_reset_in_service(bfqd);
8474 ++
8475 ++ /*
8476 ++ * If this bfqq is shared between multiple processes, check
8477 ++ * to make sure that those processes are still issuing I/Os
8478 ++ * within the mean seek distance. If not, it may be time to
8479 ++ * break the queues apart again.
8480 ++ */
8481 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
8482 ++ bfq_mark_bfqq_split_coop(bfqq);
8483 ++
8484 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8485 ++ /*
8486 ++ * overloading the budget_timeout field to store the
8487 ++ * time at which the queue is left with no backlog; used by
8488 ++ * the weight-raising mechanism
8489 ++ */
8490 ++ bfqq->budget_timeout = jiffies;
8491 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8492 ++ } else {
8493 ++ bfq_activate_bfqq(bfqd, bfqq);
8494 ++ /*
8495 ++ * Resort priority tree of potential close cooperators.
8496 ++ */
8497 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
8498 ++ }
8499 ++}
8500 ++
8501 ++/**
8502 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
8503 ++ * @bfqd: device data.
8504 ++ * @bfqq: queue to update.
8505 ++ * @reason: reason for expiration.
8506 ++ *
8507 ++ * Handle the feedback on @bfqq budget. See the body for detailed
8508 ++ * comments.
8509 ++ */
8510 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
8511 ++ struct bfq_queue *bfqq,
8512 ++ enum bfqq_expiration reason)
8513 ++{
8514 ++ struct request *next_rq;
8515 ++ unsigned long budget, min_budget;
8516 ++
8517 ++ budget = bfqq->max_budget;
8518 ++ min_budget = bfq_min_budget(bfqd);
8519 ++
8520 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8521 ++
8522 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
8523 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
8524 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
8525 ++ budget, bfq_min_budget(bfqd));
8526 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
8527 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
8528 ++
8529 ++ if (bfq_bfqq_sync(bfqq)) {
8530 ++ switch (reason) {
8531 ++ /*
8532 ++ * Caveat: in all the following cases we trade latency
8533 ++ * for throughput.
8534 ++ */
8535 ++ case BFQ_BFQQ_TOO_IDLE:
8536 ++ /*
8537 ++ * This is the only case where we may reduce
8538 ++ * the budget: if there are no requests of the
8539 ++ * process still waiting for completion, then
8540 ++ * we assume (tentatively) that the timer has
8541 ++ * expired because the batch of requests of
8542 ++ * the process could have been served with a
8543 ++ * smaller budget. Hence, betting that the
8544 ++ * process will behave in the same way when it
8545 ++ * becomes backlogged again, we reduce its
8546 ++ * next budget. As long as we guess right,
8547 ++ * this budget cut reduces the latency
8548 ++ * experienced by the process.
8549 ++ *
8550 ++ * However, if there are still outstanding
8551 ++ * requests, then the process may have not yet
8552 ++ * issued its next request just because it is
8553 ++ * still waiting for the completion of some of
8554 ++ * the still outstanding ones. So in this
8555 ++ * subcase we do not reduce its budget, on the
8556 ++ * contrary we increase it to possibly boost
8557 ++ * the throughput, as discussed in the
8558 ++ * comments to the BUDGET_TIMEOUT case.
8559 ++ */
8560 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
8561 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
8562 ++ else {
8563 ++ if (budget > 5 * min_budget)
8564 ++ budget -= 4 * min_budget;
8565 ++ else
8566 ++ budget = min_budget;
8567 ++ }
8568 ++ break;
8569 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
8570 ++ /*
8571 ++ * We double the budget here because: 1) it
8572 ++ * gives the chance to boost the throughput if
8573 ++ * this is not a seeky process (which may have
8574 ++ * bumped into this timeout because of, e.g.,
8575 ++ * ZBR), 2) together with charge_full_budget
8576 ++ * it helps give seeky processes higher
8577 ++ * timestamps, and hence be served less
8578 ++ * frequently.
8579 ++ */
8580 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
8581 ++ break;
8582 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
8583 ++ /*
8584 ++ * The process still has backlog, and did not
8585 ++ * let either the budget timeout or the disk
8586 ++ * idling timeout expire. Hence it is not
8587 ++ * seeky, has a short thinktime and may be
8588 ++ * happy with a higher budget too. So
8589 ++ * definitely increase the budget of this good
8590 ++ * candidate to boost the disk throughput.
8591 ++ */
8592 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
8593 ++ break;
8594 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
8595 ++ /*
8596 ++ * Leave the budget unchanged.
8597 ++ */
8598 ++ default:
8599 ++ return;
8600 ++ }
8601 ++ } else /* async queue */
8602 ++ /* async queues always get the maximum possible budget
8603 ++ * (their ability to dispatch is limited by
8604 ++ * @bfqd->bfq_max_budget_async_rq).
8605 ++ */
8606 ++ budget = bfqd->bfq_max_budget;
8607 ++
8608 ++ bfqq->max_budget = budget;
8609 ++
8610 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
8611 ++ bfqq->max_budget > bfqd->bfq_max_budget)
8612 ++ bfqq->max_budget = bfqd->bfq_max_budget;
8613 ++
8614 ++ /*
8615 ++ * Make sure that we have enough budget for the next request.
8616 ++ * Since the finish time of the bfqq must be kept in sync with
8617 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
8618 ++ * update.
8619 ++ */
8620 ++ next_rq = bfqq->next_rq;
8621 ++ if (next_rq != NULL)
8622 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
8623 ++ bfq_serv_to_charge(next_rq, bfqq));
8624 ++ else
8625 ++ bfqq->entity.budget = bfqq->max_budget;
8626 ++
8627 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
8628 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
8629 ++ bfqq->entity.budget);
8630 ++}
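
The budget feedback cases above reduce to a few arithmetic rules. An illustrative stand-alone reformulation, mirroring the 2x/4x growth and the 4*min_budget shrink of the sync-queue cases; the enum and function below are sketches, not the patch's definitions.

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

/*
 * Sync-queue budget feedback: shrink the budget when the queue idled
 * with nothing in flight, double it on a budget timeout (or when it
 * idled with requests still outstanding), quadruple it when the budget
 * was exhausted, leave it alone otherwise.
 */
static unsigned long next_budget(unsigned long budget, unsigned long min_b,
                                 unsigned long max_b, int dispatched,
                                 enum reason r)
{
        switch (r) {
        case TOO_IDLE:
                if (dispatched > 0)
                        budget = budget * 2 < max_b ? budget * 2 : max_b;
                else if (budget > 5 * min_b)
                        budget -= 4 * min_b;
                else
                        budget = min_b;
                break;
        case BUDGET_TIMEOUT:
                budget = budget * 2 < max_b ? budget * 2 : max_b;
                break;
        case BUDGET_EXHAUSTED:
                budget = budget * 4 < max_b ? budget * 4 : max_b;
                break;
        case NO_MORE_REQUESTS:
                break;
        }
        return budget;
}

int main(void)
{
        printf("%lu\n", next_budget(4096, 512, 16384, 0, TOO_IDLE));         /* 2048  */
        printf("%lu\n", next_budget(4096, 512, 16384, 0, BUDGET_EXHAUSTED)); /* 16384 */
        return 0;
}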
8631 ++
8632 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
8633 ++{
8634 ++ unsigned long max_budget;
8635 ++
8636 ++ /*
8637 ++ * The max_budget calculated when autotuning is equal to the
8638 ++ * number of sectors transferred in timeout_sync at the
8639 ++ * estimated peak rate.
8640 ++ */
8641 ++ max_budget = (unsigned long)(peak_rate * 1000 *
8642 ++ timeout >> BFQ_RATE_SHIFT);
8643 ++
8644 ++ return max_budget;
8645 ++}
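
A worked example of the autotuned budget: the peak rate is stored in sectors per microsecond in fixed point, the timeout is in milliseconds, and BFQ_RATE_SHIFT is assumed to be 16 here (the value is defined elsewhere in the patch).

#include <stdio.h>

#define RATE_SHIFT 16 /* assumed to match BFQ_RATE_SHIFT in the patch */

/* sectors transferable in timeout_ms at peak_rate (fixed-point sect/us) */
static unsigned long calc_max_budget(unsigned long long peak_rate,
                                     unsigned long long timeout_ms)
{
        return (unsigned long)(peak_rate * 1000 * timeout_ms >> RATE_SHIFT);
}

int main(void)
{
        /*
         * Example: ~100 MB/s is roughly 0.2 sectors/us, i.e. about
         * 0.2 * 2^16 = 13107 in fixed point; with a 125 ms sync timeout
         * the autotuned budget is about 25000 sectors (~12 MB).
         */
        printf("%lu\n", calc_max_budget(13107, 125));
        return 0;
}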
8646 ++
8647 ++/*
8648 ++ * In addition to updating the peak rate, checks whether the process
8649 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
8650 ++ * to the budget timeout, to reduce the amount of service provided to
8651 ++ * seeky processes, and hence reduce their chances of lowering the
8652 ++ * throughput. See the code for more details.
8653 ++ */
8654 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8655 ++ int compensate, enum bfqq_expiration reason)
8656 ++{
8657 ++ u64 bw, usecs, expected, timeout;
8658 ++ ktime_t delta;
8659 ++ int update = 0;
8660 ++
8661 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
8662 ++ return 0;
8663 ++
8664 ++ if (compensate)
8665 ++ delta = bfqd->last_idling_start;
8666 ++ else
8667 ++ delta = ktime_get();
8668 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
8669 ++ usecs = ktime_to_us(delta);
8670 ++
8671 ++ /* Don't trust short/unrealistic values. */
8672 ++ if (usecs < 100 || usecs >= LONG_MAX)
8673 ++ return 0;
8674 ++
8675 ++ /*
8676 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
8677 ++ * value to store the peak rate, in sectors per usec in fixed
8678 ++ * point math. We do so to have enough precision in the estimate
8679 ++ * and to avoid overflows.
8680 ++ */
8681 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
8682 ++ do_div(bw, (unsigned long)usecs);
8683 ++
8684 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
8685 ++
8686 ++ /*
8687 ++ * Use only long (> 20ms) intervals to filter out spikes for
8688 ++ * the peak rate estimation.
8689 ++ */
8690 ++ if (usecs > 20000) {
8691 ++ if (bw > bfqd->peak_rate ||
8692 ++ (!BFQQ_SEEKY(bfqq) &&
8693 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
8694 ++ bfq_log(bfqd, "measured bw =%llu", bw);
8695 ++ /*
8696 ++ * To smooth oscillations use a low-pass filter with
8697 ++ * alpha=7/8, i.e.,
8698 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
8699 ++ */
8700 ++ do_div(bw, 8);
8701 ++ if (bw == 0)
8702 ++ return 0;
8703 ++ bfqd->peak_rate *= 7;
8704 ++ do_div(bfqd->peak_rate, 8);
8705 ++ bfqd->peak_rate += bw;
8706 ++ update = 1;
8707 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
8708 ++ }
8709 ++
8710 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
8711 ++
8712 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
8713 ++ bfqd->peak_rate_samples++;
8714 ++
8715 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
8716 ++ update && bfqd->bfq_user_max_budget == 0) {
8717 ++ bfqd->bfq_max_budget =
8718 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
8719 ++ bfq_log(bfqd, "new max_budget=%lu",
8720 ++ bfqd->bfq_max_budget);
8721 ++ }
8722 ++ }
8723 ++
8724 ++ /*
8725 ++ * If the process has been served for too short a time
8726 ++ * interval to let its possible sequential accesses prevail on
8727 ++ * the initial seek time needed to move the disk head on the
8728 ++ * first sector it requested, then give the process a chance
8729 ++ * and for the moment return false.
8730 ++ */
8731 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
8732 ++ return 0;
8733 ++
8734 ++ /*
8735 ++ * A process is considered ``slow'' (i.e., seeky, so that we
8736 ++ * cannot treat it fairly in the service domain, as it would
8737 ++ * slow down the other processes too much) if, when a slice
8738 ++ * ends for whatever reason, it has received service at a
8739 ++ * rate that would not be high enough to complete the budget
8740 ++ * before the budget timeout expiration.
8741 ++ */
8742 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
8743 ++
8744 ++ /*
8745 ++ * Caveat: processes doing IO in the slower disk zones will
8746 ++ * tend to be slow(er) even if not seeky. And the estimated
8747 ++ * peak rate will actually be an average over the disk
8748 ++ * surface. Hence, to not be too harsh with unlucky processes,
8749 ++ * we keep a budget/3 margin of safety before declaring a
8750 ++ * process slow.
8751 ++ */
8752 ++ return expected > (4 * bfqq->entity.budget) / 3;
8753 ++}
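
The 7/8 low-pass filter used above for the peak-rate estimate is easy to reproduce in isolation; a minimal integer-arithmetic sketch (the fixed-point rate unit is left abstract, the sample values are arbitrary).

#include <stdio.h>

/* new_rate = (7/8) * old_rate + (1/8) * sample, in pure integer math */
static unsigned long long lowpass_7_8(unsigned long long old_rate,
                                      unsigned long long sample)
{
        return old_rate * 7 / 8 + sample / 8;
}

int main(void)
{
        unsigned long long rate = 13107; /* some fixed-point rate */
        int i;

        /* Feed a few identical slower samples; the estimate converges slowly. */
        for (i = 0; i < 4; i++) {
                rate = lowpass_7_8(rate, 6553);
                printf("step %d: rate = %llu\n", i, rate);
        }
        return 0;
}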
8754 ++
8755 ++/*
8756 ++ * To be deemed as soft real-time, an application must meet two requirements.
8757 ++ * The first is that the application must not require an average bandwidth
8758 ++ * higher than the approximate bandwidth required to play back or record a
8759 ++ * compressed high-definition video.
8760 ++ * The next function is invoked on the completion of the last request of a
8761 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
8762 ++ * that, if the next request of the application does not arrive before
8763 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
8764 ++ *
8765 ++ * The second requirement is that the request pattern of the application is
8766 ++ * isochronous, i.e., that, after issuing a request or a batch of requests, the
8767 ++ * application stops for a while, then issues a new batch, and so on. For this
8768 ++ * reason the next function is invoked to compute soft_rt_next_start only for
8769 ++ * applications that meet this requirement, whereas soft_rt_next_start is set
8770 ++ * to infinity for applications that do not.
8771 ++ *
8772 ++ * Unfortunately, even a greedy application may happen to behave in an
8773 ++ * isochronous way if several processes are competing for the CPUs. In fact,
8774 ++ * in this scenario the application stops issuing requests while the CPUs are
8775 ++ * busy serving other processes, then restarts, then stops again for a while,
8776 ++ * and so on. In addition, if the disk achieves a low enough throughput with
8777 ++ * the request pattern issued by the application (e.g., because the request
8778 ++ * pattern is random and/or the device is slow), then the above bandwidth
8779 ++ * requirement may happen to be met too. To prevent such a greedy application
8780 ++ * from being deemed soft real-time, a further rule is used in the computation
8781 ++ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
8782 ++ * time plus the maximum time for which the arrival of a request is waited
8783 ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
8784 ++ * filters out greedy applications, as the latter issue instead their next
8785 ++ * request as soon as possible after the last one has been completed (in
8786 ++ * contrast, when a batch of requests is completed, a soft real-time
8787 ++ * application spends some time processing data).
8788 ++ *
8789 ++ * Actually, the last filter may easily generate false positives if: only
8790 ++ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
8791 ++ * both of the following two cases occur:
8792 ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
8793 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
8794 ++ * HZ=100.
8795 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
8796 ++ * for a while, then suddenly 'jump' by several units to recover the lost
8797 ++ * increments. This seems to happen, e.g., inside virtual machines.
8798 ++ * To address this issue, we do not use as a reference time interval just
8799 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
8800 ++ * particular we add the minimum number of jiffies for which the filter seems
8801 ++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
8802 ++ */
8803 ++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
8804 ++ struct bfq_queue *bfqq)
8805 ++{
8806 ++ return max(bfqq->last_idle_bklogged +
8807 ++ HZ * bfqq->service_from_backlogged /
8808 ++ bfqd->bfq_raising_max_softrt_rate,
8809 ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
8810 ++}
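
A sketch of the soft_rt_next_start computation above: the bandwidth bound pushes the next allowed arrival out by HZ * service / max_rate jiffies, and the greedy-application filter never lets it drop below "now + one idle slice + a few jiffies" (4 in the patch). HZ and all inputs below are assumed values for illustration.

#include <stdio.h>

#define HZ_ASSUMED 250 /* illustrative HZ value, not taken from the patch */

/*
 * Earliest instant at which a new request may arrive without pushing the
 * queue's average bandwidth above max_softrt_rate (sectors/sec), but
 * never earlier than a bit more than one idle slice from now.
 */
static unsigned long softrt_next_start(unsigned long now,
                                       unsigned long last_idle_bklogged,
                                       unsigned long service_from_backlogged,
                                       unsigned long max_softrt_rate,
                                       unsigned long slice_idle)
{
        unsigned long bw_bound = last_idle_bklogged +
                HZ_ASSUMED * service_from_backlogged / max_softrt_rate;
        unsigned long greedy_bound = now + slice_idle + 4;

        return bw_bound > greedy_bound ? bw_bound : greedy_bound;
}

int main(void)
{
        /* 7000 sectors served since last backlogged, 7000 sect/s allowed */
        printf("%lu\n", softrt_next_start(1000, 900, 7000, 7000, 2)); /* 1150 */
        return 0;
}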
8811 ++
8812 ++/*
8813 ++ * Largest-possible time instant such that, for as long as possible, the
8814 ++ * current time will be lower than this time instant according to the macro
8815 ++ * time_is_before_jiffies().
8816 ++ */
8817 ++static inline unsigned long bfq_infinity_from_now(unsigned long now)
8818 ++{
8819 ++ return now + ULONG_MAX / 2;
8820 ++}
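
Why now + ULONG_MAX / 2 acts as "infinity": jiffies comparisons are done with wrapping signed arithmetic, so a value half the range ahead stays in the future for as long as possible. A tiny sketch using a time_before()-style comparison (reimplemented here, not the kernel macro).

#include <stdio.h>
#include <limits.h>

/* Same idea as the kernel's time_before(): signed wraparound comparison. */
static int before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;
}

static unsigned long infinity_from_now(unsigned long now)
{
        return now + ULONG_MAX / 2;
}

int main(void)
{
        unsigned long now = 123456;
        unsigned long inf = infinity_from_now(now);

        /* "now" stays before "inf" for as long as wraparound math allows. */
        printf("%d %d\n", before(now, inf), before(now + 1000000, inf));
        return 0;
}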
8821 ++
8822 ++/**
8823 ++ * bfq_bfqq_expire - expire a queue.
8824 ++ * @bfqd: device owning the queue.
8825 ++ * @bfqq: the queue to expire.
8826 ++ * @compensate: if true, compensate for the time spent idling.
8827 ++ * @reason: the reason causing the expiration.
8828 ++ *
8829 ++ *
8830 ++ * If the process associated to the queue is slow (i.e., seeky), or in
8831 ++ * case of budget timeout, or, finally, if it is async, we
8832 ++ * artificially charge it an entire budget (independently of the
8833 ++ * actual service it received). As a consequence, the queue will get
8834 ++ * higher timestamps than the correct ones upon reactivation, and
8835 ++ * hence it will be rescheduled as if it had received more service
8836 ++ * than what it actually received. In the end, this class of processes
8837 ++ * will receive less service in proportion to how slowly they consume
8838 ++ * their budgets (and hence how seriously they tend to lower the
8839 ++ * throughput).
8840 ++ *
8841 ++ * In contrast, when a queue expires because it has been idling for
8842 ++ * too long or because it exhausted its budget, we do not touch the
8843 ++ * amount of service it has received. Hence, when the queue is
8844 ++ * reactivated and its timestamps are updated, the latter will be in sync
8845 ++ * with the actual service received by the queue until expiration.
8846 ++ *
8847 ++ * Charging a full budget to the first type of queues and the exact
8848 ++ * service to the others has the effect of using the WF2Q+ policy to
8849 ++ * schedule the former on a timeslice basis, without violating the
8850 ++ * service domain guarantees of the latter.
8851 ++ */
8852 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
8853 ++ struct bfq_queue *bfqq,
8854 ++ int compensate,
8855 ++ enum bfqq_expiration reason)
8856 ++{
8857 ++ int slow;
8858 ++ BUG_ON(bfqq != bfqd->in_service_queue);
8859 ++
8860 ++ /* Update disk peak rate for autotuning and check whether the
8861 ++ * process is slow (see bfq_update_peak_rate).
8862 ++ */
8863 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
8864 ++
8865 ++ /*
8866 ++ * As explained above, 'punish' slow (i.e., seeky), timed-out
8867 ++ * and async queues, to favor sequential sync workloads.
8868 ++ *
8869 ++ * Processes doing IO in the slower disk zones will tend to be
8870 ++ * slow(er) even if not seeky. Hence, since the estimated peak
8871 ++ * rate is actually an average over the disk surface, these
8872 ++ * processes may timeout just for bad luck. To avoid punishing
8873 ++ * them we do not charge a full budget to a process that
8874 ++ * succeeded in consuming at least 2/3 of its budget.
8875 ++ */
8876 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
8877 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
8878 ++ bfq_bfqq_charge_full_budget(bfqq);
8879 ++
8880 ++ bfqq->service_from_backlogged += bfqq->entity.service;
8881 ++
8882 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
8883 ++ bfqq->last_rais_start_finish = jiffies;
8884 ++
8885 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
8886 ++ RB_EMPTY_ROOT(&bfqq->sort_list)) {
8887 ++ /*
8888 ++ * If we get here, then the request pattern is
8889 ++ * isochronous (see the comments to the function
8890 ++ * bfq_bfqq_softrt_next_start()). However, if the
8891 ++ * queue still has in-flight requests, then it is
8892 ++ * better to postpone the computation of next_start
8893 ++ * to the next request completion. In fact, if we
8894 ++ * computed it now, then the application might pass
8895 ++ * the greedy-application filter improperly, because
8896 ++ * the arrival of its next request may happen to be
8897 ++ * later than (jiffies + bfqq->bfqd->bfq_slice_idle)
8898 ++ * not because the application is truly soft real-
8899 ++ * time, but just because the application is currently
8900 ++ * waiting for the completion of some request before
8901 ++ * issuing, as quickly as possible, its next request.
8902 ++ */
8903 ++ if (bfqq->dispatched > 0) {
8904 ++ /*
8905 ++ * The application is still waiting for the
8906 ++ * completion of one or more requests:
8907 ++ * prevent it from possibly being incorrectly
8908 ++ * deemed as soft real-time by setting its
8909 ++ * soft_rt_next_start to infinity. In fact,
8910 ++ * without this assignment, the application
8911 ++ * would be incorrectly deemed as soft
8912 ++ * real-time if:
8913 ++ * 1) it issued a new request before the
8914 ++ * completion of all its in-flight
8915 ++ * requests, and
8916 ++ * 2) at that time, its soft_rt_next_start
8917 ++ * happened to be in the past.
8918 ++ */
8919 ++ bfqq->soft_rt_next_start =
8920 ++ bfq_infinity_from_now(jiffies);
8921 ++ bfq_mark_bfqq_softrt_update(bfqq);
8922 ++ } else
8923 ++ bfqq->soft_rt_next_start =
8924 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
8925 ++ }
8926 ++
8927 ++ bfq_log_bfqq(bfqd, bfqq,
8928 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
8929 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
8930 ++
8931 ++ /* Increase, decrease or leave budget unchanged according to reason */
8932 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
8933 ++ __bfq_bfqq_expire(bfqd, bfqq);
8934 ++}
8935 ++
8936 ++/*
8937 ++ * Budget timeout is not implemented through a dedicated timer, but
8938 ++ * just checked on request arrivals and completions, as well as on
8939 ++ * idle timer expirations.
8940 ++ */
8941 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
8942 ++{
8943 ++ if (bfq_bfqq_budget_new(bfqq))
8944 ++ return 0;
8945 ++
8946 ++ if (time_before(jiffies, bfqq->budget_timeout))
8947 ++ return 0;
8948 ++
8949 ++ return 1;
8950 ++}
8951 ++
8952 ++/*
8953 ++ * If we expire a queue that is waiting for the arrival of a new
8954 ++ * request, we may prevent the fictitious timestamp backshifting that
8955 ++ * allows the guarantees of the queue to be preserved (see [1] for
8956 ++ * this tricky aspect). Hence we return true only if this condition
8957 ++ * does not hold, or if the queue is so slow that it deserves only to be
8958 ++ * kicked off, to preserve a high throughput.
8959 ++*/
8960 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
8961 ++{
8962 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
8963 ++ "may_budget_timeout: wr %d left %d timeout %d",
8964 ++ bfq_bfqq_wait_request(bfqq),
8965 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
8966 ++ bfq_bfqq_budget_timeout(bfqq));
8967 ++
8968 ++ return (!bfq_bfqq_wait_request(bfqq) ||
8969 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
8970 ++ &&
8971 ++ bfq_bfqq_budget_timeout(bfqq);
8972 ++}
8973 ++
8974 ++/*
8975 ++ * For weight-raised queues issuing sync requests, idling is always performed,
8976 ++ * as this is instrumental in guaranteeing a high fraction of the throughput
8977 ++ * to these queues, and hence in guaranteeing a lower latency for their
8978 ++ * requests. See [1] for details.
8979 ++ *
8980 ++ * For non-weight-raised queues, idling is instead disabled if the device is
8981 ++ * NCQ-enabled and non-rotational, as this boosts the throughput on such
8982 ++ * devices.
8983 ++ */
8984 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
8985 ++{
8986 ++ struct bfq_data *bfqd = bfqq->bfqd;
8987 ++
8988 ++ return bfq_bfqq_sync(bfqq) && (
8989 ++ bfqq->raising_coeff > 1 ||
8990 ++ (bfq_bfqq_idle_window(bfqq) &&
8991 ++ !(bfqd->hw_tag &&
8992 ++ (blk_queue_nonrot(bfqd->queue) ||
8993 ++ /*
8994 ++ * If there are weight-raised busy queues, then do not idle
8995 ++ * the disk for a sync non-weight-raised queue, and hence
8996 ++ * expire the queue immediately if empty. Combined with the
8997 ++ * timestamping rules of BFQ (see [1] for details), this
8998 ++ * causes sync non-weight-raised queues to get a lower
8999 ++ * fraction of the disk throughput, and hence reduces the rate
9000 ++ * at which the processes associated to these queues ask for
9001 ++ * requests from the request pool.
9002 ++ *
9003 ++ * This is beneficial for weight-raised processes, when the
9004 ++ * system operates in request-pool saturation conditions
9005 ++ * (e.g., in the presence of write hogs). In fact, if
9006 ++ * non-weight-raised processes ask for requests at a lower
9007 ++ * rate, then weight-raised processes have a higher
9008 ++ * probability to get a request from the pool immediately
9009 ++ * (or at least soon) when they need one. Hence they have a
9010 ++ * higher probability to actually get a fraction of the disk
9011 ++ * throughput proportional to their high weight. This is
9012 ++ * especially true with NCQ-enabled drives, which enqueue
9013 ++ * several requests in advance and further reorder
9014 ++ * internally-queued requests.
9015 ++ *
9016 ++ * Mistreating non-weight-raised queues in the above-described
9017 ++ * way, when there are busy weight-raised queues, seems to
9018 ++ * mitigate starvation problems in the presence of heavy write
9019 ++ * workloads and NCQ, and hence to guarantee a higher
9020 ++ * application and system responsiveness in these hostile
9021 ++ * scenarios.
9022 ++ */
9023 ++ bfqd->raised_busy_queues > 0)
9024 ++ )
9025 ++ )
9026 ++ );
9027 ++}
9028 ++
9029 ++/*
9030 ++ * If the in-service queue is empty, but it is sync and either of the following
9031 ++ * conditions holds, then: 1) the queue must remain in service and cannot be
9032 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
9033 ++ * of a new request for the queue. The conditions are:
9034 ++ * - the device is rotational and not performing NCQ, and the queue has its
9035 ++ * idle window set (in this case, waiting for a new request for the queue
9036 ++ * is likely to boost the disk throughput);
9037 ++ * - the queue is weight-raised (waiting for the request is necessary to
9038 ++ * provide the queue with fairness and latency guarantees, see [1] for
9039 ++ * details).
9040 ++ */
9041 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
9042 ++{
9043 ++ struct bfq_data *bfqd = bfqq->bfqd;
9044 ++
9045 ++ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
9046 ++ bfq_bfqq_must_not_expire(bfqq) &&
9047 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
9048 ++}
9049 ++
9050 ++/*
9051 ++ * Select a queue for service. If we have a current queue in service,
9052 ++ * check whether to continue servicing it, or retrieve and set a new one.
9053 ++ */
9054 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
9055 ++{
9056 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
9057 ++ struct request *next_rq;
9058 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9059 ++
9060 ++ bfqq = bfqd->in_service_queue;
9061 ++ if (bfqq == NULL)
9062 ++ goto new_queue;
9063 ++
9064 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
9065 ++
9066 ++ /*
9067 ++ * If another queue has a request waiting within our mean seek
9068 ++ * distance, let it run. The expire code will check for close
9069 ++ * cooperators and put the close queue at the front of the
9070 ++ * service tree. If possible, merge the expiring queue with the
9071 ++ * new bfqq.
9072 ++ */
9073 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
9074 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
9075 ++ bfq_setup_merge(bfqq, new_bfqq);
9076 ++
9077 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
9078 ++ !timer_pending(&bfqd->idle_slice_timer) &&
9079 ++ !bfq_bfqq_must_idle(bfqq))
9080 ++ goto expire;
9081 ++
9082 ++ next_rq = bfqq->next_rq;
9083 ++ /*
9084 ++ * If bfqq has requests queued and it has enough budget left to
9085 ++ * serve them, keep the queue, otherwise expire it.
9086 ++ */
9087 ++ if (next_rq != NULL) {
9088 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
9089 ++ bfq_bfqq_budget_left(bfqq)) {
9090 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
9091 ++ goto expire;
9092 ++ } else {
9093 ++ /*
9094 ++ * The idle timer may be pending because we may not
9095 ++ * disable disk idling even when a new request arrives
9096 ++ */
9097 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
9098 ++ /*
9099 ++ * If we get here: 1) at least one new request
9100 ++ * has arrived but we have not disabled the
9101 ++ * timer because the request was too small,
9102 ++ * 2) then the block layer has unplugged the
9103 ++ * device, causing the dispatch to be invoked.
9104 ++ *
9105 ++ * Since the device is unplugged, now the
9106 ++ * requests are probably large enough to
9107 ++ * provide a reasonable throughput.
9108 ++ * So we disable idling.
9109 ++ */
9110 ++ bfq_clear_bfqq_wait_request(bfqq);
9111 ++ del_timer(&bfqd->idle_slice_timer);
9112 ++ }
9113 ++ if (new_bfqq == NULL)
9114 ++ goto keep_queue;
9115 ++ else
9116 ++ goto expire;
9117 ++ }
9118 ++ }
9119 ++
9120 ++ /*
9121 ++ * No requests pending. If the in-service queue has no cooperator and
9122 ++ * still has requests in flight (possibly waiting for a completion)
9123 ++ * or is idling for a new request, then keep it.
9124 ++ */
9125 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
9126 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
9127 ++ bfqq = NULL;
9128 ++ goto keep_queue;
9129 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
9130 ++ /*
9131 ++ * Expiring the queue because there is a close cooperator,
9132 ++ * cancel timer.
9133 ++ */
9134 ++ bfq_clear_bfqq_wait_request(bfqq);
9135 ++ del_timer(&bfqd->idle_slice_timer);
9136 ++ }
9137 ++
9138 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
9139 ++expire:
9140 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
9141 ++new_queue:
9142 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
9143 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
9144 ++ bfqq != NULL ? bfqq->pid : 0);
9145 ++keep_queue:
9146 ++ return bfqq;
9147 ++}
9148 ++
9149 ++static void bfq_update_raising_data(struct bfq_data *bfqd,
9150 ++ struct bfq_queue *bfqq)
9151 ++{
9152 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
9153 ++ struct bfq_entity *entity = &bfqq->entity;
9154 ++
9155 ++ bfq_log_bfqq(bfqd, bfqq,
9156 ++ "raising period dur %u/%u msec, "
9157 ++ "old raising coeff %u, w %d(%d)",
9158 ++ jiffies_to_msecs(jiffies -
9159 ++ bfqq->last_rais_start_finish),
9160 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
9161 ++ bfqq->raising_coeff,
9162 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
9163 ++
9164 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
9165 ++ entity->orig_weight * bfqq->raising_coeff);
9166 ++ if (entity->ioprio_changed)
9167 ++ bfq_log_bfqq(bfqd, bfqq,
9168 ++ "WARN: pending prio change");
9169 ++ /*
9170 ++ * If too much time has elapsed from the beginning
9171 ++ * of this weight-raising, stop it.
9172 ++ */
9173 ++ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
9174 ++ bfqq->raising_cur_max_time)) {
9175 ++ bfqq->last_rais_start_finish = jiffies;
9176 ++ bfq_log_bfqq(bfqd, bfqq,
9177 ++ "wrais ending at %lu, "
9178 ++ "rais_max_time %u",
9179 ++ bfqq->last_rais_start_finish,
9180 ++ jiffies_to_msecs(bfqq->
9181 ++ raising_cur_max_time));
9182 ++ bfq_bfqq_end_raising(bfqq);
9183 ++ __bfq_entity_update_weight_prio(
9184 ++ bfq_entity_service_tree(entity),
9185 ++ entity);
9186 ++ }
9187 ++ }
9188 ++}
9189 ++
9190 ++/*
9191 ++ * Dispatch one request from bfqq, moving it to the request queue
9192 ++ * dispatch list.
9193 ++ */
9194 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
9195 ++ struct bfq_queue *bfqq)
9196 ++{
9197 ++ int dispatched = 0;
9198 ++ struct request *rq;
9199 ++ unsigned long service_to_charge;
9200 ++
9201 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
9202 ++
9203 ++ /* Follow expired path, else get first next available. */
9204 ++ rq = bfq_check_fifo(bfqq);
9205 ++ if (rq == NULL)
9206 ++ rq = bfqq->next_rq;
9207 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
9208 ++
9209 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
9210 ++ /*
9211 ++ * This may happen if the next rq is chosen
9212 ++ * in fifo order instead of sector order.
9213 ++ * The budget is properly dimensioned
9214 ++ * to be always sufficient to serve the next request
9215 ++ * only if it is chosen in sector order. The reason is
9216 ++ * that it would be quite inefficient and of little use
9217 ++ * to always make sure that the budget is large enough
9218 ++ * to serve even the possible next rq in fifo order.
9219 ++ * In fact, requests are seldom served in fifo order.
9220 ++ *
9221 ++ * Expire the queue for budget exhaustion, and
9222 ++ * make sure that the next act_budget is enough
9223 ++ * to serve the next request, even if it comes
9224 ++ * from the fifo expired path.
9225 ++ */
9226 ++ bfqq->next_rq = rq;
9227 ++ /*
9228 ++ * Since this dispatch failed, make sure that
9229 ++ * a new one will be performed
9230 ++ */
9231 ++ if (!bfqd->rq_in_driver)
9232 ++ bfq_schedule_dispatch(bfqd);
9233 ++ goto expire;
9234 ++ }
9235 ++
9236 ++ /* Finally, insert request into driver dispatch list. */
9237 ++ bfq_bfqq_served(bfqq, service_to_charge);
9238 ++ bfq_dispatch_insert(bfqd->queue, rq);
9239 ++
9240 ++ bfq_update_raising_data(bfqd, bfqq);
9241 ++
9242 ++ bfq_log_bfqq(bfqd, bfqq,
9243 ++ "dispatched %u sec req (%llu), budg left %lu",
9244 ++ blk_rq_sectors(rq),
9245 ++ (long long unsigned)blk_rq_pos(rq),
9246 ++ bfq_bfqq_budget_left(bfqq));
9247 ++
9248 ++ dispatched++;
9249 ++
9250 ++ if (bfqd->in_service_bic == NULL) {
9251 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9252 ++ bfqd->in_service_bic = RQ_BIC(rq);
9253 ++ }
9254 ++
9255 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9256 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9257 ++ bfq_class_idle(bfqq)))
9258 ++ goto expire;
9259 ++
9260 ++ return dispatched;
9261 ++
9262 ++expire:
9263 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9264 ++ return dispatched;
9265 ++}
9266 ++
9267 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9268 ++{
9269 ++ int dispatched = 0;
9270 ++
9271 ++ while (bfqq->next_rq != NULL) {
9272 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9273 ++ dispatched++;
9274 ++ }
9275 ++
9276 ++ BUG_ON(!list_empty(&bfqq->fifo));
9277 ++ return dispatched;
9278 ++}
9279 ++
9280 ++/*
9281 ++ * Drain our current requests. Used for barriers and when switching
9282 ++ * io schedulers on-the-fly.
9283 ++ */
9284 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
9285 ++{
9286 ++ struct bfq_queue *bfqq, *n;
9287 ++ struct bfq_service_tree *st;
9288 ++ int dispatched = 0;
9289 ++
9290 ++ bfqq = bfqd->in_service_queue;
9291 ++ if (bfqq != NULL)
9292 ++ __bfq_bfqq_expire(bfqd, bfqq);
9293 ++
9294 ++ /*
9295 ++ * Loop through classes, and be careful to leave the scheduler
9296 ++ * in a consistent state, as feedback mechanisms and vtime
9297 ++ * updates cannot be disabled during the process.
9298 ++ */
9299 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9300 ++ st = bfq_entity_service_tree(&bfqq->entity);
9301 ++
9302 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9303 ++ bfqq->max_budget = bfq_max_budget(bfqd);
9304 ++
9305 ++ bfq_forget_idle(st);
9306 ++ }
9307 ++
9308 ++ BUG_ON(bfqd->busy_queues != 0);
9309 ++
9310 ++ return dispatched;
9311 ++}
9312 ++
9313 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
9314 ++{
9315 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9316 ++ struct bfq_queue *bfqq;
9317 ++ int max_dispatch;
9318 ++
9319 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9320 ++ if (bfqd->busy_queues == 0)
9321 ++ return 0;
9322 ++
9323 ++ if (unlikely(force))
9324 ++ return bfq_forced_dispatch(bfqd);
9325 ++
9326 ++ bfqq = bfq_select_queue(bfqd);
9327 ++ if (bfqq == NULL)
9328 ++ return 0;
9329 ++
9330 ++ max_dispatch = bfqd->bfq_quantum;
9331 ++ if (bfq_class_idle(bfqq))
9332 ++ max_dispatch = 1;
9333 ++
9334 ++ if (!bfq_bfqq_sync(bfqq))
9335 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
9336 ++
9337 ++ if (bfqq->dispatched >= max_dispatch) {
9338 ++ if (bfqd->busy_queues > 1)
9339 ++ return 0;
9340 ++ if (bfqq->dispatched >= 4 * max_dispatch)
9341 ++ return 0;
9342 ++ }
9343 ++
9344 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
9345 ++ return 0;
9346 ++
9347 ++ bfq_clear_bfqq_wait_request(bfqq);
9348 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9349 ++
9350 ++ if (!bfq_dispatch_request(bfqd, bfqq))
9351 ++ return 0;
9352 ++
9353 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
9354 ++ bfqq->pid, max_dispatch);
9355 ++
9356 ++ return 1;
9357 ++}
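
The per-call dispatch cap applied above can be restated as a small predicate: the quantum for sync queues, one request for idle-class queues, max_budget_async_rq for async queues, with up to 4x over-dispatch allowed when only one queue is busy. The function below is an illustrative reformulation, not the patch's code.

#include <stdio.h>

/*
 * Returns 1 when one more request may be dispatched from the selected
 * queue this round, 0 otherwise.
 */
static int may_dispatch(int busy_queues, int is_sync, int is_idle_class,
                        int already_dispatched, int quantum,
                        int max_async_rq)
{
        int max_dispatch = quantum;

        if (is_idle_class)
                max_dispatch = 1;
        if (!is_sync)
                max_dispatch = max_async_rq;

        if (already_dispatched >= max_dispatch) {
                if (busy_queues > 1)
                        return 0;
                if (already_dispatched >= 4 * max_dispatch)
                        return 0;
        }
        return 1;
}

int main(void)
{
        /* a single busy sync queue may over-dispatch up to 4x the quantum */
        printf("%d\n", may_dispatch(1, 1, 0, 6, 4, 4)); /* 1 */
        printf("%d\n", may_dispatch(2, 1, 0, 6, 4, 4)); /* 0 */
        return 0;
}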
9358 ++
9359 ++/*
9360 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
9361 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
9362 ++ *
9363 ++ * Queue lock must be held here.
9364 ++ */
9365 ++static void bfq_put_queue(struct bfq_queue *bfqq)
9366 ++{
9367 ++ struct bfq_data *bfqd = bfqq->bfqd;
9368 ++
9369 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
9370 ++
9371 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
9372 ++ atomic_read(&bfqq->ref));
9373 ++ if (!atomic_dec_and_test(&bfqq->ref))
9374 ++ return;
9375 ++
9376 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
9377 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
9378 ++ BUG_ON(bfqq->entity.tree != NULL);
9379 ++ BUG_ON(bfq_bfqq_busy(bfqq));
9380 ++ BUG_ON(bfqd->in_service_queue == bfqq);
9381 ++
9382 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
9383 ++
9384 ++ kmem_cache_free(bfq_pool, bfqq);
9385 ++}
9386 ++
9387 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
9388 ++{
9389 ++ struct bfq_queue *__bfqq, *next;
9390 ++
9391 ++ /*
9392 ++ * If this queue was scheduled to merge with another queue, be
9393 ++ * sure to drop the reference taken on that queue (and others in
9394 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
9395 ++ */
9396 ++ __bfqq = bfqq->new_bfqq;
9397 ++ while (__bfqq) {
9398 ++ if (__bfqq == bfqq) {
9399 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
9400 ++ break;
9401 ++ }
9402 ++ next = __bfqq->new_bfqq;
9403 ++ bfq_put_queue(__bfqq);
9404 ++ __bfqq = next;
9405 ++ }
9406 ++}
9407 ++
9408 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9409 ++{
9410 ++ if (bfqq == bfqd->in_service_queue) {
9411 ++ __bfq_bfqq_expire(bfqd, bfqq);
9412 ++ bfq_schedule_dispatch(bfqd);
9413 ++ }
9414 ++
9415 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
9416 ++ atomic_read(&bfqq->ref));
9417 ++
9418 ++ bfq_put_cooperator(bfqq);
9419 ++
9420 ++ bfq_put_queue(bfqq);
9421 ++}
9422 ++
9423 ++static void bfq_init_icq(struct io_cq *icq)
9424 ++{
9425 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
9426 ++
9427 ++ bic->ttime.last_end_request = jiffies;
9428 ++}
9429 ++
9430 ++static void bfq_exit_icq(struct io_cq *icq)
9431 ++{
9432 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
9433 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
9434 ++
9435 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
9436 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
9437 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
9438 ++ }
9439 ++
9440 ++ if (bic->bfqq[BLK_RW_SYNC]) {
9441 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
9442 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
9443 ++ }
9444 ++}
9445 ++
9446 ++/*
9447 ++ * Update the entity prio values; note that the new values will not
9448 ++ * be used until the next (re)activation.
9449 ++ */
9450 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
9451 ++{
9452 ++ struct task_struct *tsk = current;
9453 ++ int ioprio_class;
9454 ++
9455 ++ if (!bfq_bfqq_prio_changed(bfqq))
9456 ++ return;
9457 ++
9458 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9459 ++ switch (ioprio_class) {
9460 ++ default:
9461 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
9462 ++ "bfq: bad prio %x\n", ioprio_class);
9463 ++ case IOPRIO_CLASS_NONE:
9464 ++ /*
9465 ++ * No prio set, inherit CPU scheduling settings.
9466 ++ */
9467 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
9468 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
9469 ++ break;
9470 ++ case IOPRIO_CLASS_RT:
9471 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9472 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
9473 ++ break;
9474 ++ case IOPRIO_CLASS_BE:
9475 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9476 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
9477 ++ break;
9478 ++ case IOPRIO_CLASS_IDLE:
9479 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
9480 ++ bfqq->entity.new_ioprio = 7;
9481 ++ bfq_clear_bfqq_idle_window(bfqq);
9482 ++ break;
9483 ++ }
9484 ++
9485 ++ bfqq->entity.ioprio_changed = 1;
9486 ++
9487 ++ /*
9488 ++ * Keep track of original prio settings in case we have to temporarily
9489 ++ * elevate the priority of this queue.
9490 ++ */
9491 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
9492 ++ bfq_clear_bfqq_prio_changed(bfqq);
9493 ++}
9494 ++
9495 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
9496 ++{
9497 ++ struct bfq_data *bfqd;
9498 ++ struct bfq_queue *bfqq, *new_bfqq;
9499 ++ struct bfq_group *bfqg;
9500 ++ unsigned long uninitialized_var(flags);
9501 ++ int ioprio = bic->icq.ioc->ioprio;
9502 ++
9503 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
9504 ++ &flags);
9505 ++ /*
9506 ++	 * This condition may trigger on a newly created bic; be sure to drop
9507 ++ * the lock before returning.
9508 ++ */
9509 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
9510 ++ goto out;
9511 ++
9512 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
9513 ++ if (bfqq != NULL) {
9514 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
9515 ++ sched_data);
9516 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
9517 ++ GFP_ATOMIC);
9518 ++ if (new_bfqq != NULL) {
9519 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
9520 ++ bfq_log_bfqq(bfqd, bfqq,
9521 ++ "changed_ioprio: bfqq %p %d",
9522 ++ bfqq, atomic_read(&bfqq->ref));
9523 ++ bfq_put_queue(bfqq);
9524 ++ }
9525 ++ }
9526 ++
9527 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
9528 ++ if (bfqq != NULL)
9529 ++ bfq_mark_bfqq_prio_changed(bfqq);
9530 ++
9531 ++ bic->ioprio = ioprio;
9532 ++
9533 ++out:
9534 ++ bfq_put_bfqd_unlock(bfqd, &flags);
9535 ++}
9536 ++
9537 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9538 ++ pid_t pid, int is_sync)
9539 ++{
9540 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
9541 ++ INIT_LIST_HEAD(&bfqq->fifo);
9542 ++
9543 ++ atomic_set(&bfqq->ref, 0);
9544 ++ bfqq->bfqd = bfqd;
9545 ++
9546 ++ bfq_mark_bfqq_prio_changed(bfqq);
9547 ++
9548 ++ if (is_sync) {
9549 ++ if (!bfq_class_idle(bfqq))
9550 ++ bfq_mark_bfqq_idle_window(bfqq);
9551 ++ bfq_mark_bfqq_sync(bfqq);
9552 ++ }
9553 ++
9554 ++	/* Tentative initial value to trade off between throughput and latency */
9555 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
9556 ++ bfqq->pid = pid;
9557 ++
9558 ++ bfqq->raising_coeff = 1;
9559 ++ bfqq->last_rais_start_finish = 0;
9560 ++ /*
9561 ++ * Set to the value for which bfqq will not be deemed as
9562 ++ * soft rt when it becomes backlogged.
9563 ++ */
9564 ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
9565 ++}
9566 ++
9567 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
9568 ++ struct bfq_group *bfqg,
9569 ++ int is_sync,
9570 ++ struct bfq_io_cq *bic,
9571 ++ gfp_t gfp_mask)
9572 ++{
9573 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
9574 ++
9575 ++retry:
9576 ++ /* bic always exists here */
9577 ++ bfqq = bic_to_bfqq(bic, is_sync);
9578 ++
9579 ++ /*
9580 ++	 * Always try a new alloc if we fell back to the OOM bfqq
9581 ++ * originally, since it should just be a temporary situation.
9582 ++ */
9583 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9584 ++ bfqq = NULL;
9585 ++ if (new_bfqq != NULL) {
9586 ++ bfqq = new_bfqq;
9587 ++ new_bfqq = NULL;
9588 ++ } else if (gfp_mask & __GFP_WAIT) {
9589 ++ spin_unlock_irq(bfqd->queue->queue_lock);
9590 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
9591 ++ gfp_mask | __GFP_ZERO,
9592 ++ bfqd->queue->node);
9593 ++ spin_lock_irq(bfqd->queue->queue_lock);
9594 ++ if (new_bfqq != NULL)
9595 ++ goto retry;
9596 ++ } else {
9597 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
9598 ++ gfp_mask | __GFP_ZERO,
9599 ++ bfqd->queue->node);
9600 ++ }
9601 ++
9602 ++ if (bfqq != NULL) {
9603 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
9604 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
9605 ++ } else {
9606 ++ bfqq = &bfqd->oom_bfqq;
9607 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
9608 ++ }
9609 ++
9610 ++ bfq_init_prio_data(bfqq, bic);
9611 ++ bfq_init_entity(&bfqq->entity, bfqg);
9612 ++ }
9613 ++
9614 ++ if (new_bfqq != NULL)
9615 ++ kmem_cache_free(bfq_pool, new_bfqq);
9616 ++
9617 ++ return bfqq;
9618 ++}
9619 ++
9620 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
9621 ++ struct bfq_group *bfqg,
9622 ++ int ioprio_class, int ioprio)
9623 ++{
9624 ++ switch (ioprio_class) {
9625 ++ case IOPRIO_CLASS_RT:
9626 ++ return &bfqg->async_bfqq[0][ioprio];
9627 ++ case IOPRIO_CLASS_NONE:
9628 ++ ioprio = IOPRIO_NORM;
9629 ++ /* fall through */
9630 ++ case IOPRIO_CLASS_BE:
9631 ++ return &bfqg->async_bfqq[1][ioprio];
9632 ++ case IOPRIO_CLASS_IDLE:
9633 ++ return &bfqg->async_idle_bfqq;
9634 ++ default:
9635 ++ BUG();
9636 ++ }
9637 ++}
9638 ++
9639 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
9640 ++ struct bfq_group *bfqg, int is_sync,
9641 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
9642 ++{
9643 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9644 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9645 ++ struct bfq_queue **async_bfqq = NULL;
9646 ++ struct bfq_queue *bfqq = NULL;
9647 ++
9648 ++ if (!is_sync) {
9649 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
9650 ++ ioprio);
9651 ++ bfqq = *async_bfqq;
9652 ++ }
9653 ++
9654 ++ if (bfqq == NULL)
9655 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9656 ++
9657 ++ /*
9658 ++	 * Pin the queue now that it's allocated; scheduler exit will prune it.
9659 ++ */
9660 ++ if (!is_sync && *async_bfqq == NULL) {
9661 ++ atomic_inc(&bfqq->ref);
9662 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
9663 ++ bfqq, atomic_read(&bfqq->ref));
9664 ++ *async_bfqq = bfqq;
9665 ++ }
9666 ++
9667 ++ atomic_inc(&bfqq->ref);
9668 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
9669 ++ atomic_read(&bfqq->ref));
9670 ++ return bfqq;
9671 ++}
9672 ++
9673 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
9674 ++ struct bfq_io_cq *bic)
9675 ++{
9676 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
9677 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
9678 ++
9679 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
9680 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
9681 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
9682 ++ bic->ttime.ttime_samples;
9683 ++}
9684 ++
9685 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
9686 ++ struct bfq_queue *bfqq,
9687 ++ struct request *rq)
9688 ++{
9689 ++ sector_t sdist;
9690 ++ u64 total;
9691 ++
9692 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
9693 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
9694 ++ else
9695 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
9696 ++
9697 ++ /*
9698 ++ * Don't allow the seek distance to get too large from the
9699 ++ * odd fragment, pagein, etc.
9700 ++ */
9701 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
9702 ++ sdist = 0;
9703 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
9704 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
9705 ++ else
9706 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
9707 ++
9708 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
9709 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
9710 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
9711 ++ do_div(total, bfqq->seek_samples);
9712 ++ bfqq->seek_mean = (sector_t)total;
9713 ++
9714 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
9715 ++ (u64)bfqq->seek_mean);
9716 ++}
9717 ++
9718 ++/*
9719 ++ * Disable idle window if the process thinks too long or seeks so much that
9720 ++ * it doesn't matter.
9721 ++ */
9722 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
9723 ++ struct bfq_queue *bfqq,
9724 ++ struct bfq_io_cq *bic)
9725 ++{
9726 ++ int enable_idle;
9727 ++
9728 ++ /* Don't idle for async or idle io prio class. */
9729 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
9730 ++ return;
9731 ++
9732 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
9733 ++
9734 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
9735 ++ bfqd->bfq_slice_idle == 0 ||
9736 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
9737 ++ bfqq->raising_coeff == 1))
9738 ++ enable_idle = 0;
9739 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
9740 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
9741 ++ bfqq->raising_coeff == 1)
9742 ++ enable_idle = 0;
9743 ++ else
9744 ++ enable_idle = 1;
9745 ++ }
9746 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
9747 ++ enable_idle);
9748 ++
9749 ++ if (enable_idle)
9750 ++ bfq_mark_bfqq_idle_window(bfqq);
9751 ++ else
9752 ++ bfq_clear_bfqq_idle_window(bfqq);
9753 ++}
9754 ++
9755 ++/*
9756 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
9757 ++ * something we should do about it.
9758 ++ */
9759 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9760 ++ struct request *rq)
9761 ++{
9762 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
9763 ++
9764 ++ if (rq->cmd_flags & REQ_META)
9765 ++ bfqq->meta_pending++;
9766 ++
9767 ++ bfq_update_io_thinktime(bfqd, bic);
9768 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
9769 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
9770 ++ !BFQQ_SEEKY(bfqq))
9771 ++ bfq_update_idle_window(bfqd, bfqq, bic);
9772 ++
9773 ++ bfq_log_bfqq(bfqd, bfqq,
9774 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
9775 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
9776 ++ (long long unsigned)bfqq->seek_mean);
9777 ++
9778 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
9779 ++
9780 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
9781 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
9782 ++ blk_rq_sectors(rq) < 32;
9783 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
9784 ++
9785 ++ /*
9786 ++ * There is just this request queued: if the request
9787 ++ * is small and the queue is not to be expired, then
9788 ++ * just exit.
9789 ++ *
9790 ++ * In this way, if the disk is being idled to wait for
9791 ++ * a new request from the in-service queue, we avoid
9792 ++ * unplugging the device and committing the disk to serve
9793 ++ * just a small request. On the contrary, we wait for
9794 ++ * the block layer to decide when to unplug the device:
9795 ++ * hopefully, new requests will be merged to this one
9796 ++ * quickly, then the device will be unplugged and
9797 ++ * larger requests will be dispatched.
9798 ++ */
9799 ++ if (small_req && !budget_timeout)
9800 ++ return;
9801 ++
9802 ++ /*
9803 ++ * A large enough request arrived, or the queue is to
9804 ++ * be expired: in both cases disk idling is to be
9805 ++ * stopped, so clear wait_request flag and reset
9806 ++ * timer.
9807 ++ */
9808 ++ bfq_clear_bfqq_wait_request(bfqq);
9809 ++ del_timer(&bfqd->idle_slice_timer);
9810 ++
9811 ++ /*
9812 ++ * The queue is not empty, because a new request just
9813 ++ * arrived. Hence we can safely expire the queue, in
9814 ++ * case of budget timeout, without risking that the
9815 ++ * timestamps of the queue are not updated correctly.
9816 ++ * See [1] for more details.
9817 ++ */
9818 ++ if (budget_timeout)
9819 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9820 ++
9821 ++ /*
9822 ++ * Let the request rip immediately, or let a new queue be
9823 ++ * selected if bfqq has just been expired.
9824 ++ */
9825 ++ __blk_run_queue(bfqd->queue);
9826 ++ }
9827 ++}
9828 ++
9829 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
9830 ++{
9831 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9832 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9833 ++
9834 ++ assert_spin_locked(bfqd->queue->queue_lock);
9835 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
9836 ++
9837 ++ bfq_add_rq_rb(rq);
9838 ++
9839 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
9840 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
9841 ++
9842 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
9843 ++}
9844 ++
9845 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
9846 ++{
9847 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
9848 ++ bfqd->rq_in_driver);
9849 ++
9850 ++ if (bfqd->hw_tag == 1)
9851 ++ return;
9852 ++
9853 ++ /*
9854 ++ * This sample is valid if the number of outstanding requests
9855 ++ * is large enough to allow a queueing behavior. Note that the
9856 ++ * sum is not exact, as it's not taking into account deactivated
9857 ++ * requests.
9858 ++ */
9859 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
9860 ++ return;
9861 ++
9862 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
9863 ++ return;
9864 ++
9865 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
9866 ++ bfqd->max_rq_in_driver = 0;
9867 ++ bfqd->hw_tag_samples = 0;
9868 ++}
9869 ++
9870 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
9871 ++{
9872 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9873 ++ struct bfq_data *bfqd = bfqq->bfqd;
9874 ++ const int sync = rq_is_sync(rq);
9875 ++
9876 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
9877 ++ blk_rq_sectors(rq), sync);
9878 ++
9879 ++ bfq_update_hw_tag(bfqd);
9880 ++
9881 ++ WARN_ON(!bfqd->rq_in_driver);
9882 ++ WARN_ON(!bfqq->dispatched);
9883 ++ bfqd->rq_in_driver--;
9884 ++ bfqq->dispatched--;
9885 ++
9886 ++ if (bfq_bfqq_sync(bfqq))
9887 ++ bfqd->sync_flight--;
9888 ++
9889 ++ if (sync)
9890 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
9891 ++
9892 ++ /*
9893 ++ * The computation of softrt_next_start was scheduled for the next
9894 ++ * request completion: it is now time to compute it.
9895 ++ */
9896 ++ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
9897 ++ bfqq->soft_rt_next_start =
9898 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
9899 ++
9900 ++ /*
9901 ++ * If this is the in-service queue, check if it needs to be expired,
9902 ++ * or if we want to idle in case it has no pending requests.
9903 ++ */
9904 ++ if (bfqd->in_service_queue == bfqq) {
9905 ++ if (bfq_bfqq_budget_new(bfqq))
9906 ++ bfq_set_budget_timeout(bfqd);
9907 ++
9908 ++ if (bfq_bfqq_must_idle(bfqq)) {
9909 ++ bfq_arm_slice_timer(bfqd);
9910 ++ goto out;
9911 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
9912 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9913 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
9914 ++ (bfqq->dispatched == 0 ||
9915 ++ !bfq_bfqq_must_not_expire(bfqq)))
9916 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
9917 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
9918 ++ }
9919 ++
9920 ++ if (!bfqd->rq_in_driver)
9921 ++ bfq_schedule_dispatch(bfqd);
9922 ++
9923 ++out:
9924 ++ return;
9925 ++}
9926 ++
9927 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
9928 ++{
9929 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
9930 ++ bfq_clear_bfqq_must_alloc(bfqq);
9931 ++ return ELV_MQUEUE_MUST;
9932 ++ }
9933 ++
9934 ++ return ELV_MQUEUE_MAY;
9935 ++}
9936 ++
9937 ++static int bfq_may_queue(struct request_queue *q, int rw)
9938 ++{
9939 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9940 ++ struct task_struct *tsk = current;
9941 ++ struct bfq_io_cq *bic;
9942 ++ struct bfq_queue *bfqq;
9943 ++
9944 ++ /*
9945 ++ * Don't force setup of a queue from here, as a call to may_queue
9946 ++ * does not necessarily imply that a request actually will be queued.
9947 ++ * So just lookup a possibly existing queue, or return 'may queue'
9948 ++ * if that fails.
9949 ++ */
9950 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
9951 ++ if (bic == NULL)
9952 ++ return ELV_MQUEUE_MAY;
9953 ++
9954 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
9955 ++ if (bfqq != NULL) {
9956 ++ bfq_init_prio_data(bfqq, bic);
9957 ++
9958 ++ return __bfq_may_queue(bfqq);
9959 ++ }
9960 ++
9961 ++ return ELV_MQUEUE_MAY;
9962 ++}
9963 ++
9964 ++/*
9965 ++ * Queue lock held here.
9966 ++ */
9967 ++static void bfq_put_request(struct request *rq)
9968 ++{
9969 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9970 ++
9971 ++ if (bfqq != NULL) {
9972 ++ const int rw = rq_data_dir(rq);
9973 ++
9974 ++ BUG_ON(!bfqq->allocated[rw]);
9975 ++ bfqq->allocated[rw]--;
9976 ++
9977 ++ rq->elv.priv[0] = NULL;
9978 ++ rq->elv.priv[1] = NULL;
9979 ++
9980 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
9981 ++ bfqq, atomic_read(&bfqq->ref));
9982 ++ bfq_put_queue(bfqq);
9983 ++ }
9984 ++}
9985 ++
9986 ++static struct bfq_queue *
9987 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
9988 ++ struct bfq_queue *bfqq)
9989 ++{
9990 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
9991 ++ (long unsigned)bfqq->new_bfqq->pid);
9992 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
9993 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
9994 ++ bfq_put_queue(bfqq);
9995 ++ return bic_to_bfqq(bic, 1);
9996 ++}
9997 ++
9998 ++/*
9999 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
10000 ++ * was the last process referring to said bfqq.
10001 ++ */
10002 ++static struct bfq_queue *
10003 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
10004 ++{
10005 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
10006 ++ if (bfqq_process_refs(bfqq) == 1) {
10007 ++ bfqq->pid = current->pid;
10008 ++ bfq_clear_bfqq_coop(bfqq);
10009 ++ bfq_clear_bfqq_split_coop(bfqq);
10010 ++ return bfqq;
10011 ++ }
10012 ++
10013 ++ bic_set_bfqq(bic, NULL, 1);
10014 ++
10015 ++ bfq_put_cooperator(bfqq);
10016 ++
10017 ++ bfq_put_queue(bfqq);
10018 ++ return NULL;
10019 ++}
10020 ++
10021 ++/*
10022 ++ * Allocate bfq data structures associated with this request.
10023 ++ */
10024 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
10025 ++ struct bio *bio, gfp_t gfp_mask)
10026 ++{
10027 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
10028 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
10029 ++ const int rw = rq_data_dir(rq);
10030 ++ const int is_sync = rq_is_sync(rq);
10031 ++ struct bfq_queue *bfqq;
10032 ++ struct bfq_group *bfqg;
10033 ++ unsigned long flags;
10034 ++
10035 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
10036 ++
10037 ++ bfq_changed_ioprio(bic);
10038 ++
10039 ++ spin_lock_irqsave(q->queue_lock, flags);
10040 ++
10041 ++ if (bic == NULL)
10042 ++ goto queue_fail;
10043 ++
10044 ++ bfqg = bfq_bic_update_cgroup(bic);
10045 ++
10046 ++new_queue:
10047 ++ bfqq = bic_to_bfqq(bic, is_sync);
10048 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
10049 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
10050 ++ bic_set_bfqq(bic, bfqq, is_sync);
10051 ++ } else {
10052 ++ /*
10053 ++ * If the queue was seeky for too long, break it apart.
10054 ++ */
10055 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
10056 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
10057 ++ bfqq = bfq_split_bfqq(bic, bfqq);
10058 ++ if (!bfqq)
10059 ++ goto new_queue;
10060 ++ }
10061 ++
10062 ++ /*
10063 ++ * Check to see if this queue is scheduled to merge with
10064 ++ * another closely cooperating queue. The merging of queues
10065 ++ * happens here as it must be done in process context.
10066 ++ * The reference on new_bfqq was taken in merge_bfqqs.
10067 ++ */
10068 ++ if (bfqq->new_bfqq != NULL)
10069 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
10070 ++ }
10071 ++
10072 ++ bfqq->allocated[rw]++;
10073 ++ atomic_inc(&bfqq->ref);
10074 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
10075 ++ atomic_read(&bfqq->ref));
10076 ++
10077 ++ rq->elv.priv[0] = bic;
10078 ++ rq->elv.priv[1] = bfqq;
10079 ++
10080 ++ spin_unlock_irqrestore(q->queue_lock, flags);
10081 ++
10082 ++ return 0;
10083 ++
10084 ++queue_fail:
10085 ++ bfq_schedule_dispatch(bfqd);
10086 ++ spin_unlock_irqrestore(q->queue_lock, flags);
10087 ++
10088 ++ return 1;
10089 ++}
10090 ++
10091 ++static void bfq_kick_queue(struct work_struct *work)
10092 ++{
10093 ++ struct bfq_data *bfqd =
10094 ++ container_of(work, struct bfq_data, unplug_work);
10095 ++ struct request_queue *q = bfqd->queue;
10096 ++
10097 ++ spin_lock_irq(q->queue_lock);
10098 ++ __blk_run_queue(q);
10099 ++ spin_unlock_irq(q->queue_lock);
10100 ++}
10101 ++
10102 ++/*
10103 ++ * Handler of the expiration of the timer running if the in-service queue
10104 ++ * is idling inside its time slice.
10105 ++ */
10106 ++static void bfq_idle_slice_timer(unsigned long data)
10107 ++{
10108 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
10109 ++ struct bfq_queue *bfqq;
10110 ++ unsigned long flags;
10111 ++ enum bfqq_expiration reason;
10112 ++
10113 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
10114 ++
10115 ++ bfqq = bfqd->in_service_queue;
10116 ++ /*
10117 ++ * Theoretical race here: the in-service queue can be NULL or different
10118 ++ * from the queue that was idling if the timer handler spins on
10119 ++ * the queue_lock and a new request arrives for the current
10120 ++ * queue and there is a full dispatch cycle that changes the
10121 ++ * in-service queue. This can hardly happen, but in the worst case
10122 ++ * we just expire a queue too early.
10123 ++ */
10124 ++ if (bfqq != NULL) {
10125 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
10126 ++ if (bfq_bfqq_budget_timeout(bfqq))
10127 ++ /*
10128 ++ * Also here the queue can be safely expired
10129 ++ * for budget timeout without wasting
10130 ++ * guarantees
10131 ++ */
10132 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
10133 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
10134 ++ /*
10135 ++ * The queue may not be empty upon timer expiration,
10136 ++ * because we may not disable the timer when the first
10137 ++ * request of the in-service queue arrives during
10138 ++ * disk idling
10139 ++ */
10140 ++ reason = BFQ_BFQQ_TOO_IDLE;
10141 ++ else
10142 ++ goto schedule_dispatch;
10143 ++
10144 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
10145 ++ }
10146 ++
10147 ++schedule_dispatch:
10148 ++ bfq_schedule_dispatch(bfqd);
10149 ++
10150 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
10151 ++}
10152 ++
10153 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
10154 ++{
10155 ++ del_timer_sync(&bfqd->idle_slice_timer);
10156 ++ cancel_work_sync(&bfqd->unplug_work);
10157 ++}
10158 ++
10159 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
10160 ++ struct bfq_queue **bfqq_ptr)
10161 ++{
10162 ++ struct bfq_group *root_group = bfqd->root_group;
10163 ++ struct bfq_queue *bfqq = *bfqq_ptr;
10164 ++
10165 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
10166 ++ if (bfqq != NULL) {
10167 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
10168 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
10169 ++ bfqq, atomic_read(&bfqq->ref));
10170 ++ bfq_put_queue(bfqq);
10171 ++ *bfqq_ptr = NULL;
10172 ++ }
10173 ++}
10174 ++
10175 ++/*
10176 ++ * Release all the bfqg references to its async queues. If we are
10177 ++ * deallocating the group these queues may still contain requests, so
10178 ++ * we reparent them to the root cgroup (i.e., the only one that will
10179 ++ * exist for sure until all the requests on a device are gone).
10180 ++ */
10181 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
10182 ++{
10183 ++ int i, j;
10184 ++
10185 ++ for (i = 0; i < 2; i++)
10186 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
10187 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
10188 ++
10189 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
10190 ++}
10191 ++
10192 ++static void bfq_exit_queue(struct elevator_queue *e)
10193 ++{
10194 ++ struct bfq_data *bfqd = e->elevator_data;
10195 ++ struct request_queue *q = bfqd->queue;
10196 ++ struct bfq_queue *bfqq, *n;
10197 ++
10198 ++ bfq_shutdown_timer_wq(bfqd);
10199 ++
10200 ++ spin_lock_irq(q->queue_lock);
10201 ++
10202 ++ BUG_ON(bfqd->in_service_queue != NULL);
10203 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
10204 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
10205 ++
10206 ++ bfq_disconnect_groups(bfqd);
10207 ++ spin_unlock_irq(q->queue_lock);
10208 ++
10209 ++ bfq_shutdown_timer_wq(bfqd);
10210 ++
10211 ++ synchronize_rcu();
10212 ++
10213 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10214 ++
10215 ++ bfq_free_root_group(bfqd);
10216 ++ kfree(bfqd);
10217 ++}
10218 ++
10219 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
10220 ++{
10221 ++ struct bfq_group *bfqg;
10222 ++ struct bfq_data *bfqd;
10223 ++ struct elevator_queue *eq;
10224 ++
10225 ++ eq = elevator_alloc(q, e);
10226 ++ if (eq == NULL)
10227 ++ return -ENOMEM;
10228 ++
10229 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
10230 ++ if (bfqd == NULL) {
10231 ++ kobject_put(&eq->kobj);
10232 ++ return -ENOMEM;
10233 ++ }
10234 ++ eq->elevator_data = bfqd;
10235 ++
10236 ++ /*
10237 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
10238 ++ * Grab a permanent reference to it, so that the normal code flow
10239 ++ * will not attempt to free it.
10240 ++ */
10241 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
10242 ++ atomic_inc(&bfqd->oom_bfqq.ref);
10243 ++
10244 ++ bfqd->queue = q;
10245 ++
10246 ++ spin_lock_irq(q->queue_lock);
10247 ++ q->elevator = eq;
10248 ++ spin_unlock_irq(q->queue_lock);
10249 ++
10250 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
10251 ++ if (bfqg == NULL) {
10252 ++ kfree(bfqd);
10253 ++ kobject_put(&eq->kobj);
10254 ++ return -ENOMEM;
10255 ++ }
10256 ++
10257 ++ bfqd->root_group = bfqg;
10258 ++
10259 ++ init_timer(&bfqd->idle_slice_timer);
10260 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10261 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10262 ++
10263 ++ bfqd->rq_pos_tree = RB_ROOT;
10264 ++
10265 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10266 ++
10267 ++ INIT_LIST_HEAD(&bfqd->active_list);
10268 ++ INIT_LIST_HEAD(&bfqd->idle_list);
10269 ++
10270 ++ bfqd->hw_tag = -1;
10271 ++
10272 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
10273 ++
10274 ++ bfqd->bfq_quantum = bfq_quantum;
10275 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10276 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10277 ++ bfqd->bfq_back_max = bfq_back_max;
10278 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
10279 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
10280 ++ bfqd->bfq_class_idle_last_service = 0;
10281 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10282 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10283 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10284 ++
10285 ++ bfqd->low_latency = true;
10286 ++
10287 ++ bfqd->bfq_raising_coeff = 20;
10288 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10289 ++ bfqd->bfq_raising_max_time = 0;
10290 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10291 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10292 ++ bfqd->bfq_raising_max_softrt_rate = 7000; /*
10293 ++ * Approximate rate required
10294 ++ * to playback or record a
10295 ++ * high-definition compressed
10296 ++ * video.
10297 ++ */
10298 ++ bfqd->raised_busy_queues = 0;
10299 ++
10300 ++ /* Initially estimate the device's peak rate as the reference rate */
10301 ++ if (blk_queue_nonrot(bfqd->queue)) {
10302 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
10303 ++ bfqd->peak_rate = R_nonrot;
10304 ++ } else {
10305 ++ bfqd->RT_prod = R_rot * T_rot;
10306 ++ bfqd->peak_rate = R_rot;
10307 ++ }
10308 ++
10309 ++ return 0;
10310 ++}
10311 ++
10312 ++static void bfq_slab_kill(void)
10313 ++{
10314 ++ if (bfq_pool != NULL)
10315 ++ kmem_cache_destroy(bfq_pool);
10316 ++}
10317 ++
10318 ++static int __init bfq_slab_setup(void)
10319 ++{
10320 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10321 ++ if (bfq_pool == NULL)
10322 ++ return -ENOMEM;
10323 ++ return 0;
10324 ++}
10325 ++
10326 ++static ssize_t bfq_var_show(unsigned int var, char *page)
10327 ++{
10328 ++ return sprintf(page, "%d\n", var);
10329 ++}
10330 ++
10331 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10332 ++{
10333 ++ unsigned long new_val;
10334 ++ int ret = kstrtoul(page, 10, &new_val);
10335 ++
10336 ++ if (ret == 0)
10337 ++ *var = new_val;
10338 ++
10339 ++ return count;
10340 ++}
10341 ++
10342 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10343 ++{
10344 ++ struct bfq_data *bfqd = e->elevator_data;
10345 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10346 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10347 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10348 ++}
10349 ++
10350 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10351 ++{
10352 ++ struct bfq_queue *bfqq;
10353 ++ struct bfq_data *bfqd = e->elevator_data;
10354 ++ ssize_t num_char = 0;
10355 ++
10356 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
10357 ++ bfqd->queued);
10358 ++
10359 ++ spin_lock_irq(bfqd->queue->queue_lock);
10360 ++
10361 ++ num_char += sprintf(page + num_char, "Active:\n");
10362 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
10363 ++ num_char += sprintf(page + num_char,
10364 ++ "pid%d: weight %hu, nr_queued %d %d,"
10365 ++ " dur %d/%u\n",
10366 ++ bfqq->pid,
10367 ++ bfqq->entity.weight,
10368 ++ bfqq->queued[0],
10369 ++ bfqq->queued[1],
10370 ++ jiffies_to_msecs(jiffies -
10371 ++ bfqq->last_rais_start_finish),
10372 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
10373 ++ }
10374 ++
10375 ++ num_char += sprintf(page + num_char, "Idle:\n");
10376 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
10377 ++ num_char += sprintf(page + num_char,
10378 ++ "pid%d: weight %hu, dur %d/%u\n",
10379 ++ bfqq->pid,
10380 ++ bfqq->entity.weight,
10381 ++ jiffies_to_msecs(jiffies -
10382 ++ bfqq->last_rais_start_finish),
10383 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
10384 ++ }
10385 ++
10386 ++ spin_unlock_irq(bfqd->queue->queue_lock);
10387 ++
10388 ++ return num_char;
10389 ++}
10390 ++
10391 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
10392 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
10393 ++{ \
10394 ++ struct bfq_data *bfqd = e->elevator_data; \
10395 ++ unsigned int __data = __VAR; \
10396 ++ if (__CONV) \
10397 ++ __data = jiffies_to_msecs(__data); \
10398 ++ return bfq_var_show(__data, (page)); \
10399 ++}
10400 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
10401 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
10402 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
10403 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
10404 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
10405 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
10406 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
10407 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
10408 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
10409 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
10410 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
10411 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
10412 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
10413 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
10414 ++ 1);
10415 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
10416 ++ bfqd->bfq_raising_min_inter_arr_async,
10417 ++ 1);
10418 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
10419 ++ bfqd->bfq_raising_max_softrt_rate, 0);
10420 ++#undef SHOW_FUNCTION
10421 ++
10422 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
10423 ++static ssize_t \
10424 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
10425 ++{ \
10426 ++ struct bfq_data *bfqd = e->elevator_data; \
10427 ++ unsigned long uninitialized_var(__data); \
10428 ++ int ret = bfq_var_store(&__data, (page), count); \
10429 ++ if (__data < (MIN)) \
10430 ++ __data = (MIN); \
10431 ++ else if (__data > (MAX)) \
10432 ++ __data = (MAX); \
10433 ++ if (__CONV) \
10434 ++ *(__PTR) = msecs_to_jiffies(__data); \
10435 ++ else \
10436 ++ *(__PTR) = __data; \
10437 ++ return ret; \
10438 ++}
10439 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
10440 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
10441 ++ INT_MAX, 1);
10442 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
10443 ++ INT_MAX, 1);
10444 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
10445 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
10446 ++ INT_MAX, 0);
10447 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
10448 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
10449 ++ 1, INT_MAX, 0);
10450 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
10451 ++ INT_MAX, 1);
10452 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
10453 ++ INT_MAX, 0);
10454 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
10455 ++ INT_MAX, 1);
10456 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
10457 ++ INT_MAX, 1);
10458 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
10459 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
10460 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
10461 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
10462 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
10463 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
10464 ++#undef STORE_FUNCTION
10465 ++
10466 ++/* do nothing for the moment */
10467 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
10468 ++ const char *page, size_t count)
10469 ++{
10470 ++ return count;
10471 ++}
10472 ++
10473 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
10474 ++{
10475 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
10476 ++
10477 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
10478 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
10479 ++ else
10480 ++ return bfq_default_max_budget;
10481 ++}
10482 ++
10483 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
10484 ++ const char *page, size_t count)
10485 ++{
10486 ++ struct bfq_data *bfqd = e->elevator_data;
10487 ++ unsigned long uninitialized_var(__data);
10488 ++ int ret = bfq_var_store(&__data, (page), count);
10489 ++
10490 ++ if (__data == 0)
10491 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10492 ++ else {
10493 ++ if (__data > INT_MAX)
10494 ++ __data = INT_MAX;
10495 ++ bfqd->bfq_max_budget = __data;
10496 ++ }
10497 ++
10498 ++ bfqd->bfq_user_max_budget = __data;
10499 ++
10500 ++ return ret;
10501 ++}
10502 ++
10503 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
10504 ++ const char *page, size_t count)
10505 ++{
10506 ++ struct bfq_data *bfqd = e->elevator_data;
10507 ++ unsigned long uninitialized_var(__data);
10508 ++ int ret = bfq_var_store(&__data, (page), count);
10509 ++
10510 ++ if (__data < 1)
10511 ++ __data = 1;
10512 ++ else if (__data > INT_MAX)
10513 ++ __data = INT_MAX;
10514 ++
10515 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
10516 ++ if (bfqd->bfq_user_max_budget == 0)
10517 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10518 ++
10519 ++ return ret;
10520 ++}
10521 ++
10522 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
10523 ++ const char *page, size_t count)
10524 ++{
10525 ++ struct bfq_data *bfqd = e->elevator_data;
10526 ++ unsigned long uninitialized_var(__data);
10527 ++ int ret = bfq_var_store(&__data, (page), count);
10528 ++
10529 ++ if (__data > 1)
10530 ++ __data = 1;
10531 ++ if (__data == 0 && bfqd->low_latency != 0)
10532 ++ bfq_end_raising(bfqd);
10533 ++ bfqd->low_latency = __data;
10534 ++
10535 ++ return ret;
10536 ++}
10537 ++
10538 ++#define BFQ_ATTR(name) \
10539 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
10540 ++
10541 ++static struct elv_fs_entry bfq_attrs[] = {
10542 ++ BFQ_ATTR(quantum),
10543 ++ BFQ_ATTR(fifo_expire_sync),
10544 ++ BFQ_ATTR(fifo_expire_async),
10545 ++ BFQ_ATTR(back_seek_max),
10546 ++ BFQ_ATTR(back_seek_penalty),
10547 ++ BFQ_ATTR(slice_idle),
10548 ++ BFQ_ATTR(max_budget),
10549 ++ BFQ_ATTR(max_budget_async_rq),
10550 ++ BFQ_ATTR(timeout_sync),
10551 ++ BFQ_ATTR(timeout_async),
10552 ++ BFQ_ATTR(low_latency),
10553 ++ BFQ_ATTR(raising_coeff),
10554 ++ BFQ_ATTR(raising_max_time),
10555 ++ BFQ_ATTR(raising_rt_max_time),
10556 ++ BFQ_ATTR(raising_min_idle_time),
10557 ++ BFQ_ATTR(raising_min_inter_arr_async),
10558 ++ BFQ_ATTR(raising_max_softrt_rate),
10559 ++ BFQ_ATTR(weights),
10560 ++ __ATTR_NULL
10561 ++};
10562 ++
10563 ++static struct elevator_type iosched_bfq = {
10564 ++ .ops = {
10565 ++ .elevator_merge_fn = bfq_merge,
10566 ++ .elevator_merged_fn = bfq_merged_request,
10567 ++ .elevator_merge_req_fn = bfq_merged_requests,
10568 ++ .elevator_allow_merge_fn = bfq_allow_merge,
10569 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
10570 ++ .elevator_add_req_fn = bfq_insert_request,
10571 ++ .elevator_activate_req_fn = bfq_activate_request,
10572 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
10573 ++ .elevator_completed_req_fn = bfq_completed_request,
10574 ++ .elevator_former_req_fn = elv_rb_former_request,
10575 ++ .elevator_latter_req_fn = elv_rb_latter_request,
10576 ++ .elevator_init_icq_fn = bfq_init_icq,
10577 ++ .elevator_exit_icq_fn = bfq_exit_icq,
10578 ++ .elevator_set_req_fn = bfq_set_request,
10579 ++ .elevator_put_req_fn = bfq_put_request,
10580 ++ .elevator_may_queue_fn = bfq_may_queue,
10581 ++ .elevator_init_fn = bfq_init_queue,
10582 ++ .elevator_exit_fn = bfq_exit_queue,
10583 ++ },
10584 ++ .icq_size = sizeof(struct bfq_io_cq),
10585 ++ .icq_align = __alignof__(struct bfq_io_cq),
10586 ++ .elevator_attrs = bfq_attrs,
10587 ++ .elevator_name = "bfq",
10588 ++ .elevator_owner = THIS_MODULE,
10589 ++};
10590 ++
10591 ++static int __init bfq_init(void)
10592 ++{
10593 ++ /*
10594 ++ * Can be 0 on HZ < 1000 setups.
10595 ++ */
10596 ++ if (bfq_slice_idle == 0)
10597 ++ bfq_slice_idle = 1;
10598 ++
10599 ++ if (bfq_timeout_async == 0)
10600 ++ bfq_timeout_async = 1;
10601 ++
10602 ++ if (bfq_slab_setup())
10603 ++ return -ENOMEM;
10604 ++
10605 ++ elv_register(&iosched_bfq);
10606 ++ printk(KERN_INFO "BFQ I/O-scheduler version: v7r1");
10607 ++
10608 ++ return 0;
10609 ++}
10610 ++
10611 ++static void __exit bfq_exit(void)
10612 ++{
10613 ++ elv_unregister(&iosched_bfq);
10614 ++ bfq_slab_kill();
10615 ++}
10616 ++
10617 ++module_init(bfq_init);
10618 ++module_exit(bfq_exit);
10619 ++
10620 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
10621 ++MODULE_LICENSE("GPL");
10622 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
10623 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
10624 +new file mode 100644
10625 +index 0000000..999b475
10626 +--- /dev/null
10627 ++++ b/block/bfq-sched.c
10628 +@@ -0,0 +1,1078 @@
10629 ++/*
10630 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
10631 ++ *
10632 ++ * Based on ideas and code from CFQ:
10633 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
10634 ++ *
10635 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
10636 ++ * Paolo Valente <paolo.valente@×××××××.it>
10637 ++ *
10638 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
10639 ++ */
10640 ++
10641 ++#ifdef CONFIG_CGROUP_BFQIO
10642 ++#define for_each_entity(entity) \
10643 ++ for (; entity != NULL; entity = entity->parent)
10644 ++
10645 ++#define for_each_entity_safe(entity, parent) \
10646 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
10647 ++
10648 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
10649 ++ int extract,
10650 ++ struct bfq_data *bfqd);
10651 ++
10652 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10653 ++{
10654 ++ struct bfq_entity *bfqg_entity;
10655 ++ struct bfq_group *bfqg;
10656 ++ struct bfq_sched_data *group_sd;
10657 ++
10658 ++ BUG_ON(next_in_service == NULL);
10659 ++
10660 ++ group_sd = next_in_service->sched_data;
10661 ++
10662 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
10663 ++ /*
10664 ++ * bfq_group's my_entity field is not NULL only if the group
10665 ++ * is not the root group. We must not touch the root entity
10666 ++ * as it must never become an in-service entity.
10667 ++ */
10668 ++ bfqg_entity = bfqg->my_entity;
10669 ++ if (bfqg_entity != NULL)
10670 ++ bfqg_entity->budget = next_in_service->budget;
10671 ++}
10672 ++
10673 ++static int bfq_update_next_in_service(struct bfq_sched_data *sd)
10674 ++{
10675 ++ struct bfq_entity *next_in_service;
10676 ++
10677 ++ if (sd->in_service_entity != NULL)
10678 ++ /* will update/requeue at the end of service */
10679 ++ return 0;
10680 ++
10681 ++ /*
10682 ++ * NOTE: this can be improved in many ways, such as returning
10683 ++ * 1 (and thus propagating upwards the update) only when the
10684 ++ * budget changes, or caching the bfqq that will be scheduled
10685 ++	 * next from this subtree. For now we worry more about
10686 ++ * correctness than about performance...
10687 ++ */
10688 ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
10689 ++ sd->next_in_service = next_in_service;
10690 ++
10691 ++ if (next_in_service != NULL)
10692 ++ bfq_update_budget(next_in_service);
10693 ++
10694 ++ return 1;
10695 ++}
10696 ++
10697 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10698 ++ struct bfq_entity *entity)
10699 ++{
10700 ++ BUG_ON(sd->next_in_service != entity);
10701 ++}
10702 ++#else
10703 ++#define for_each_entity(entity) \
10704 ++ for (; entity != NULL; entity = NULL)
10705 ++
10706 ++#define for_each_entity_safe(entity, parent) \
10707 ++ for (parent = NULL; entity != NULL; entity = parent)
10708 ++
10709 ++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
10710 ++{
10711 ++ return 0;
10712 ++}
10713 ++
10714 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10715 ++ struct bfq_entity *entity)
10716 ++{
10717 ++}
10718 ++
10719 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10720 ++{
10721 ++}
10722 ++#endif
10723 ++
10724 ++/*
10725 ++ * Shift for timestamp calculations. This actually limits the maximum
10726 ++ * service allowed in one timestamp delta (small shift values increase it),
10727 ++ * the maximum total weight that can be used for the queues in the system
10728 ++ * (big shift values increase it), and the period of virtual time wraparounds.
10729 ++ */
10730 ++#define WFQ_SERVICE_SHIFT 22
10731 ++
10732 ++/**
10733 ++ * bfq_gt - compare two timestamps.
10734 ++ * @a: first ts.
10735 ++ * @b: second ts.
10736 ++ *
10737 ++ * Return @a > @b, dealing with wrapping correctly.
10738 ++ */
10739 ++static inline int bfq_gt(u64 a, u64 b)
10740 ++{
10741 ++ return (s64)(a - b) > 0;
10742 ++}
10743 ++
10744 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
10745 ++{
10746 ++ struct bfq_queue *bfqq = NULL;
10747 ++
10748 ++ BUG_ON(entity == NULL);
10749 ++
10750 ++ if (entity->my_sched_data == NULL)
10751 ++ bfqq = container_of(entity, struct bfq_queue, entity);
10752 ++
10753 ++ return bfqq;
10754 ++}
10755 ++
10756 ++
10757 ++/**
10758 ++ * bfq_delta - map service into the virtual time domain.
10759 ++ * @service: amount of service.
10760 ++ * @weight: scale factor (weight of an entity or weight sum).
10761 ++ */
10762 ++static inline u64 bfq_delta(unsigned long service,
10763 ++ unsigned long weight)
10764 ++{
10765 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
10766 ++
10767 ++ do_div(d, weight);
10768 ++ return d;
10769 ++}
10770 ++
10771 ++/**
10772 ++ * bfq_calc_finish - assign the finish time to an entity.
10773 ++ * @entity: the entity to act upon.
10774 ++ * @service: the service to be charged to the entity.
10775 ++ */
10776 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
10777 ++ unsigned long service)
10778 ++{
10779 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10780 ++
10781 ++ BUG_ON(entity->weight == 0);
10782 ++
10783 ++ entity->finish = entity->start +
10784 ++ bfq_delta(service, entity->weight);
10785 ++
10786 ++ if (bfqq != NULL) {
10787 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
10788 ++ "calc_finish: serv %lu, w %d",
10789 ++ service, entity->weight);
10790 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
10791 ++ "calc_finish: start %llu, finish %llu, delta %llu",
10792 ++ entity->start, entity->finish,
10793 ++ bfq_delta(service, entity->weight));
10794 ++ }
10795 ++}
10796 ++
10797 ++/**
10798 ++ * bfq_entity_of - get an entity from a node.
10799 ++ * @node: the node field of the entity.
10800 ++ *
10801 ++ * Convert a node pointer to the corresponding entity. This is used only
10802 ++ * to simplify the logic of some functions and not as the generic
10803 ++ * conversion mechanism because, e.g., in the tree walking functions,
10804 ++ * the check for a %NULL value would be redundant.
10805 ++ */
10806 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
10807 ++{
10808 ++ struct bfq_entity *entity = NULL;
10809 ++
10810 ++ if (node != NULL)
10811 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
10812 ++
10813 ++ return entity;
10814 ++}
10815 ++
10816 ++/**
10817 ++ * bfq_extract - remove an entity from a tree.
10818 ++ * @root: the tree root.
10819 ++ * @entity: the entity to remove.
10820 ++ */
10821 ++static inline void bfq_extract(struct rb_root *root,
10822 ++ struct bfq_entity *entity)
10823 ++{
10824 ++ BUG_ON(entity->tree != root);
10825 ++
10826 ++ entity->tree = NULL;
10827 ++ rb_erase(&entity->rb_node, root);
10828 ++}
10829 ++
10830 ++/**
10831 ++ * bfq_idle_extract - extract an entity from the idle tree.
10832 ++ * @st: the service tree of the owning @entity.
10833 ++ * @entity: the entity being removed.
10834 ++ */
10835 ++static void bfq_idle_extract(struct bfq_service_tree *st,
10836 ++ struct bfq_entity *entity)
10837 ++{
10838 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10839 ++ struct rb_node *next;
10840 ++
10841 ++ BUG_ON(entity->tree != &st->idle);
10842 ++
10843 ++ if (entity == st->first_idle) {
10844 ++ next = rb_next(&entity->rb_node);
10845 ++ st->first_idle = bfq_entity_of(next);
10846 ++ }
10847 ++
10848 ++ if (entity == st->last_idle) {
10849 ++ next = rb_prev(&entity->rb_node);
10850 ++ st->last_idle = bfq_entity_of(next);
10851 ++ }
10852 ++
10853 ++ bfq_extract(&st->idle, entity);
10854 ++
10855 ++ if (bfqq != NULL)
10856 ++ list_del(&bfqq->bfqq_list);
10857 ++}
10858 ++
10859 ++/**
10860 ++ * bfq_insert - generic tree insertion.
10861 ++ * @root: tree root.
10862 ++ * @entity: entity to insert.
10863 ++ *
10864 ++ * This is used for the idle and the active tree, since they are both
10865 ++ * ordered by finish time.
10866 ++ */
10867 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
10868 ++{
10869 ++ struct bfq_entity *entry;
10870 ++ struct rb_node **node = &root->rb_node;
10871 ++ struct rb_node *parent = NULL;
10872 ++
10873 ++ BUG_ON(entity->tree != NULL);
10874 ++
10875 ++ while (*node != NULL) {
10876 ++ parent = *node;
10877 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
10878 ++
10879 ++ if (bfq_gt(entry->finish, entity->finish))
10880 ++ node = &parent->rb_left;
10881 ++ else
10882 ++ node = &parent->rb_right;
10883 ++ }
10884 ++
10885 ++ rb_link_node(&entity->rb_node, parent, node);
10886 ++ rb_insert_color(&entity->rb_node, root);
10887 ++
10888 ++ entity->tree = root;
10889 ++}
10890 ++
10891 ++/**
10892 ++ * bfq_update_min - update the min_start field of an entity.
10893 ++ * @entity: the entity to update.
10894 ++ * @node: one of its children.
10895 ++ *
10896 ++ * This function is called when @entity may store an invalid value for
10897 ++ * min_start due to updates to the active tree. The function assumes
10898 ++ * that the subtree rooted at @node (which may be its left or its right
10899 ++ * child) has a valid min_start value.
10900 ++ */
10901 ++static inline void bfq_update_min(struct bfq_entity *entity,
10902 ++ struct rb_node *node)
10903 ++{
10904 ++ struct bfq_entity *child;
10905 ++
10906 ++ if (node != NULL) {
10907 ++ child = rb_entry(node, struct bfq_entity, rb_node);
10908 ++ if (bfq_gt(entity->min_start, child->min_start))
10909 ++ entity->min_start = child->min_start;
10910 ++ }
10911 ++}
10912 ++
10913 ++/**
10914 ++ * bfq_update_active_node - recalculate min_start.
10915 ++ * @node: the node to update.
10916 ++ *
10917 ++ * @node may have changed position or one of its children may have moved;
10918 ++ * this function updates its min_start value. The left and right subtrees
10919 ++ * are assumed to hold a correct min_start value.
10920 ++ */
10921 ++static inline void bfq_update_active_node(struct rb_node *node)
10922 ++{
10923 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
10924 ++
10925 ++ entity->min_start = entity->start;
10926 ++ bfq_update_min(entity, node->rb_right);
10927 ++ bfq_update_min(entity, node->rb_left);
10928 ++}
10929 ++
10930 ++/**
10931 ++ * bfq_update_active_tree - update min_start for the whole active tree.
10932 ++ * @node: the starting node.
10933 ++ *
10934 ++ * @node must be the deepest modified node after an update. This function
10935 ++ * updates its min_start using the values held by its children, assuming
10936 ++ * that they did not change, and then updates all the nodes that may have
10937 ++ * changed in the path to the root. The only nodes that may have changed
10938 ++ * are the ones in the path or their siblings.
10939 ++ */
10940 ++static void bfq_update_active_tree(struct rb_node *node)
10941 ++{
10942 ++ struct rb_node *parent;
10943 ++
10944 ++up:
10945 ++ bfq_update_active_node(node);
10946 ++
10947 ++ parent = rb_parent(node);
10948 ++ if (parent == NULL)
10949 ++ return;
10950 ++
10951 ++ if (node == parent->rb_left && parent->rb_right != NULL)
10952 ++ bfq_update_active_node(parent->rb_right);
10953 ++ else if (parent->rb_left != NULL)
10954 ++ bfq_update_active_node(parent->rb_left);
10955 ++
10956 ++ node = parent;
10957 ++ goto up;
10958 ++}
10959 ++
10960 ++/**
10961 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
10962 ++ * @st: the service tree of the entity.
10963 ++ * @entity: the entity being inserted.
10964 ++ *
10965 ++ * The active tree is ordered by finish time, but an extra key is kept
10966 ++ * for each node, containing the minimum value for the start times of
10967 ++ * its children (and the node itself), so it's possible to search for
10968 ++ * the eligible node with the lowest finish time in logarithmic time.
10969 ++ */
10970 ++static void bfq_active_insert(struct bfq_service_tree *st,
10971 ++ struct bfq_entity *entity)
10972 ++{
10973 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10974 ++ struct rb_node *node = &entity->rb_node;
10975 ++
10976 ++ bfq_insert(&st->active, entity);
10977 ++
10978 ++ if (node->rb_left != NULL)
10979 ++ node = node->rb_left;
10980 ++ else if (node->rb_right != NULL)
10981 ++ node = node->rb_right;
10982 ++
10983 ++ bfq_update_active_tree(node);
10984 ++
10985 ++ if (bfqq != NULL)
10986 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
10987 ++}
10988 ++
10989 ++/**
10990 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
10991 ++ * @ioprio: the ioprio value to convert.
10992 ++ */
10993 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
10994 ++{
10995 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
10996 ++ return IOPRIO_BE_NR - ioprio;
10997 ++}
10998 ++
10999 ++/**
11000 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
11001 ++ * @weight: the weight value to convert.
11002 ++ *
11003 ++ * To preserve as much as possible the old only-ioprio user interface,
11004 ++ * 0 is used as an escape ioprio value for weights (numerically) equal
11005 ++ * to or larger than IOPRIO_BE_NR.
11006 ++ */
11007 ++static unsigned short bfq_weight_to_ioprio(int weight)
11008 ++{
11009 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
11010 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
11011 ++}
11012 ++
11013 ++static inline void bfq_get_entity(struct bfq_entity *entity)
11014 ++{
11015 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11016 ++ struct bfq_sched_data *sd;
11017 ++
11018 ++ if (bfqq != NULL) {
11019 ++ sd = entity->sched_data;
11020 ++ atomic_inc(&bfqq->ref);
11021 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
11022 ++ bfqq, atomic_read(&bfqq->ref));
11023 ++ }
11024 ++}
11025 ++
11026 ++/**
11027 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
11028 ++ * @node: the node being removed.
11029 ++ *
11030 ++ * Do the first step of an extraction in an rb tree, looking for the
11031 ++ * node that will replace @node, and returning the deepest node that
11032 ++ * the following modifications to the tree can touch. If @node is the
11033 ++ * last node in the tree return %NULL.
11034 ++ */
11035 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
11036 ++{
11037 ++ struct rb_node *deepest;
11038 ++
11039 ++ if (node->rb_right == NULL && node->rb_left == NULL)
11040 ++ deepest = rb_parent(node);
11041 ++ else if (node->rb_right == NULL)
11042 ++ deepest = node->rb_left;
11043 ++ else if (node->rb_left == NULL)
11044 ++ deepest = node->rb_right;
11045 ++ else {
11046 ++ deepest = rb_next(node);
11047 ++ if (deepest->rb_right != NULL)
11048 ++ deepest = deepest->rb_right;
11049 ++ else if (rb_parent(deepest) != node)
11050 ++ deepest = rb_parent(deepest);
11051 ++ }
11052 ++
11053 ++ return deepest;
11054 ++}
11055 ++
11056 ++/**
11057 ++ * bfq_active_extract - remove an entity from the active tree.
11058 ++ * @st: the service_tree containing the tree.
11059 ++ * @entity: the entity being removed.
11060 ++ */
11061 ++static void bfq_active_extract(struct bfq_service_tree *st,
11062 ++ struct bfq_entity *entity)
11063 ++{
11064 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11065 ++ struct rb_node *node;
11066 ++
11067 ++ node = bfq_find_deepest(&entity->rb_node);
11068 ++ bfq_extract(&st->active, entity);
11069 ++
11070 ++ if (node != NULL)
11071 ++ bfq_update_active_tree(node);
11072 ++
11073 ++ if (bfqq != NULL)
11074 ++ list_del(&bfqq->bfqq_list);
11075 ++}
11076 ++
11077 ++/**
11078 ++ * bfq_idle_insert - insert an entity into the idle tree.
11079 ++ * @st: the service tree containing the tree.
11080 ++ * @entity: the entity to insert.
11081 ++ */
11082 ++static void bfq_idle_insert(struct bfq_service_tree *st,
11083 ++ struct bfq_entity *entity)
11084 ++{
11085 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11086 ++ struct bfq_entity *first_idle = st->first_idle;
11087 ++ struct bfq_entity *last_idle = st->last_idle;
11088 ++
11089 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
11090 ++ st->first_idle = entity;
11091 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
11092 ++ st->last_idle = entity;
11093 ++
11094 ++ bfq_insert(&st->idle, entity);
11095 ++
11096 ++ if (bfqq != NULL)
11097 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
11098 ++}
11099 ++
11100 ++/**
11101 ++ * bfq_forget_entity - remove an entity from the wfq trees.
11102 ++ * @st: the service tree.
11103 ++ * @entity: the entity being removed.
11104 ++ *
11105 ++ * Update the device status and forget everything about @entity, putting
11106 ++ * the device reference to it, if it is a queue. Entities belonging to
11107 ++ * groups are not refcounted.
11108 ++ */
11109 ++static void bfq_forget_entity(struct bfq_service_tree *st,
11110 ++ struct bfq_entity *entity)
11111 ++{
11112 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11113 ++ struct bfq_sched_data *sd;
11114 ++
11115 ++ BUG_ON(!entity->on_st);
11116 ++
11117 ++ entity->on_st = 0;
11118 ++ st->wsum -= entity->weight;
11119 ++ if (bfqq != NULL) {
11120 ++ sd = entity->sched_data;
11121 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
11122 ++ bfqq, atomic_read(&bfqq->ref));
11123 ++ bfq_put_queue(bfqq);
11124 ++ }
11125 ++}
11126 ++
11127 ++/**
11128 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
11129 ++ * @st: service tree for the entity.
11130 ++ * @entity: the entity being released.
11131 ++ */
11132 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
11133 ++ struct bfq_entity *entity)
11134 ++{
11135 ++ bfq_idle_extract(st, entity);
11136 ++ bfq_forget_entity(st, entity);
11137 ++}
11138 ++
11139 ++/**
11140 ++ * bfq_forget_idle - update the idle tree if necessary.
11141 ++ * @st: the service tree to act upon.
11142 ++ *
11143 ++ * To preserve the global O(log N) complexity we only remove one entry here;
11144 ++ * as the idle tree will not grow indefinitely this can be done safely.
11145 ++ */
11146 ++static void bfq_forget_idle(struct bfq_service_tree *st)
11147 ++{
11148 ++ struct bfq_entity *first_idle = st->first_idle;
11149 ++ struct bfq_entity *last_idle = st->last_idle;
11150 ++
11151 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
11152 ++ !bfq_gt(last_idle->finish, st->vtime)) {
11153 ++ /*
11154 ++ * Forget the whole idle tree, increasing the vtime past
11155 ++ * the last finish time of idle entities.
11156 ++ */
11157 ++ st->vtime = last_idle->finish;
11158 ++ }
11159 ++
11160 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
11161 ++ bfq_put_idle_entity(st, first_idle);
11162 ++}
11163 ++
11164 ++static struct bfq_service_tree *
11165 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
11166 ++ struct bfq_entity *entity)
11167 ++{
11168 ++ struct bfq_service_tree *new_st = old_st;
11169 ++
11170 ++ if (entity->ioprio_changed) {
11171 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11172 ++
11173 ++ BUG_ON(old_st->wsum < entity->weight);
11174 ++ old_st->wsum -= entity->weight;
11175 ++
11176 ++ if (entity->new_weight != entity->orig_weight) {
11177 ++ entity->orig_weight = entity->new_weight;
11178 ++ entity->ioprio =
11179 ++ bfq_weight_to_ioprio(entity->orig_weight);
11180 ++ } else if (entity->new_ioprio != entity->ioprio) {
11181 ++ entity->ioprio = entity->new_ioprio;
11182 ++ entity->orig_weight =
11183 ++ bfq_ioprio_to_weight(entity->ioprio);
11184 ++ } else
11185 ++ entity->new_weight = entity->orig_weight =
11186 ++ bfq_ioprio_to_weight(entity->ioprio);
11187 ++
11188 ++ entity->ioprio_class = entity->new_ioprio_class;
11189 ++ entity->ioprio_changed = 0;
11190 ++
11191 ++ /*
11192 ++ * NOTE: here we may be changing the weight too early,
11193 ++ * which will cause unfairness. The correct approach
11194 ++ * would have required additional complexity to defer
11195 ++ * weight changes to the proper time instants (i.e.,
11196 ++ * when entity->finish <= old_st->vtime).
11197 ++ */
11198 ++ new_st = bfq_entity_service_tree(entity);
11199 ++ entity->weight = entity->orig_weight *
11200 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
11201 ++ new_st->wsum += entity->weight;
11202 ++
11203 ++ if (new_st != old_st)
11204 ++ entity->start = new_st->vtime;
11205 ++ }
11206 ++
11207 ++ return new_st;
11208 ++}
11209 ++
11210 ++/**
11211 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
11212 ++ * @bfqq: the queue being served.
11213 ++ * @served: bytes to transfer.
11214 ++ *
11215 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
11216 ++ * are synchronized every time a new bfqq is selected for service. For now,
11217 ++ * we keep it to better check consistency.
11218 ++ */
11219 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
11220 ++{
11221 ++ struct bfq_entity *entity = &bfqq->entity;
11222 ++ struct bfq_service_tree *st;
11223 ++
11224 ++ for_each_entity(entity) {
11225 ++ st = bfq_entity_service_tree(entity);
11226 ++
11227 ++ entity->service += served;
11228 ++ BUG_ON(entity->service > entity->budget);
11229 ++ BUG_ON(st->wsum == 0);
11230 ++
11231 ++ st->vtime += bfq_delta(served, st->wsum);
11232 ++ bfq_forget_idle(st);
11233 ++ }
11234 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
11235 ++}
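
A worked example of the accounting above (a sketch, not part of the patch;
it assumes bfq_delta(served, wsum), presumably defined earlier in the patch,
scales as served/wsum in the scheduler's fixed-point units): with three
backlogged entities of weight 100 each, st->wsum is 300, so serving one
queue for served = 60 units grows that entity->service by the full 60 while
advancing st->vtime by only 60/300 = 0.2 weighted units; a larger total
weight slows the vtime advance proportionally, which is how the weights
translate into service shares.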
11236 ++
11237 ++/**
11238 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
11239 ++ * @bfqq: the queue that needs a service update.
11240 ++ *
11241 ++ * When it's not possible to be fair in the service domain, because
11242 ++ * a queue is not consuming its budget fast enough (the meaning of
11243 ++ * fast depends on the timeout parameter), we charge it a full
11244 ++ * budget. In this way we should obtain a sort of time-domain
11245 ++ * fairness among all the seeky/slow queues.
11246 ++ */
11247 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
11248 ++{
11249 ++ struct bfq_entity *entity = &bfqq->entity;
11250 ++
11251 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
11252 ++
11253 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
11254 ++}
11255 ++
11256 ++/**
11257 ++ * __bfq_activate_entity - activate an entity.
11258 ++ * @entity: the entity being activated.
11259 ++ *
11260 ++ * Called whenever an entity is activated, i.e., it is not active and one
11261 ++ * of its children receives a new request, or has to be reactivated due to
11262 ++ * budget exhaustion. It uses the current budget of the entity (and the
11263 ++ * service received if @entity is active) of the queue to calculate its
11264 ++ * timestamps.
11265 ++ */
11266 ++static void __bfq_activate_entity(struct bfq_entity *entity)
11267 ++{
11268 ++ struct bfq_sched_data *sd = entity->sched_data;
11269 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11270 ++
11271 ++ if (entity == sd->in_service_entity) {
11272 ++ BUG_ON(entity->tree != NULL);
11273 ++ /*
11274 ++ * If we are requeueing the current entity we have
11275 ++ * to take care of not charging to it service it has
11276 ++ * not received.
11277 ++ */
11278 ++ bfq_calc_finish(entity, entity->service);
11279 ++ entity->start = entity->finish;
11280 ++ sd->in_service_entity = NULL;
11281 ++ } else if (entity->tree == &st->active) {
11282 ++ /*
11283 ++ * Requeueing an entity due to a change of some
11284 ++ * next_in_service entity below it. We reuse the
11285 ++ * old start time.
11286 ++ */
11287 ++ bfq_active_extract(st, entity);
11288 ++ } else if (entity->tree == &st->idle) {
11289 ++ /*
11290 ++ * Must be on the idle tree, bfq_idle_extract() will
11291 ++ * check for that.
11292 ++ */
11293 ++ bfq_idle_extract(st, entity);
11294 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
11295 ++ st->vtime : entity->finish;
11296 ++ } else {
11297 ++ /*
11298 ++ * The finish time of the entity may be invalid, and
11299 ++ * it is in the past for sure, otherwise the queue
11300 ++ * would have been on the idle tree.
11301 ++ */
11302 ++ entity->start = st->vtime;
11303 ++ st->wsum += entity->weight;
11304 ++ bfq_get_entity(entity);
11305 ++
11306 ++ BUG_ON(entity->on_st);
11307 ++ entity->on_st = 1;
11308 ++ }
11309 ++
11310 ++ st = __bfq_entity_update_weight_prio(st, entity);
11311 ++ bfq_calc_finish(entity, entity->budget);
11312 ++ bfq_active_insert(st, entity);
11313 ++}
11314 ++
11315 ++/**
11316 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11317 ++ * @entity: the entity to activate.
11318 ++ *
11319 ++ * Activate @entity and all the entities on the path from it to the root.
11320 ++ */
11321 ++static void bfq_activate_entity(struct bfq_entity *entity)
11322 ++{
11323 ++ struct bfq_sched_data *sd;
11324 ++
11325 ++ for_each_entity(entity) {
11326 ++ __bfq_activate_entity(entity);
11327 ++
11328 ++ sd = entity->sched_data;
11329 ++ if (!bfq_update_next_in_service(sd))
11330 ++ /*
11331 ++ * No need to propagate the activation to the
11332 ++ * upper entities, as they will be updated when
11333 ++ * the in-service entity is rescheduled.
11334 ++ */
11335 ++ break;
11336 ++ }
11337 ++}
11338 ++
11339 ++/**
11340 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11341 ++ * @entity: the entity to deactivate.
11342 ++ * @requeue: if false, the entity will not be put into the idle tree.
11343 ++ *
11344 ++ * Deactivate an entity, independently from its previous state. If the
11345 ++ * entity was not on a service tree just return, otherwise if it is on
11346 ++ * any scheduler tree, extract it from that tree, and if necessary
11347 ++ * and if the caller did not specify @requeue, put it on the idle tree.
11348 ++ *
11349 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
11350 ++ * if the entity was under service or if it was the next_in_service for
11351 ++ * its sched_data; return %0 otherwise.
11352 ++ */
11353 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11354 ++{
11355 ++ struct bfq_sched_data *sd = entity->sched_data;
11356 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11357 ++ int was_in_service = entity == sd->in_service_entity;
11358 ++ int ret = 0;
11359 ++
11360 ++ if (!entity->on_st)
11361 ++ return 0;
11362 ++
11363 ++ BUG_ON(was_in_service && entity->tree != NULL);
11364 ++
11365 ++ if (was_in_service) {
11366 ++ bfq_calc_finish(entity, entity->service);
11367 ++ sd->in_service_entity = NULL;
11368 ++ } else if (entity->tree == &st->active)
11369 ++ bfq_active_extract(st, entity);
11370 ++ else if (entity->tree == &st->idle)
11371 ++ bfq_idle_extract(st, entity);
11372 ++ else if (entity->tree != NULL)
11373 ++ BUG();
11374 ++
11375 ++ if (was_in_service || sd->next_in_service == entity)
11376 ++ ret = bfq_update_next_in_service(sd);
11377 ++
11378 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
11379 ++ bfq_forget_entity(st, entity);
11380 ++ else
11381 ++ bfq_idle_insert(st, entity);
11382 ++
11383 ++ BUG_ON(sd->in_service_entity == entity);
11384 ++ BUG_ON(sd->next_in_service == entity);
11385 ++
11386 ++ return ret;
11387 ++}
11388 ++
11389 ++/**
11390 ++ * bfq_deactivate_entity - deactivate an entity.
11391 ++ * @entity: the entity to deactivate.
11392 ++ * @requeue: true if the entity can be put on the idle tree
11393 ++ */
11394 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11395 ++{
11396 ++ struct bfq_sched_data *sd;
11397 ++ struct bfq_entity *parent;
11398 ++
11399 ++ for_each_entity_safe(entity, parent) {
11400 ++ sd = entity->sched_data;
11401 ++
11402 ++ if (!__bfq_deactivate_entity(entity, requeue))
11403 ++ /*
11404 ++ * The parent entity is still backlogged, and
11405 ++ * we don't need to update it as it is still
11406 ++ * under service.
11407 ++ */
11408 ++ break;
11409 ++
11410 ++ if (sd->next_in_service != NULL)
11411 ++ /*
11412 ++ * The parent entity is still backlogged and
11413 ++ * the budgets on the path towards the root
11414 ++ * need to be updated.
11415 ++ */
11416 ++ goto update;
11417 ++
11418 ++ /*
11419 ++ * If we get here, the parent is no longer backlogged and
11420 ++ * we want to propagate the dequeue upwards.
11421 ++ */
11422 ++ requeue = 1;
11423 ++ }
11424 ++
11425 ++ return;
11426 ++
11427 ++update:
11428 ++ entity = parent;
11429 ++ for_each_entity(entity) {
11430 ++ __bfq_activate_entity(entity);
11431 ++
11432 ++ sd = entity->sched_data;
11433 ++ if (!bfq_update_next_in_service(sd))
11434 ++ break;
11435 ++ }
11436 ++}
11437 ++
11438 ++/**
11439 ++ * bfq_update_vtime - update vtime if necessary.
11440 ++ * @st: the service tree to act upon.
11441 ++ *
11442 ++ * If necessary update the service tree vtime to have at least one
11443 ++ * eligible entity, skipping to its start time. Assumes that the
11444 ++ * active tree of the device is not empty.
11445 ++ *
11446 ++ * NOTE: this hierarchical implementation updates vtimes quite often,
11447 ++ * we may end up with reactivated tasks getting timestamps after a
11448 ++ * vtime skip done because we needed a ->first_active entity on some
11449 ++ * intermediate node.
11450 ++ */
11451 ++static void bfq_update_vtime(struct bfq_service_tree *st)
11452 ++{
11453 ++ struct bfq_entity *entry;
11454 ++ struct rb_node *node = st->active.rb_node;
11455 ++
11456 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
11457 ++ if (bfq_gt(entry->min_start, st->vtime)) {
11458 ++ st->vtime = entry->min_start;
11459 ++ bfq_forget_idle(st);
11460 ++ }
11461 ++}
11462 ++
11463 ++/**
11464 ++ * bfq_first_active_entity - find the eligible entity with
11465 ++ * the smallest finish time
11466 ++ * @st: the service tree to select from.
11467 ++ *
11468 ++ * This function searches the first schedulable entity, starting from the
11469 ++ * root of the tree and going on the left every time on this side there is
11470 ++ * a subtree with at least one eligible (start >= vtime) entity. The path
11471 ++ * on the right is followed only if a) the left subtree contains no eligible
11472 ++ * entities and b) no eligible entity has been found yet.
11473 ++ */
11474 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
11475 ++{
11476 ++ struct bfq_entity *entry, *first = NULL;
11477 ++ struct rb_node *node = st->active.rb_node;
11478 ++
11479 ++ while (node != NULL) {
11480 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
11481 ++left:
11482 ++ if (!bfq_gt(entry->start, st->vtime))
11483 ++ first = entry;
11484 ++
11485 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
11486 ++
11487 ++ if (node->rb_left != NULL) {
11488 ++ entry = rb_entry(node->rb_left,
11489 ++ struct bfq_entity, rb_node);
11490 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
11491 ++ node = node->rb_left;
11492 ++ goto left;
11493 ++ }
11494 ++ }
11495 ++ if (first != NULL)
11496 ++ break;
11497 ++ node = node->rb_right;
11498 ++ }
11499 ++
11500 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
11501 ++ return first;
11502 ++}
11503 ++
11504 ++/**
11505 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
11506 ++ * @st: the service tree.
11507 ++ *
11508 ++ * Update the virtual time in @st and return the first eligible entity
11509 ++ * it contains.
11510 ++ */
11511 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
11512 ++ bool force)
11513 ++{
11514 ++ struct bfq_entity *entity, *new_next_in_service = NULL;
11515 ++
11516 ++ if (RB_EMPTY_ROOT(&st->active))
11517 ++ return NULL;
11518 ++
11519 ++ bfq_update_vtime(st);
11520 ++ entity = bfq_first_active_entity(st);
11521 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
11522 ++
11523 ++ /*
11524 ++ * If the chosen entity does not match with the sched_data's
11525 ++ * next_in_service and we are forcedly serving the IDLE priority
11526 ++ * class tree, bubble up budget update.
11527 ++ */
11528 ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
11529 ++ new_next_in_service = entity;
11530 ++ for_each_entity(new_next_in_service)
11531 ++ bfq_update_budget(new_next_in_service);
11532 ++ }
11533 ++
11534 ++ return entity;
11535 ++}
11536 ++
11537 ++/**
11538 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
11539 ++ * @sd: the sched_data.
11540 ++ * @extract: if true the returned entity will be also extracted from @sd.
11541 ++ *
11542 ++ * NOTE: since we cache the next_in_service entity at each level of the
11543 ++ * hierarchy, the complexity of the lookup can be decreased with
11544 ++ * absolutely no effort just returning the cached next_in_service value;
11545 ++ * we prefer to do full lookups to test the consistency of the data
11546 ++ * structures.
11547 ++ */
11548 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11549 ++ int extract,
11550 ++ struct bfq_data *bfqd)
11551 ++{
11552 ++ struct bfq_service_tree *st = sd->service_tree;
11553 ++ struct bfq_entity *entity;
11554 ++ int i = 0;
11555 ++
11556 ++ BUG_ON(sd->in_service_entity != NULL);
11557 ++
11558 ++ if (bfqd != NULL &&
11559 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
11560 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
11561 ++ true);
11562 ++ if (entity != NULL) {
11563 ++ i = BFQ_IOPRIO_CLASSES - 1;
11564 ++ bfqd->bfq_class_idle_last_service = jiffies;
11565 ++ sd->next_in_service = entity;
11566 ++ }
11567 ++ }
11568 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
11569 ++ entity = __bfq_lookup_next_entity(st + i, false);
11570 ++ if (entity != NULL) {
11571 ++ if (extract) {
11572 ++ bfq_check_next_in_service(sd, entity);
11573 ++ bfq_active_extract(st + i, entity);
11574 ++ sd->in_service_entity = entity;
11575 ++ sd->next_in_service = NULL;
11576 ++ }
11577 ++ break;
11578 ++ }
11579 ++ }
11580 ++
11581 ++ return entity;
11582 ++}
11583 ++
11584 ++/*
11585 ++ * Get next queue for service.
11586 ++ */
11587 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
11588 ++{
11589 ++ struct bfq_entity *entity = NULL;
11590 ++ struct bfq_sched_data *sd;
11591 ++ struct bfq_queue *bfqq;
11592 ++
11593 ++ BUG_ON(bfqd->in_service_queue != NULL);
11594 ++
11595 ++ if (bfqd->busy_queues == 0)
11596 ++ return NULL;
11597 ++
11598 ++ sd = &bfqd->root_group->sched_data;
11599 ++ for (; sd != NULL; sd = entity->my_sched_data) {
11600 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
11601 ++ BUG_ON(entity == NULL);
11602 ++ entity->service = 0;
11603 ++ }
11604 ++
11605 ++ bfqq = bfq_entity_to_bfqq(entity);
11606 ++ BUG_ON(bfqq == NULL);
11607 ++
11608 ++ return bfqq;
11609 ++}
11610 ++
11611 ++/*
11612 ++ * Forced extraction of the given queue.
11613 ++ */
11614 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
11615 ++ struct bfq_queue *bfqq)
11616 ++{
11617 ++ struct bfq_entity *entity;
11618 ++ struct bfq_sched_data *sd;
11619 ++
11620 ++ BUG_ON(bfqd->in_service_queue != NULL);
11621 ++
11622 ++ entity = &bfqq->entity;
11623 ++ /*
11624 ++ * Bubble up extraction/update from the leaf to the root.
11625 ++ */
11626 ++ for_each_entity(entity) {
11627 ++ sd = entity->sched_data;
11628 ++ bfq_update_budget(entity);
11629 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
11630 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
11631 ++ sd->in_service_entity = entity;
11632 ++ sd->next_in_service = NULL;
11633 ++ entity->service = 0;
11634 ++ }
11635 ++
11636 ++ return;
11637 ++}
11638 ++
11639 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
11640 ++{
11641 ++ if (bfqd->in_service_bic != NULL) {
11642 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
11643 ++ bfqd->in_service_bic = NULL;
11644 ++ }
11645 ++
11646 ++ bfqd->in_service_queue = NULL;
11647 ++ del_timer(&bfqd->idle_slice_timer);
11648 ++}
11649 ++
11650 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11651 ++ int requeue)
11652 ++{
11653 ++ struct bfq_entity *entity = &bfqq->entity;
11654 ++
11655 ++ if (bfqq == bfqd->in_service_queue)
11656 ++ __bfq_bfqd_reset_in_service(bfqd);
11657 ++
11658 ++ bfq_deactivate_entity(entity, requeue);
11659 ++}
11660 ++
11661 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11662 ++{
11663 ++ struct bfq_entity *entity = &bfqq->entity;
11664 ++
11665 ++ bfq_activate_entity(entity);
11666 ++}
11667 ++
11668 ++/*
11669 ++ * Called when the bfqq no longer has requests pending, remove it from
11670 ++ * the service tree.
11671 ++ */
11672 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11673 ++ int requeue)
11674 ++{
11675 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
11676 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
11677 ++
11678 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
11679 ++
11680 ++ bfq_clear_bfqq_busy(bfqq);
11681 ++
11682 ++ BUG_ON(bfqd->busy_queues == 0);
11683 ++ bfqd->busy_queues--;
11684 ++ if (bfqq->raising_coeff > 1)
11685 ++ bfqd->raised_busy_queues--;
11686 ++
11687 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
11688 ++}
11689 ++
11690 ++/*
11691 ++ * Called when an inactive queue receives a new request.
11692 ++ */
11693 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11694 ++{
11695 ++ BUG_ON(bfq_bfqq_busy(bfqq));
11696 ++ BUG_ON(bfqq == bfqd->in_service_queue);
11697 ++
11698 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
11699 ++
11700 ++ bfq_activate_bfqq(bfqd, bfqq);
11701 ++
11702 ++ bfq_mark_bfqq_busy(bfqq);
11703 ++ bfqd->busy_queues++;
11704 ++ if (bfqq->raising_coeff > 1)
11705 ++ bfqd->raised_busy_queues++;
11706 ++}
11707 +diff --git a/block/bfq.h b/block/bfq.h
11708 +new file mode 100644
11709 +index 0000000..f9b5881
11710 +--- /dev/null
11711 ++++ b/block/bfq.h
11712 +@@ -0,0 +1,614 @@
11713 ++/*
11714 ++ * BFQ-v7r1 for 3.13.0: data structures and common functions prototypes.
11715 ++ *
11716 ++ * Based on ideas and code from CFQ:
11717 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11718 ++ *
11719 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11720 ++ * Paolo Valente <paolo.valente@×××××××.it>
11721 ++ *
11722 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11723 ++ */
11724 ++
11725 ++#ifndef _BFQ_H
11726 ++#define _BFQ_H
11727 ++
11728 ++#include <linux/blktrace_api.h>
11729 ++#include <linux/hrtimer.h>
11730 ++#include <linux/ioprio.h>
11731 ++#include <linux/rbtree.h>
11732 ++
11733 ++#define BFQ_IOPRIO_CLASSES 3
11734 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
11735 ++
11736 ++#define BFQ_MIN_WEIGHT 1
11737 ++#define BFQ_MAX_WEIGHT 1000
11738 ++
11739 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
11740 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
11741 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
11742 ++
11743 ++struct bfq_entity;
11744 ++
11745 ++/**
11746 ++ * struct bfq_service_tree - per ioprio_class service tree.
11747 ++ * @active: tree for active entities (i.e., those backlogged).
11748 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
11749 ++ * @first_idle: idle entity with minimum F_i.
11750 ++ * @last_idle: idle entity with maximum F_i.
11751 ++ * @vtime: scheduler virtual time.
11752 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
11753 ++ *
11754 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
11755 ++ * ioprio_class has its own independent scheduler, and so its own
11756 ++ * bfq_service_tree. All the fields are protected by the queue lock
11757 ++ * of the containing bfqd.
11758 ++ */
11759 ++struct bfq_service_tree {
11760 ++ struct rb_root active;
11761 ++ struct rb_root idle;
11762 ++
11763 ++ struct bfq_entity *first_idle;
11764 ++ struct bfq_entity *last_idle;
11765 ++
11766 ++ u64 vtime;
11767 ++ unsigned long wsum;
11768 ++};
11769 ++
11770 ++/**
11771 ++ * struct bfq_sched_data - multi-class scheduler.
11772 ++ * @in_service_entity: entity under service.
11773 ++ * @next_in_service: head-of-the-line entity in the scheduler.
11774 ++ * @service_tree: array of service trees, one per ioprio_class.
11775 ++ *
11776 ++ * bfq_sched_data is the basic scheduler queue. It supports three
11777 ++ * ioprio_classes, and can be used either as a toplevel queue or as
11778 ++ * an intermediate queue on a hierarchical setup.
11779 ++ * @next_in_service points to the active entity of the sched_data
11780 ++ * service trees that will be scheduled next.
11781 ++ *
11782 ++ * The supported ioprio_classes are the same as in CFQ, in descending
11783 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
11784 ++ * Requests from higher priority queues are served before all the
11785 ++ * requests from lower priority queues; among requests of the same
11786 ++ * queue requests are served according to B-WF2Q+.
11787 ++ * All the fields are protected by the queue lock of the containing bfqd.
11788 ++ */
11789 ++struct bfq_sched_data {
11790 ++ struct bfq_entity *in_service_entity;
11791 ++ struct bfq_entity *next_in_service;
11792 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
11793 ++};
11794 ++
11795 ++/**
11796 ++ * struct bfq_entity - schedulable entity.
11797 ++ * @rb_node: service_tree member.
11798 ++ * @on_st: flag, true if the entity is on a tree (either the active or
11799 ++ * the idle one of its service_tree).
11800 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
11801 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
11802 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
11803 ++ * @min_start: minimum start time of the (active) subtree rooted at
11804 ++ * this entity; used for O(log N) lookups into active trees.
11805 ++ * @service: service received during the last round of service.
11806 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
11807 ++ * @weight: weight of the queue
11808 ++ * @parent: parent entity, for hierarchical scheduling.
11809 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
11810 ++ * associated scheduler queue, %NULL on leaf nodes.
11811 ++ * @sched_data: the scheduler queue this entity belongs to.
11812 ++ * @ioprio: the ioprio in use.
11813 ++ * @new_weight: when a weight change is requested, the new weight value.
11814 ++ * @orig_weight: original weight, used to implement weight boosting
11815 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
11816 ++ * @ioprio_class: the ioprio_class in use.
11817 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
11818 ++ * ioprio_class value.
11819 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
11820 ++ * ioprio_class change.
11821 ++ *
11822 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
11823 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
11824 ++ * entity belongs to the sched_data of the parent group in the cgroup
11825 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
11826 ++ * in @my_sched_data.
11827 ++ *
11828 ++ * Each entity stores independently its priority values; this would
11829 ++ * allow different weights on different devices, but this
11830 ++ * functionality is not yet exported to userspace. Priorities and
11831 ++ * weights are updated lazily, first storing the new values into the
11832 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
11833 ++ * there is a transition in the entity state that allows the priority
11834 ++ * update to take place the effective and the requested priority
11835 ++ * values are synchronized.
11836 ++ *
11837 ++ * Unless cgroups are used, the weight value is calculated from the
11838 ++ * ioprio to export the same interface as CFQ. When dealing with
11839 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
11840 ++ * time consuming their budget and have true sequential behavior, and
11841 ++ * when there are no external factors breaking anticipation) the
11842 ++ * relative weights at each level of the cgroups hierarchy should be
11843 ++ * guaranteed. All the fields are protected by the queue lock of the
11844 ++ * containing bfqd.
11845 ++ */
11846 ++struct bfq_entity {
11847 ++ struct rb_node rb_node;
11848 ++
11849 ++ int on_st;
11850 ++
11851 ++ u64 finish;
11852 ++ u64 start;
11853 ++
11854 ++ struct rb_root *tree;
11855 ++
11856 ++ u64 min_start;
11857 ++
11858 ++ unsigned long service, budget;
11859 ++ unsigned short weight, new_weight;
11860 ++ unsigned short orig_weight;
11861 ++
11862 ++ struct bfq_entity *parent;
11863 ++
11864 ++ struct bfq_sched_data *my_sched_data;
11865 ++ struct bfq_sched_data *sched_data;
11866 ++
11867 ++ unsigned short ioprio, new_ioprio;
11868 ++ unsigned short ioprio_class, new_ioprio_class;
11869 ++
11870 ++ int ioprio_changed;
11871 ++};
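
To make the lazy update protocol described above concrete, here is a
minimal sketch of how a weight change is requested (not part of the patch;
the real update paths live elsewhere in the patch set, and 300 is just an
example value within the BFQ_MIN_WEIGHT..BFQ_MAX_WEIGHT range):

    /* Request a new weight; the service trees are not touched yet. */
    entity->new_weight = 300;
    entity->ioprio_changed = 1;

    /*
     * On the next (re)activation, __bfq_activate_entity() calls
     * __bfq_entity_update_weight_prio(), which copies the new_* values
     * into the effective fields and moves the entity to the service
     * tree matching its (possibly new) ioprio_class.
     */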
11872 ++
11873 ++struct bfq_group;
11874 ++
11875 ++/**
11876 ++ * struct bfq_queue - leaf schedulable entity.
11877 ++ * @ref: reference counter.
11878 ++ * @bfqd: parent bfq_data.
11879 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
11880 ++ * one or more other queues.
11881 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
11882 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
11883 ++ * @sort_list: sorted list of pending requests.
11884 ++ * @next_rq: if fifo isn't expired, next request to serve.
11885 ++ * @queued: nr of requests queued in @sort_list.
11886 ++ * @allocated: currently allocated requests.
11887 ++ * @meta_pending: pending metadata requests.
11888 ++ * @fifo: fifo list of requests in sort_list.
11889 ++ * @entity: entity representing this queue in the scheduler.
11890 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
11891 ++ * @budget_timeout: budget expiration (in jiffies).
11892 ++ * @dispatched: number of requests on the dispatch list or inside driver.
11893 ++ * @org_ioprio: saved ioprio during boosted periods.
11894 ++ * @flags: status flags.
11895 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
11896 ++ * @seek_samples: number of seeks sampled
11897 ++ * @seek_total: sum of the distances of the seeks sampled
11898 ++ * @seek_mean: mean seek distance
11899 ++ * @last_request_pos: position of the last request enqueued
11900 ++ * @pid: pid of the process owning the queue, used for logging purposes.
11901 ++ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
11902 ++ * @raising_cur_max_time: current max raising time for this queue
11903 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
11904 ++ * idle to backlogged
11905 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
11906 ++ * since the last transition from idle to backlogged
11907 ++ *
11908 ++ * A bfq_queue is a leaf request queue; it can be associated with an io_context
11909 ++ * or more (if it is an async one). @cgroup holds a reference to the
11910 ++ * cgroup, to be sure that it does not disappear while a bfqq still
11911 ++ * references it (mostly to avoid races between request issuing and task
11912 ++ * migration followed by cgroup destruction).
11913 ++ * All the fields are protected by the queue lock of the containing bfqd.
11914 ++ */
11915 ++struct bfq_queue {
11916 ++ atomic_t ref;
11917 ++ struct bfq_data *bfqd;
11918 ++
11919 ++ /* fields for cooperating queues handling */
11920 ++ struct bfq_queue *new_bfqq;
11921 ++ struct rb_node pos_node;
11922 ++ struct rb_root *pos_root;
11923 ++
11924 ++ struct rb_root sort_list;
11925 ++ struct request *next_rq;
11926 ++ int queued[2];
11927 ++ int allocated[2];
11928 ++ int meta_pending;
11929 ++ struct list_head fifo;
11930 ++
11931 ++ struct bfq_entity entity;
11932 ++
11933 ++ unsigned long max_budget;
11934 ++ unsigned long budget_timeout;
11935 ++
11936 ++ int dispatched;
11937 ++
11938 ++ unsigned short org_ioprio;
11939 ++
11940 ++ unsigned int flags;
11941 ++
11942 ++ struct list_head bfqq_list;
11943 ++
11944 ++ unsigned int seek_samples;
11945 ++ u64 seek_total;
11946 ++ sector_t seek_mean;
11947 ++ sector_t last_request_pos;
11948 ++
11949 ++ pid_t pid;
11950 ++
11951 ++ /* weight-raising fields */
11952 ++ unsigned long raising_cur_max_time;
11953 ++ unsigned long soft_rt_next_start;
11954 ++ unsigned long last_rais_start_finish;
11955 ++ unsigned int raising_coeff;
11956 ++ unsigned long last_idle_bklogged;
11957 ++ unsigned long service_from_backlogged;
11958 ++};
11959 ++
11960 ++/**
11961 ++ * struct bfq_ttime - per process thinktime stats.
11962 ++ * @ttime_total: total process thinktime
11963 ++ * @ttime_samples: number of thinktime samples
11964 ++ * @ttime_mean: average process thinktime
11965 ++ */
11966 ++struct bfq_ttime {
11967 ++ unsigned long last_end_request;
11968 ++
11969 ++ unsigned long ttime_total;
11970 ++ unsigned long ttime_samples;
11971 ++ unsigned long ttime_mean;
11972 ++};
11973 ++
11974 ++/**
11975 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
11976 ++ * @icq: associated io_cq structure
11977 ++ * @bfqq: array of two process queues, the sync and the async
11978 ++ * @ttime: associated @bfq_ttime struct
11979 ++ */
11980 ++struct bfq_io_cq {
11981 ++ struct io_cq icq; /* must be the first member */
11982 ++ struct bfq_queue *bfqq[2];
11983 ++ struct bfq_ttime ttime;
11984 ++ int ioprio;
11985 ++};
11986 ++
11987 ++/**
11988 ++ * struct bfq_data - per device data structure.
11989 ++ * @queue: request queue for the managed device.
11990 ++ * @root_group: root bfq_group for the device.
11991 ++ * @rq_pos_tree: rbtree sorted by next_request position,
11992 ++ * used when determining if two or more queues
11993 ++ * have interleaving requests (see bfq_close_cooperator).
11994 ++ * @busy_queues: number of bfq_queues containing requests (including the
11995 ++ * queue under service, even if it is idling).
11996 ++ * @raised_busy_queues: number of weight-raised busy bfq_queues.
11997 ++ * @queued: number of queued requests.
11998 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
11999 ++ * @sync_flight: number of sync requests in the driver.
12000 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
12001 ++ * completed requests.
12002 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
12003 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
12004 ++ * @budgets_assigned: number of budgets assigned.
12005 ++ * @idle_slice_timer: timer set when idling for the next sequential request
12006 ++ * from the queue under service.
12007 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
12008 ++ * @in_service_queue: bfq_queue under service.
12009 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
12010 ++ * @last_position: on-disk position of the last served request.
12011 ++ * @last_budget_start: beginning of the last budget.
12012 ++ * @last_idling_start: beginning of the last idle slice.
12013 ++ * @peak_rate: peak transfer rate observed for a budget.
12014 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
12015 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
12016 ++ * @group_list: list of all the bfq_groups active on the device.
12017 ++ * @active_list: list of all the bfq_queues active on the device.
12018 ++ * @idle_list: list of all the bfq_queues idle on the device.
12019 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
12020 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
12021 ++ * requests are served in fifo order.
12022 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
12023 ++ * @bfq_back_max: maximum allowed backward seek.
12024 ++ * @bfq_slice_idle: maximum idling time.
12025 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
12026 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
12027 ++ * async queues.
12028 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
12029 ++ * prevent seeky queues from imposing long latencies on
12030 ++ * well-behaved ones (this also implies that seeky queues cannot
12031 ++ * receive guarantees in the service domain; after a timeout
12032 ++ * they are charged for the whole allocated budget, to try
12033 ++ * to preserve reasonably fair behavior among them, but
12034 ++ * without service-domain guarantees).
12035 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
12036 ++ * queue is multiplied
12037 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
12038 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
12039 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
12040 ++ * may be reactivated for a queue (in jiffies)
12041 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
12042 ++ * after which weight-raising may be
12043 ++ * reactivated for an already busy queue
12044 ++ * (in jiffies)
12045 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
12046 ++ * in sectors per second
12047 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
12048 ++ * duration of the weight raising automatically
12049 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
12050 ++ *
12051 ++ * All the fields are protected by the @queue lock.
12052 ++ */
12053 ++struct bfq_data {
12054 ++ struct request_queue *queue;
12055 ++
12056 ++ struct bfq_group *root_group;
12057 ++
12058 ++ struct rb_root rq_pos_tree;
12059 ++
12060 ++ int busy_queues;
12061 ++ int raised_busy_queues;
12062 ++ int queued;
12063 ++ int rq_in_driver;
12064 ++ int sync_flight;
12065 ++
12066 ++ int max_rq_in_driver;
12067 ++ int hw_tag_samples;
12068 ++ int hw_tag;
12069 ++
12070 ++ int budgets_assigned;
12071 ++
12072 ++ struct timer_list idle_slice_timer;
12073 ++ struct work_struct unplug_work;
12074 ++
12075 ++ struct bfq_queue *in_service_queue;
12076 ++ struct bfq_io_cq *in_service_bic;
12077 ++
12078 ++ sector_t last_position;
12079 ++
12080 ++ ktime_t last_budget_start;
12081 ++ ktime_t last_idling_start;
12082 ++ int peak_rate_samples;
12083 ++ u64 peak_rate;
12084 ++ unsigned long bfq_max_budget;
12085 ++
12086 ++ struct hlist_head group_list;
12087 ++ struct list_head active_list;
12088 ++ struct list_head idle_list;
12089 ++
12090 ++ unsigned int bfq_quantum;
12091 ++ unsigned int bfq_fifo_expire[2];
12092 ++ unsigned int bfq_back_penalty;
12093 ++ unsigned int bfq_back_max;
12094 ++ unsigned int bfq_slice_idle;
12095 ++ u64 bfq_class_idle_last_service;
12096 ++
12097 ++ unsigned int bfq_user_max_budget;
12098 ++ unsigned int bfq_max_budget_async_rq;
12099 ++ unsigned int bfq_timeout[2];
12100 ++
12101 ++ bool low_latency;
12102 ++
12103 ++ /* parameters of the low_latency heuristics */
12104 ++ unsigned int bfq_raising_coeff;
12105 ++ unsigned int bfq_raising_max_time;
12106 ++ unsigned int bfq_raising_rt_max_time;
12107 ++ unsigned int bfq_raising_min_idle_time;
12108 ++ unsigned long bfq_raising_min_inter_arr_async;
12109 ++ unsigned int bfq_raising_max_softrt_rate;
12110 ++ u64 RT_prod;
12111 ++
12112 ++ struct bfq_queue oom_bfqq;
12113 ++};
12114 ++
12115 ++enum bfqq_state_flags {
12116 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
12117 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
12118 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
12119 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
12120 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
12121 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
12122 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
12123 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
12124 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
12125 ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
12126 ++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
12127 ++};
12128 ++
12129 ++#define BFQ_BFQQ_FNS(name) \
12130 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
12131 ++{ \
12132 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
12133 ++} \
12134 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
12135 ++{ \
12136 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
12137 ++} \
12138 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
12139 ++{ \
12140 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
12141 ++}
12142 ++
12143 ++BFQ_BFQQ_FNS(busy);
12144 ++BFQ_BFQQ_FNS(wait_request);
12145 ++BFQ_BFQQ_FNS(must_alloc);
12146 ++BFQ_BFQQ_FNS(fifo_expire);
12147 ++BFQ_BFQQ_FNS(idle_window);
12148 ++BFQ_BFQQ_FNS(prio_changed);
12149 ++BFQ_BFQQ_FNS(sync);
12150 ++BFQ_BFQQ_FNS(budget_new);
12151 ++BFQ_BFQQ_FNS(coop);
12152 ++BFQ_BFQQ_FNS(split_coop);
12153 ++BFQ_BFQQ_FNS(softrt_update);
12154 ++#undef BFQ_BFQQ_FNS
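
For reference, a single instantiation such as BFQ_BFQQ_FNS(busy) expands
to the following three accessors (shown only as an illustration of the
macro above, not as additional patch content):

    static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
    {
            (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
    }
    static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
    {
            (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
    }
    static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
    {
            return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
    }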
12155 ++
12156 ++/* Logging facilities. */
12157 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
12158 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
12159 ++
12160 ++#define bfq_log(bfqd, fmt, args...) \
12161 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
12162 ++
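Both logging macros wrap blk_add_trace_msg(), so their output appears in
the blktrace stream of the device being scheduled; a typical call, as
already used in bfq_add_bfqq_busy() above, is simply:

    bfq_log_bfqq(bfqd, bfqq, "add to busy");
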
12163 ++/* Expiration reasons. */
12164 ++enum bfqq_expiration {
12165 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
12166 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
12167 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
12168 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
12169 ++};
12170 ++
12171 ++#ifdef CONFIG_CGROUP_BFQIO
12172 ++/**
12173 ++ * struct bfq_group - per (device, cgroup) data structure.
12174 ++ * @entity: schedulable entity to insert into the parent group sched_data.
12175 ++ * @sched_data: own sched_data, to contain child entities (they may be
12176 ++ * both bfq_queues and bfq_groups).
12177 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
12178 ++ * list of the containing cgroup's bfqio_cgroup.
12179 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
12180 ++ * of the groups active on the same device; used for cleanup.
12181 ++ * @bfqd: the bfq_data for the device this group acts upon.
12182 ++ * @async_bfqq: array of async queues for all the tasks belonging to
12183 ++ * the group, one queue per ioprio value per ioprio_class,
12184 ++ * except for the idle class that has only one queue.
12185 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
12186 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
12187 ++ * to avoid too many special cases during group creation/migration.
12188 ++ *
12189 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
12190 ++ * there is a set of bfq_groups, each one collecting the lower-level
12191 ++ * entities belonging to the group that are acting on the same device.
12192 ++ *
12193 ++ * Locking works as follows:
12194 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
12195 ++ * via RCU from its readers.
12196 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
12197 ++ * from the readers.
12198 ++ * o All the other fields are protected by the @bfqd queue lock.
12199 ++ */
12200 ++struct bfq_group {
12201 ++ struct bfq_entity entity;
12202 ++ struct bfq_sched_data sched_data;
12203 ++
12204 ++ struct hlist_node group_node;
12205 ++ struct hlist_node bfqd_node;
12206 ++
12207 ++ void *bfqd;
12208 ++
12209 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12210 ++ struct bfq_queue *async_idle_bfqq;
12211 ++
12212 ++ struct bfq_entity *my_entity;
12213 ++};
12214 ++
12215 ++/**
12216 ++ * struct bfqio_cgroup - bfq cgroup data structure.
12217 ++ * @css: subsystem state for bfq in the containing cgroup.
12218 ++ * @online: flag marked when the subsystem is inserted.
12219 ++ * @weight: cgroup weight.
12220 ++ * @ioprio: cgroup ioprio.
12221 ++ * @ioprio_class: cgroup ioprio_class.
12222 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
12223 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
12224 ++ *
12225 ++ * @group_data is accessed using RCU, with @lock protecting the updates,
12226 ++ * @ioprio and @ioprio_class are protected by @lock.
12227 ++ */
12228 ++struct bfqio_cgroup {
12229 ++ struct cgroup_subsys_state css;
12230 ++ bool online;
12231 ++
12232 ++ unsigned short weight, ioprio, ioprio_class;
12233 ++
12234 ++ spinlock_t lock;
12235 ++ struct hlist_head group_data;
12236 ++};
12237 ++#else
12238 ++struct bfq_group {
12239 ++ struct bfq_sched_data sched_data;
12240 ++
12241 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12242 ++ struct bfq_queue *async_idle_bfqq;
12243 ++};
12244 ++#endif
12245 ++
12246 ++static inline struct bfq_service_tree *
12247 ++bfq_entity_service_tree(struct bfq_entity *entity)
12248 ++{
12249 ++ struct bfq_sched_data *sched_data = entity->sched_data;
12250 ++ unsigned int idx = entity->ioprio_class - 1;
12251 ++
12252 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
12253 ++ BUG_ON(sched_data == NULL);
12254 ++
12255 ++ return sched_data->service_tree + idx;
12256 ++}
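
Given the kernel's ioprio class numbering (IOPRIO_CLASS_RT = 1,
IOPRIO_CLASS_BE = 2, IOPRIO_CLASS_IDLE = 3), the "ioprio_class - 1"
computation above maps classes to trees as follows:

    IOPRIO_CLASS_RT   -> sched_data->service_tree[0]
    IOPRIO_CLASS_BE   -> sched_data->service_tree[1]
    IOPRIO_CLASS_IDLE -> sched_data->service_tree[2]

which matches the descending priority order documented for struct
bfq_sched_data and the "st + BFQ_IOPRIO_CLASSES - 1" lookup used for the
idle class in bfq_lookup_next_entity().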
12257 ++
12258 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
12259 ++ int is_sync)
12260 ++{
12261 ++ return bic->bfqq[!!is_sync];
12262 ++}
12263 ++
12264 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
12265 ++ struct bfq_queue *bfqq, int is_sync)
12266 ++{
12267 ++ bic->bfqq[!!is_sync] = bfqq;
12268 ++}
12269 ++
12270 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
12271 ++{
12272 ++ return bic->icq.q->elevator->elevator_data;
12273 ++}
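
A minimal sketch of how these helpers are typically combined (not part of
the patch; rq, bfqg and the GFP flag are illustrative, and bfq_get_queue()
is the function forward-declared at the end of this header):

    int is_sync = rq_is_sync(rq);
    struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);

    if (bfqq == NULL) {
            bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, GFP_ATOMIC);
            bic_set_bfqq(bic, bfqq, is_sync);
    }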
12274 ++
12275 ++/**
12276 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
12277 ++ * @ptr: a pointer to a bfqd.
12278 ++ * @flags: storage for the flags to be saved.
12279 ++ *
12280 ++ * This function allows bfqg->bfqd to be protected by the
12281 ++ * queue lock of the bfqd they reference; the pointer is dereferenced
12282 ++ * under RCU, so the storage for bfqd is assured to be safe as long
12283 ++ * as the RCU read side critical section does not end. After the
12284 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12285 ++ * sure that no other writer accessed it. If we raced with a writer,
12286 ++ * the function returns NULL, with the queue unlocked, otherwise it
12287 ++ * returns the dereferenced pointer, with the queue locked.
12288 ++ */
12289 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12290 ++ unsigned long *flags)
12291 ++{
12292 ++ struct bfq_data *bfqd;
12293 ++
12294 ++ rcu_read_lock();
12295 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12296 ++
12297 ++ if (bfqd != NULL) {
12298 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12299 ++ if (*ptr == bfqd)
12300 ++ goto out;
12301 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12302 ++ }
12303 ++
12304 ++ bfqd = NULL;
12305 ++out:
12306 ++ rcu_read_unlock();
12307 ++ return bfqd;
12308 ++}
12309 ++
12310 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12311 ++ unsigned long *flags)
12312 ++{
12313 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12314 ++}
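
A minimal usage sketch of the pair above (not part of the patch; bfqg
stands for any object that publishes a bfqd pointer via RCU, as struct
bfq_group does):

    unsigned long flags;
    struct bfq_data *bfqd;

    bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
    if (bfqd != NULL) {
            /* Queue lock held and the pointer is known to be current. */
            /* ... work on bfqd ... */
            bfq_put_bfqd_unlock(bfqd, &flags);
    }
    /* A NULL return means a writer won the race; the caller just bails out. */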
12315 ++
12316 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12317 ++static void bfq_put_queue(struct bfq_queue *bfqq);
12318 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12319 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12320 ++ struct bfq_group *bfqg, int is_sync,
12321 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
12322 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12323 ++ struct bfq_group *bfqg);
12324 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12325 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12326 ++#endif
12327 +--
12328 +1.8.5.2
12329 +
12330
12331 Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
12332 ===================================================================
12333 --- genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-02-07 14:46:59 UTC (rev 2665)
12334 +++ genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666)
12335 @@ -1,1034 +0,0 @@
12336 -From 3cd9e2ea29c3ba9e420556e8ecf161d166186b63 Mon Sep 17 00:00:00 2001
12337 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
12338 -Date: Thu, 23 Jan 2014 16:54:44 +0100
12339 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7 for
12340 - 3.13.0
12341 -
12342 -A set of processes may happen to perform interleaved reads, i.e., requests
12343 -whose union would give rise to a sequential read pattern. There are two
12344 -typical cases: in the first case, processes read fixed-size chunks of
12345 -data at a fixed distance from each other, while in the second case processes
12346 -may read variable-size chunks at variable distances. The latter case occurs
12347 -for example with KVM, which splits the I/O generated by the guest into
12348 -multiple chunks, and lets these chunks be served by a pool of cooperating
12349 -processes, iteratively assigning the next chunk of I/O to the first
12350 -available process. CFQ uses actual queue merging for the first type of
12351 -rocesses, whereas it uses preemption to get a sequential read pattern out
12352 -of the read requests performed by the second type of processes. In the end
12353 -it uses two different mechanisms to achieve the same goal: boosting the
12354 -throughput with interleaved I/O.
12355 -
12356 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
12357 -sequential read pattern with both types of processes. The main idea is
12358 -checking newly arrived requests against the next request of the active queue
12359 -both in case of actual request insert and in case of request merge. By doing
12360 -so, both the types of processes can be handled by just merging their queues.
12361 -EQM is then simpler and more compact than the pair of mechanisms used in
12362 -CFQ.
12363 -
12364 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
12365 -properly restoring the weight-raising state of a queue when it gets back to
12366 -a non-merged state.
12367 -
12368 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
12369 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
12370 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
12371 ----
12372 - block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
12373 - block/bfq-sched.c | 28 ---
12374 - block/bfq.h | 16 ++
12375 - 3 files changed, 474 insertions(+), 227 deletions(-)
12376 -
12377 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
12378 -index 7670400..295236e 100644
12379 ---- a/block/bfq-iosched.c
12380 -+++ b/block/bfq-iosched.c
12381 -@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
12382 - return dur;
12383 - }
12384 -
12385 -+static inline void
12386 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
12387 -+{
12388 -+ if (bic->saved_idle_window)
12389 -+ bfq_mark_bfqq_idle_window(bfqq);
12390 -+ else
12391 -+ bfq_clear_bfqq_idle_window(bfqq);
12392 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
12393 -+ /*
12394 -+ * Start a weight raising period with the duration given by
12395 -+ * the raising_time_left snapshot.
12396 -+ */
12397 -+ if (bfq_bfqq_busy(bfqq))
12398 -+ bfqq->bfqd->raised_busy_queues++;
12399 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
12400 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
12401 -+ bfqq->last_rais_start_finish = jiffies;
12402 -+ bfqq->entity.ioprio_changed = 1;
12403 -+ }
12404 -+ /*
12405 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
12406 -+ * getting confused about the queue's need of a weight-raising
12407 -+ * period.
12408 -+ */
12409 -+ bic->raising_time_left = 0;
12410 -+}
12411 -+
12412 -+/*
12413 -+ * Must be called with the queue_lock held.
12414 -+ */
12415 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
12416 -+{
12417 -+ int process_refs, io_refs;
12418 -+
12419 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12420 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12421 -+ BUG_ON(process_refs < 0);
12422 -+ return process_refs;
12423 -+}
12424 -+
12425 - static void bfq_add_rq_rb(struct request *rq)
12426 - {
12427 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
12428 -@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
12429 - if (!bfqd->low_latency)
12430 - goto add_bfqq_busy;
12431 -
12432 -+ if (bfq_bfqq_just_split(bfqq))
12433 -+ goto set_ioprio_changed;
12434 -+
12435 - /*
12436 -- * If the queue is not being boosted and has been idle
12437 -- * for enough time, start a weight-raising period
12438 -+ * If the queue:
12439 -+ * - is not being boosted,
12440 -+ * - has been idle for enough time,
12441 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
12442 -+ * shared "for its nature" or it is not shared and its
12443 -+ * requests have not been redirected to a shared queue)
12444 -+ * start a weight-raising period.
12445 - */
12446 -- if (old_raising_coeff == 1 &&
12447 -- (idle_for_long_time || soft_rt)) {
12448 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
12449 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
12450 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
12451 - if (idle_for_long_time)
12452 - bfqq->raising_cur_max_time =
12453 -@@ -572,6 +620,7 @@ static void bfq_add_rq_rb(struct request *rq)
12454 - bfqd->bfq_raising_rt_max_time;
12455 - }
12456 - }
12457 -+set_ioprio_changed:
12458 - if (old_raising_coeff != bfqq->raising_coeff)
12459 - entity->ioprio_changed = 1;
12460 - add_bfqq_busy:
12461 -@@ -754,90 +803,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
12462 - spin_unlock_irq(bfqd->queue->queue_lock);
12463 - }
12464 -
12465 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12466 -- struct bio *bio)
12467 --{
12468 -- struct bfq_data *bfqd = q->elevator->elevator_data;
12469 -- struct bfq_io_cq *bic;
12470 -- struct bfq_queue *bfqq;
12471 --
12472 -- /*
12473 -- * Disallow merge of a sync bio into an async request.
12474 -- */
12475 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12476 -- return 0;
12477 --
12478 -- /*
12479 -- * Lookup the bfqq that this bio will be queued with. Allow
12480 -- * merge only if rq is queued there.
12481 -- * Queue lock is held here.
12482 -- */
12483 -- bic = bfq_bic_lookup(bfqd, current->io_context);
12484 -- if (bic == NULL)
12485 -- return 0;
12486 --
12487 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12488 -- return bfqq == RQ_BFQQ(rq);
12489 --}
12490 --
12491 --static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
12492 -- struct bfq_queue *bfqq)
12493 --{
12494 -- if (bfqq != NULL) {
12495 -- bfq_mark_bfqq_must_alloc(bfqq);
12496 -- bfq_mark_bfqq_budget_new(bfqq);
12497 -- bfq_clear_bfqq_fifo_expire(bfqq);
12498 --
12499 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12500 --
12501 -- bfq_log_bfqq(bfqd, bfqq,
12502 -- "set_in_service_queue, cur-budget = %lu",
12503 -- bfqq->entity.budget);
12504 -- }
12505 --
12506 -- bfqd->in_service_queue = bfqq;
12507 --}
12508 --
12509 --/*
12510 -- * Get and set a new queue for service.
12511 -- */
12512 --static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
12513 -- struct bfq_queue *bfqq)
12514 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
12515 - {
12516 -- if (!bfqq)
12517 -- bfqq = bfq_get_next_queue(bfqd);
12518 -+ if (request)
12519 -+ return blk_rq_pos(io_struct);
12520 - else
12521 -- bfq_get_next_queue_forced(bfqd, bfqq);
12522 --
12523 -- __bfq_set_in_service_queue(bfqd, bfqq);
12524 -- return bfqq;
12525 -+ return ((struct bio *)io_struct)->bi_sector;
12526 - }
12527 -
12528 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
12529 -- struct request *rq)
12530 -+static inline sector_t bfq_dist_from(sector_t pos1,
12531 -+ sector_t pos2)
12532 - {
12533 -- if (blk_rq_pos(rq) >= bfqd->last_position)
12534 -- return blk_rq_pos(rq) - bfqd->last_position;
12535 -+ if (pos1 >= pos2)
12536 -+ return pos1 - pos2;
12537 - else
12538 -- return bfqd->last_position - blk_rq_pos(rq);
12539 -+ return pos2 - pos1;
12540 - }
12541 -
12542 --/*
12543 -- * Return true if bfqq has no request pending and rq is close enough to
12544 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
12545 -- * bfqq->next_rq
12546 -- */
12547 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
12548 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
12549 -+ sector_t sector)
12550 - {
12551 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
12552 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
12553 -+ BFQQ_SEEK_THR;
12554 - }
12555 -
12556 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12557 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
12558 - {
12559 - struct rb_root *root = &bfqd->rq_pos_tree;
12560 - struct rb_node *parent, *node;
12561 - struct bfq_queue *__bfqq;
12562 -- sector_t sector = bfqd->last_position;
12563 -
12564 - if (RB_EMPTY_ROOT(root))
12565 - return NULL;
12566 -@@ -856,7 +850,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12567 - * position).
12568 - */
12569 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
12570 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12571 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12572 - return __bfqq;
12573 -
12574 - if (blk_rq_pos(__bfqq->next_rq) < sector)
12575 -@@ -867,7 +861,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12576 - return NULL;
12577 -
12578 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
12579 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12580 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12581 - return __bfqq;
12582 -
12583 - return NULL;
12584 -@@ -876,14 +870,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12585 - /*
12586 - * bfqd - obvious
12587 - * cur_bfqq - passed in so that we don't decide that the current queue
12588 -- * is closely cooperating with itself.
12589 -- *
12590 -- * We are assuming that cur_bfqq has dispatched at least one request,
12591 -- * and that bfqd->last_position reflects a position on the disk associated
12592 -- * with the I/O issued by cur_bfqq.
12593 -+ * is closely cooperating with itself
12594 -+ * sector - used as a reference point to search for a close queue
12595 - */
12596 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12597 -- struct bfq_queue *cur_bfqq)
12598 -+ struct bfq_queue *cur_bfqq,
12599 -+ sector_t sector)
12600 - {
12601 - struct bfq_queue *bfqq;
12602 -
12603 -@@ -903,7 +895,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12604 - * working closely on the same area of the disk. In that case,
12605 - * we can group them together and don't waste time idling.
12606 - */
12607 -- bfqq = bfqq_close(bfqd);
12608 -+ bfqq = bfqq_close(bfqd, sector);
12609 - if (bfqq == NULL || bfqq == cur_bfqq)
12610 - return NULL;
12611 -
12612 -@@ -930,6 +922,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12613 - return bfqq;
12614 - }
12615 -
12616 -+static struct bfq_queue *
12617 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12618 -+{
12619 -+ int process_refs, new_process_refs;
12620 -+ struct bfq_queue *__bfqq;
12621 -+
12622 -+ /*
12623 -+ * If there are no process references on the new_bfqq, then it is
12624 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12625 -+ * may have dropped their last reference (not just their last process
12626 -+ * reference).
12627 -+ */
12628 -+ if (!bfqq_process_refs(new_bfqq))
12629 -+ return NULL;
12630 -+
12631 -+ /* Avoid a circular list and skip interim queue merges. */
12632 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
12633 -+ if (__bfqq == bfqq)
12634 -+ return NULL;
12635 -+ new_bfqq = __bfqq;
12636 -+ }
12637 -+
12638 -+ process_refs = bfqq_process_refs(bfqq);
12639 -+ new_process_refs = bfqq_process_refs(new_bfqq);
12640 -+ /*
12641 -+ * If the process for the bfqq has gone away, there is no
12642 -+ * sense in merging the queues.
12643 -+ */
12644 -+ if (process_refs == 0 || new_process_refs == 0)
12645 -+ return NULL;
12646 -+
12647 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12648 -+ new_bfqq->pid);
12649 -+
12650 -+ /*
12651 -+ * Merging is just a redirection: the requests of the process owning
12652 -+ * one of the two queues are redirected to the other queue. The latter
12653 -+ * queue, in its turn, is set as shared if this is the first time that
12654 -+ * the requests of some process are redirected to it.
12655 -+ *
12656 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
12657 -+ * are in the context of the process owning bfqq, hence we have the
12658 -+ * io_cq of this process. So we can immediately configure this io_cq
12659 -+ * to redirect the requests of the process to new_bfqq.
12660 -+ *
12661 -+ * NOTE, even if new_bfqq coincides with the in-service queue, the
12662 -+ * io_cq of new_bfqq is not available, because, if the in-service queue
12663 -+ * is shared, bfqd->in_service_bic may not point to the io_cq of the
12664 -+ * in-service queue.
12665 -+ * Redirecting the requests of the process owning bfqq to the currently
12666 -+ * in-service queue is in any case the best option, as we feed the
12667 -+ * in-service queue with new requests close to the last request served
12668 -+ * and, by doing so, hopefully increase the throughput.
12669 -+ */
12670 -+ bfqq->new_bfqq = new_bfqq;
12671 -+ atomic_add(process_refs, &new_bfqq->ref);
12672 -+ return new_bfqq;
12673 -+}
12674 -+
12675 -+/*
12676 -+ * Attempt to schedule a merge of bfqq with the currently in-service queue or
12677 -+ * with a close queue among the scheduled queues.
12678 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
12679 -+ * structure otherwise.
12680 -+ */
12681 -+static struct bfq_queue *
12682 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12683 -+ void *io_struct, bool request)
12684 -+{
12685 -+ struct bfq_queue *in_service_bfqq, *new_bfqq;
12686 -+
12687 -+ if (bfqq->new_bfqq)
12688 -+ return bfqq->new_bfqq;
12689 -+
12690 -+ if (!io_struct)
12691 -+ return NULL;
12692 -+
12693 -+ in_service_bfqq = bfqd->in_service_queue;
12694 -+
12695 -+ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
12696 -+ !bfqd->in_service_bic)
12697 -+ goto check_scheduled;
12698 -+
12699 -+ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
12700 -+ goto check_scheduled;
12701 -+
12702 -+ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
12703 -+ goto check_scheduled;
12704 -+
12705 -+ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
12706 -+ goto check_scheduled;
12707 -+
12708 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
12709 -+ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
12710 -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
12711 -+ if (new_bfqq != NULL)
12712 -+ return new_bfqq; /* Merge with the in-service queue */
12713 -+ }
12714 -+
12715 -+ /*
12716 -+ * Check whether there is a cooperator among currently scheduled
12717 -+ * queues. The only thing we need is that the bio/request is not
12718 -+ * NULL, as we need it to establish whether a cooperator exists.
12719 -+ */
12720 -+check_scheduled:
12721 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
12722 -+ bfq_io_struct_pos(io_struct, request));
12723 -+ if (new_bfqq)
12724 -+ return bfq_setup_merge(bfqq, new_bfqq);
12725 -+
12726 -+ return NULL;
12727 -+}
12728 -+
12729 -+static inline void
12730 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
12731 -+{
12732 -+ /*
12733 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
12734 -+ * have already been redirected to a shared queue; both idle window
12735 -+ * and weight raising state have already been saved. Do nothing.
12736 -+ */
12737 -+ if (bfqq->bic == NULL)
12738 -+ return;
12739 -+ if (bfqq->bic->raising_time_left)
12740 -+ /*
12741 -+ * This is the queue of a just-started process, and would
12742 -+ * deserve weight raising: we set raising_time_left to the full
12743 -+ * weight-raising duration to trigger weight-raising when and
12744 -+ * if the queue is split and the first request of the queue
12745 -+ * is enqueued.
12746 -+ */
12747 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
12748 -+ else if (bfqq->raising_coeff > 1) {
12749 -+ unsigned long wrais_duration =
12750 -+ jiffies - bfqq->last_rais_start_finish;
12751 -+ /*
12752 -+ * It may happen that a queue's weight raising period lasts
12753 -+ * longer than its raising_cur_max_time, as weight raising is
12754 -+ * handled only when a request is enqueued or dispatched (it
12755 -+ * does not use any timer). If the weight raising period is
12756 -+ * about to end, don't save it.
12757 -+ */
12758 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
12759 -+ bfqq->bic->raising_time_left = 0;
12760 -+ else
12761 -+ bfqq->bic->raising_time_left =
12762 -+ bfqq->raising_cur_max_time - wrais_duration;
12763 -+ /*
12764 -+ * The bfq_queue is becoming shared or the requests of the
12765 -+ * process owning the queue are being redirected to a shared
12766 -+ * queue. Stop the weight raising period of the queue, as in
12767 -+ * both cases it should not be owned by an interactive or soft
12768 -+ * real-time application.
12769 -+ */
12770 -+ bfq_bfqq_end_raising(bfqq);
12771 -+ } else
12772 -+ bfqq->bic->raising_time_left = 0;
12773 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
12774 -+}
12775 -+
12776 -+static inline void
12777 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
12778 -+{
12779 -+ /*
12780 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
12781 -+ * is about to begin using a shared bfq_queue.
12782 -+ */
12783 -+ if (bfqq->bic)
12784 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
12785 -+}
12786 -+
12787 -+static void
12788 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
12789 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12790 -+{
12791 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
12792 -+ (long unsigned)new_bfqq->pid);
12793 -+ /* Save weight raising and idle window of the merged queues */
12794 -+ bfq_bfqq_save_state(bfqq);
12795 -+ bfq_bfqq_save_state(new_bfqq);
12796 -+ /*
12797 -+ * Grab a reference to the bic, to prevent it from being destroyed
12798 -+ * before being possibly touched by a bfq_split_bfqq().
12799 -+ */
12800 -+ bfq_get_bic_reference(bfqq);
12801 -+ bfq_get_bic_reference(new_bfqq);
12802 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
12803 -+ bic_set_bfqq(bic, new_bfqq, 1);
12804 -+ bfq_mark_bfqq_coop(new_bfqq);
12805 -+ /*
12806 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
12807 -+ * new_bfqq->bic to NULL. bfqq either:
12808 -+ * - does not belong to any bic any more, and hence bfqq->bic must
12809 -+ * be set to NULL, or
12810 -+ * - is a queue whose owning bics have already been redirected to a
12811 -+ * different queue, hence the queue is destined to not belong to any
12812 -+ * bic soon and bfqq->bic is already NULL (therefore the next
12813 -+ * assignment causes no harm).
12814 -+ */
12815 -+ new_bfqq->bic = NULL;
12816 -+ bfqq->bic = NULL;
12817 -+ bfq_put_queue(bfqq);
12818 -+}
12819 -+
12820 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12821 -+ struct bio *bio)
12822 -+{
12823 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
12824 -+ struct bfq_io_cq *bic;
12825 -+ struct bfq_queue *bfqq, *new_bfqq;
12826 -+
12827 -+ /*
12828 -+ * Disallow merge of a sync bio into an async request.
12829 -+ */
12830 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12831 -+ return 0;
12832 -+
12833 -+ /*
12834 -+ * Lookup the bfqq that this bio will be queued with. Allow
12835 -+ * merge only if rq is queued there.
12836 -+ * Queue lock is held here.
12837 -+ */
12838 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
12839 -+ if (bic == NULL)
12840 -+ return 0;
12841 -+
12842 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12843 -+ /*
12844 -+ * We take advantage of this function to perform an early merge
12845 -+ * of the queues of possible cooperating processes.
12846 -+ */
12847 -+ if (bfqq != NULL) {
12848 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
12849 -+ if (new_bfqq != NULL) {
12850 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
12851 -+ /*
12852 -+ * If we get here, the bio will be queued in the shared queue,
12853 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
12854 -+ * rq can be merged.
12855 -+ */
12856 -+ bfqq = new_bfqq;
12857 -+ }
12858 -+ }
12859 -+
12860 -+ return bfqq == RQ_BFQQ(rq);
12861 -+}
12862 -+
12863 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
12864 -+ struct bfq_queue *bfqq)
12865 -+{
12866 -+ if (bfqq != NULL) {
12867 -+ bfq_mark_bfqq_must_alloc(bfqq);
12868 -+ bfq_mark_bfqq_budget_new(bfqq);
12869 -+ bfq_clear_bfqq_fifo_expire(bfqq);
12870 -+
12871 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12872 -+
12873 -+ bfq_log_bfqq(bfqd, bfqq,
12874 -+ "set_in_service_queue, cur-budget = %lu",
12875 -+ bfqq->entity.budget);
12876 -+ }
12877 -+
12878 -+ bfqd->in_service_queue = bfqq;
12879 -+}
12880 -+
12881 -+/*
12882 -+ * Get and set a new queue for service.
12883 -+ */
12884 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
12885 -+{
12886 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
12887 -+
12888 -+ __bfq_set_in_service_queue(bfqd, bfqq);
12889 -+ return bfqq;
12890 -+}
12891 -+
12892 - /*
12893 - * If enough samples have been computed, return the current max budget
12894 - * stored in bfqd, which is dynamically updated according to the
12895 -@@ -1077,63 +1345,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
12896 - return rq;
12897 - }
12898 -
12899 --/*
12900 -- * Must be called with the queue_lock held.
12901 -- */
12902 --static int bfqq_process_refs(struct bfq_queue *bfqq)
12903 --{
12904 -- int process_refs, io_refs;
12905 --
12906 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12907 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12908 -- BUG_ON(process_refs < 0);
12909 -- return process_refs;
12910 --}
12911 --
12912 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12913 --{
12914 -- int process_refs, new_process_refs;
12915 -- struct bfq_queue *__bfqq;
12916 --
12917 -- /*
12918 -- * If there are no process references on the new_bfqq, then it is
12919 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12920 -- * may have dropped their last reference (not just their last process
12921 -- * reference).
12922 -- */
12923 -- if (!bfqq_process_refs(new_bfqq))
12924 -- return;
12925 --
12926 -- /* Avoid a circular list and skip interim queue merges. */
12927 -- while ((__bfqq = new_bfqq->new_bfqq)) {
12928 -- if (__bfqq == bfqq)
12929 -- return;
12930 -- new_bfqq = __bfqq;
12931 -- }
12932 --
12933 -- process_refs = bfqq_process_refs(bfqq);
12934 -- new_process_refs = bfqq_process_refs(new_bfqq);
12935 -- /*
12936 -- * If the process for the bfqq has gone away, there is no
12937 -- * sense in merging the queues.
12938 -- */
12939 -- if (process_refs == 0 || new_process_refs == 0)
12940 -- return;
12941 --
12942 -- /*
12943 -- * Merge in the direction of the lesser amount of work.
12944 -- */
12945 -- if (new_process_refs >= process_refs) {
12946 -- bfqq->new_bfqq = new_bfqq;
12947 -- atomic_add(process_refs, &new_bfqq->ref);
12948 -- } else {
12949 -- new_bfqq->new_bfqq = bfqq;
12950 -- atomic_add(new_process_refs, &bfqq->ref);
12951 -- }
12952 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12953 -- new_bfqq->pid);
12954 --}
12955 --
12956 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
12957 - {
12958 - struct bfq_entity *entity = &bfqq->entity;
12959 -@@ -1703,7 +1914,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
12960 - */
12961 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12962 - {
12963 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
12964 -+ struct bfq_queue *bfqq;
12965 - struct request *next_rq;
12966 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
12967 -
12968 -@@ -1713,17 +1924,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12969 -
12970 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
12971 -
12972 -- /*
12973 -- * If another queue has a request waiting within our mean seek
12974 -- * distance, let it run. The expire code will check for close
12975 -- * cooperators and put the close queue at the front of the
12976 -- * service tree. If possible, merge the expiring queue with the
12977 -- * new bfqq.
12978 -- */
12979 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
12980 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
12981 -- bfq_setup_merge(bfqq, new_bfqq);
12982 --
12983 - if (bfq_may_expire_for_budg_timeout(bfqq) &&
12984 - !timer_pending(&bfqd->idle_slice_timer) &&
12985 - !bfq_bfqq_must_idle(bfqq))
12986 -@@ -1760,36 +1960,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12987 - bfq_clear_bfqq_wait_request(bfqq);
12988 - del_timer(&bfqd->idle_slice_timer);
12989 - }
12990 -- if (new_bfqq == NULL)
12991 -- goto keep_queue;
12992 -- else
12993 -- goto expire;
12994 -+ goto keep_queue;
12995 - }
12996 - }
12997 -
12998 - /*
12999 -- * No requests pending. If the in-service queue has no cooperator and
13000 -- * still has requests in flight (possibly waiting for a completion)
13001 -- * or is idling for a new request, then keep it.
13002 -+ * No requests pending. If the in-service queue still has requests in
13003 -+ * flight (possibly waiting for a completion) or is idling for a new
13004 -+ * request, then keep it.
13005 - */
13006 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
13007 -- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
13008 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
13009 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
13010 - bfqq = NULL;
13011 - goto keep_queue;
13012 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
13013 -- /*
13014 -- * Expiring the queue because there is a close cooperator,
13015 -- * cancel timer.
13016 -- */
13017 -- bfq_clear_bfqq_wait_request(bfqq);
13018 -- del_timer(&bfqd->idle_slice_timer);
13019 - }
13020 -
13021 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
13022 - expire:
13023 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
13024 - new_queue:
13025 -- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
13026 -+ bfqq = bfq_set_in_service_queue(bfqd);
13027 - bfq_log(bfqd, "select_queue: new queue %d returned",
13028 - bfqq != NULL ? bfqq->pid : 0);
13029 - keep_queue:
13030 -@@ -1799,9 +1989,8 @@ keep_queue:
13031 - static void bfq_update_raising_data(struct bfq_data *bfqd,
13032 - struct bfq_queue *bfqq)
13033 - {
13034 -+ struct bfq_entity *entity = &bfqq->entity;
13035 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
13036 -- struct bfq_entity *entity = &bfqq->entity;
13037 --
13038 - bfq_log_bfqq(bfqd, bfqq,
13039 - "raising period dur %u/%u msec, "
13040 - "old raising coeff %u, w %d(%d)",
13041 -@@ -1818,7 +2007,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13042 - "WARN: pending prio change");
13043 - /*
13044 - * If too much time has elapsed from the beginning
13045 -- * of this weight-raising, stop it.
13046 -+ * of this weight-raising period, stop it.
13047 - */
13048 - if (jiffies - bfqq->last_rais_start_finish >
13049 - bfqq->raising_cur_max_time) {
13050 -@@ -1830,11 +2019,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13051 - jiffies_to_msecs(bfqq->
13052 - raising_cur_max_time));
13053 - bfq_bfqq_end_raising(bfqq);
13054 -- __bfq_entity_update_weight_prio(
13055 -- bfq_entity_service_tree(entity),
13056 -- entity);
13057 - }
13058 - }
13059 -+ /* Update weight both if it must be raised and if it must be lowered */
13060 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
13061 -+ __bfq_entity_update_weight_prio(
13062 -+ bfq_entity_service_tree(entity),
13063 -+ entity);
13064 - }
13065 -
13066 - /*
13067 -@@ -2075,6 +2266,25 @@ static void bfq_init_icq(struct io_cq *icq)
13068 - struct bfq_io_cq *bic = icq_to_bic(icq);
13069 -
13070 - bic->ttime.last_end_request = jiffies;
13071 -+ /*
13072 -+ * A newly created bic indicates that the process has just
13073 -+ * started doing I/O, and is probably mapping into memory its
13074 -+ * executable and libraries: it definitely needs weight raising.
13075 -+ * There is however the possibility that the process performs,
13076 -+ * for a while, I/O close to some other process. EQM intercepts
13077 -+ * this behavior and may merge the queue corresponding to the
13078 -+ * process with some other queue, BEFORE the weight of the queue
13079 -+ * is raised. Merged queues are not weight-raised (they are assumed
13080 -+ * to belong to processes that benefit only from high throughput).
13081 -+ * If the merge is basically the consequence of an accident, then
13082 -+ * the queue will be split soon and will get back its old weight.
13083 -+ * It is then important to write down somewhere that this queue
13084 -+ * does need weight raising, even if it did not make it to get its
13085 -+ * weight raised before being merged. To this purpose, we overload
13086 -+ * the field raising_time_left and assign 1 to it, to mark the queue
13087 -+ * as needing weight raising.
13088 -+ */
13089 -+ bic->raising_time_left = 1;
13090 - }
13091 -
13092 - static void bfq_exit_icq(struct io_cq *icq)
13093 -@@ -2088,6 +2298,13 @@ static void bfq_exit_icq(struct io_cq *icq)
13094 - }
13095 -
13096 - if (bic->bfqq[BLK_RW_SYNC]) {
13097 -+ /*
13098 -+ * If the bic is using a shared queue, put the reference
13099 -+ * taken on the io_context when the bic started using a
13100 -+ * shared bfq_queue.
13101 -+ */
13102 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
13103 -+ put_io_context(icq->ioc);
13104 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
13105 - bic->bfqq[BLK_RW_SYNC] = NULL;
13106 - }
13107 -@@ -2375,6 +2592,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
13108 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
13109 - return;
13110 -
13111 -+ /* Idle window just restored, statistics are meaningless. */
13112 -+ if (bfq_bfqq_just_split(bfqq))
13113 -+ return;
13114 -+
13115 - enable_idle = bfq_bfqq_idle_window(bfqq);
13116 -
13117 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
13118 -@@ -2415,6 +2636,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13119 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
13120 - !BFQQ_SEEKY(bfqq))
13121 - bfq_update_idle_window(bfqd, bfqq, bic);
13122 -+ bfq_clear_bfqq_just_split(bfqq);
13123 -
13124 - bfq_log_bfqq(bfqd, bfqq,
13125 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
13126 -@@ -2475,13 +2697,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13127 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
13128 - {
13129 - struct bfq_data *bfqd = q->elevator->elevator_data;
13130 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
13131 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
13132 -
13133 - assert_spin_locked(bfqd->queue->queue_lock);
13134 -+
13135 -+ /*
13136 -+ * An unplug may trigger a requeue of a request from the device
13137 -+ * driver: make sure we are in process context while trying to
13138 -+ * merge two bfq_queues.
13139 -+ */
13140 -+ if (!in_interrupt()) {
13141 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
13142 -+ if (new_bfqq != NULL) {
13143 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
13144 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
13145 -+ /*
13146 -+ * Release the request's reference to the old bfqq
13147 -+ * and make sure one is taken to the shared queue.
13148 -+ */
13149 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
13150 -+ bfqq->allocated[rq_data_dir(rq)]--;
13151 -+ atomic_inc(&new_bfqq->ref);
13152 -+ bfq_put_queue(bfqq);
13153 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
13154 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
13155 -+ bfqq, new_bfqq);
13156 -+ rq->elv.priv[1] = new_bfqq;
13157 -+ bfqq = new_bfqq;
13158 -+ }
13159 -+ }
13160 -+
13161 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
13162 -
13163 - bfq_add_rq_rb(rq);
13164 -
13165 -+ /*
13166 -+ * Here a newly-created bfq_queue has already started a weight-raising
13167 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
13168 -+ * from assigning it a full weight-raising period. See the detailed
13169 -+ * comments about this field in bfq_init_icq().
13170 -+ */
13171 -+ if (bfqq->bic != NULL)
13172 -+ bfqq->bic->raising_time_left = 0;
13173 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
13174 - list_add_tail(&rq->queuelist, &bfqq->fifo);
13175 -
13176 -@@ -2629,18 +2886,6 @@ static void bfq_put_request(struct request *rq)
13177 - }
13178 - }
13179 -
13180 --static struct bfq_queue *
13181 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13182 -- struct bfq_queue *bfqq)
13183 --{
13184 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13185 -- (long unsigned)bfqq->new_bfqq->pid);
13186 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
13187 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
13188 -- bfq_put_queue(bfqq);
13189 -- return bic_to_bfqq(bic, 1);
13190 --}
13191 --
13192 - /*
13193 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
13194 - * was the last process referring to said bfqq.
13195 -@@ -2649,6 +2894,9 @@ static struct bfq_queue *
13196 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
13197 - {
13198 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
13199 -+
13200 -+ put_io_context(bic->icq.ioc);
13201 -+
13202 - if (bfqq_process_refs(bfqq) == 1) {
13203 - bfqq->pid = current->pid;
13204 - bfq_clear_bfqq_coop(bfqq);
13205 -@@ -2677,6 +2925,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
13206 - struct bfq_queue *bfqq;
13207 - struct bfq_group *bfqg;
13208 - unsigned long flags;
13209 -+ bool split = false;
13210 -
13211 - might_sleep_if(gfp_mask & __GFP_WAIT);
13212 -
13213 -@@ -2695,24 +2944,14 @@ new_queue:
13214 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
13215 - bic_set_bfqq(bic, bfqq, is_sync);
13216 - } else {
13217 -- /*
13218 -- * If the queue was seeky for too long, break it apart.
13219 -- */
13220 -+ /* If the queue was seeky for too long, break it apart. */
13221 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
13222 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
13223 - bfqq = bfq_split_bfqq(bic, bfqq);
13224 -+ split = true;
13225 - if (!bfqq)
13226 - goto new_queue;
13227 - }
13228 --
13229 -- /*
13230 -- * Check to see if this queue is scheduled to merge with
13231 -- * another closely cooperating queue. The merging of queues
13232 -- * happens here as it must be done in process context.
13233 -- * The reference on new_bfqq was taken in merge_bfqqs.
13234 -- */
13235 -- if (bfqq->new_bfqq != NULL)
13236 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
13237 - }
13238 -
13239 - bfqq->allocated[rw]++;
13240 -@@ -2723,6 +2962,26 @@ new_queue:
13241 - rq->elv.priv[0] = bic;
13242 - rq->elv.priv[1] = bfqq;
13243 -
13244 -+ /*
13245 -+ * If a bfq_queue has only one process reference, it is owned
13246 -+ * by only one bfq_io_cq: we can set the bic field of the
13247 -+ * bfq_queue to the address of that structure. Also, if the
13248 -+ * queue has just been split, mark a flag so that the
13249 -+ * information is available to the other scheduler hooks.
13250 -+ */
13251 -+ if (bfqq_process_refs(bfqq) == 1) {
13252 -+ bfqq->bic = bic;
13253 -+ if (split) {
13254 -+ bfq_mark_bfqq_just_split(bfqq);
13255 -+ /*
13256 -+ * If the queue has just been split from a shared queue,
13257 -+ * restore the idle window and the possible weight
13258 -+ * raising period.
13259 -+ */
13260 -+ bfq_bfqq_resume_state(bfqq, bic);
13261 -+ }
13262 -+ }
13263 -+
13264 - spin_unlock_irqrestore(q->queue_lock, flags);
13265 -
13266 - return 0;
13267 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
13268 -index 30df81c..47e66a8 100644
13269 ---- a/block/bfq-sched.c
13270 -+++ b/block/bfq-sched.c
13271 -@@ -979,34 +979,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
13272 - return bfqq;
13273 - }
13274 -
13275 --/*
13276 -- * Forced extraction of the given queue.
13277 -- */
13278 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
13279 -- struct bfq_queue *bfqq)
13280 --{
13281 -- struct bfq_entity *entity;
13282 -- struct bfq_sched_data *sd;
13283 --
13284 -- BUG_ON(bfqd->in_service_queue != NULL);
13285 --
13286 -- entity = &bfqq->entity;
13287 -- /*
13288 -- * Bubble up extraction/update from the leaf to the root.
13289 -- */
13290 -- for_each_entity(entity) {
13291 -- sd = entity->sched_data;
13292 -- bfq_update_budget(entity);
13293 -- bfq_update_vtime(bfq_entity_service_tree(entity));
13294 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
13295 -- sd->active_entity = entity;
13296 -- sd->next_active = NULL;
13297 -- entity->service = 0;
13298 -- }
13299 --
13300 -- return;
13301 --}
13302 --
13303 - static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
13304 - {
13305 - if (bfqd->in_service_bic != NULL) {
13306 -diff --git a/block/bfq.h b/block/bfq.h
13307 -index 68b28e3..438f560 100644
13308 ---- a/block/bfq.h
13309 -+++ b/block/bfq.h
13310 -@@ -192,6 +192,8 @@ struct bfq_group;
13311 - * idle to backlogged
13312 - * @service_from_backlogged: cumulative service received from the @bfq_queue
13313 - * since the last transition from idle to backlogged
13314 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
13315 -+ * queue is shared
13316 - *
13317 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
13318 - * or more (if it is an async one). @cgroup holds a reference to the
13319 -@@ -235,6 +237,7 @@ struct bfq_queue {
13320 - sector_t last_request_pos;
13321 -
13322 - pid_t pid;
13323 -+ struct bfq_io_cq *bic;
13324 -
13325 - /* weight-raising fields */
13326 - unsigned int raising_cur_max_time;
13327 -@@ -264,12 +267,23 @@ struct bfq_ttime {
13328 - * @icq: associated io_cq structure
13329 - * @bfqq: array of two process queues, the sync and the async
13330 - * @ttime: associated @bfq_ttime struct
13331 -+ * @raising_time_left: snapshot of the time left before weight raising ends
13332 -+ * for the sync queue associated to this process; this
13333 -+ * snapshot is taken to remember this value while the weight
13334 -+ * raising is suspended because the queue is merged with a
13335 -+ * shared queue, and is used to set @raising_cur_max_time
13336 -+ * when the queue is split from the shared queue and its
13337 -+ * weight is raised again
13338 -+ * @saved_idle_window: same purpose as the previous field for the idle window
13339 - */
13340 - struct bfq_io_cq {
13341 - struct io_cq icq; /* must be the first member */
13342 - struct bfq_queue *bfqq[2];
13343 - struct bfq_ttime ttime;
13344 - int ioprio;
13345 -+
13346 -+ unsigned int raising_time_left;
13347 -+ unsigned int saved_idle_window;
13348 - };
13349 -
13350 - /**
13351 -@@ -411,6 +425,7 @@ enum bfqq_state_flags {
13352 - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
13353 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
13354 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
13355 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
13356 - BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
13357 - };
13358 -
13359 -@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
13360 - BFQ_BFQQ_FNS(budget_new);
13361 - BFQ_BFQQ_FNS(coop);
13362 - BFQ_BFQQ_FNS(split_coop);
13363 -+BFQ_BFQQ_FNS(just_split);
13364 - BFQ_BFQQ_FNS(softrt_update);
13365 - #undef BFQ_BFQQ_FNS
13366 -
13367 ---
13368 -1.8.5.2
13369 -
13370
13371 Added: genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
13372 ===================================================================
13373 --- genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch (rev 0)
13374 +++ genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666)
13375 @@ -0,0 +1,1034 @@
13376 +From 39b1dba58b2562ba0d93a33a4f9af662d3c790c5 Mon Sep 17 00:00:00 2001
13377 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
13378 +Date: Thu, 23 Jan 2014 16:54:44 +0100
13379 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r1 for
13380 + 3.13.0
13381 +
13382 +A set of processes may happen to perform interleaved reads, i.e., requests
13383 +whose union would give rise to a sequential read pattern. There are two
13384 +typical cases: in the first case, processes read fixed-size chunks of
13385 +data at a fixed distance from each other, while in the second case processes
13386 +may read variable-size chunks at variable distances. The latter case occurs
13387 +for example with KVM, which splits the I/O generated by the guest into
13388 +multiple chunks, and lets these chunks be served by a pool of cooperating
13389 +processes, iteratively assigning the next chunk of I/O to the first
13390 +available process. CFQ uses actual queue merging for the first type of
13391 +processes, whereas it uses preemption to get a sequential read pattern out
13392 +of the read requests performed by the second type of processes. In the end
13393 +it uses two different mechanisms to achieve the same goal: boosting the
13394 +throughput with interleaved I/O.
13395 +
13396 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
13397 +sequential read pattern with both types of processes. The main idea is
13398 +checking newly arrived requests against the next request of the active queue
13399 +both in case of actual request insert and in case of request merge. By doing
13400 +so, both the types of processes can be handled by just merging their queues.
13401 +EQM is then simpler and more compact than the pair of mechanisms used in
13402 +CFQ.
13403 +
13404 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
13405 +properly restoring the weight-raising state of a queue when it gets back to
13406 +a non-merged state.
13407 +
13408 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
13409 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
13410 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
13411 +---
13412 + block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
13413 + block/bfq-sched.c | 28 ---
13414 + block/bfq.h | 16 ++
13415 + 3 files changed, 474 insertions(+), 227 deletions(-)
13416 +
13417 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
13418 +index eb760de..06ee844 100644
13419 +--- a/block/bfq-iosched.c
13420 ++++ b/block/bfq-iosched.c
13421 +@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
13422 + return dur;
13423 + }
13424 +
13425 ++static inline void
13426 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
13427 ++{
13428 ++ if (bic->saved_idle_window)
13429 ++ bfq_mark_bfqq_idle_window(bfqq);
13430 ++ else
13431 ++ bfq_clear_bfqq_idle_window(bfqq);
13432 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
13433 ++ /*
13434 ++ * Start a weight raising period with the duration given by
13435 ++ * the raising_time_left snapshot.
13436 ++ */
13437 ++ if (bfq_bfqq_busy(bfqq))
13438 ++ bfqq->bfqd->raised_busy_queues++;
13439 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
13440 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
13441 ++ bfqq->last_rais_start_finish = jiffies;
13442 ++ bfqq->entity.ioprio_changed = 1;
13443 ++ }
13444 ++ /*
13445 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
13446 ++ * getting confused about the queue's need of a weight-raising
13447 ++ * period.
13448 ++ */
13449 ++ bic->raising_time_left = 0;
13450 ++}
13451 ++
13452 ++/*
13453 ++ * Must be called with the queue_lock held.
13454 ++ */
13455 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
13456 ++{
13457 ++ int process_refs, io_refs;
13458 ++
13459 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13460 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13461 ++ BUG_ON(process_refs < 0);
13462 ++ return process_refs;
13463 ++}
13464 ++
13465 + static void bfq_add_rq_rb(struct request *rq)
13466 + {
13467 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
13468 +@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
13469 + if (!bfqd->low_latency)
13470 + goto add_bfqq_busy;
13471 +
13472 ++ if (bfq_bfqq_just_split(bfqq))
13473 ++ goto set_ioprio_changed;
13474 ++
13475 + /*
13476 +- * If the queue is not being boosted and has been idle
13477 +- * for enough time, start a weight-raising period
13478 ++ * If the queue:
13479 ++ * - is not being boosted,
13480 ++ * - has been idle for enough time,
13481 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
13482 ++ * shared "for its nature" or it is not shared and its
13483 ++ * requests have not been redirected to a shared queue)
13484 ++ * start a weight-raising period.
13485 + */
13486 +- if (old_raising_coeff == 1 &&
13487 +- (idle_for_long_time || soft_rt)) {
13488 ++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
13489 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
13490 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
13491 + if (idle_for_long_time)
13492 + bfqq->raising_cur_max_time =
13493 +@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq)
13494 + bfqd->bfq_raising_rt_max_time;
13495 + }
13496 + }
13497 ++set_ioprio_changed:
13498 + if (old_raising_coeff != bfqq->raising_coeff)
13499 + entity->ioprio_changed = 1;
13500 + add_bfqq_busy:
13501 +@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
13502 + spin_unlock_irq(bfqd->queue->queue_lock);
13503 + }
13504 +
13505 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13506 +- struct bio *bio)
13507 +-{
13508 +- struct bfq_data *bfqd = q->elevator->elevator_data;
13509 +- struct bfq_io_cq *bic;
13510 +- struct bfq_queue *bfqq;
13511 +-
13512 +- /*
13513 +- * Disallow merge of a sync bio into an async request.
13514 +- */
13515 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13516 +- return 0;
13517 +-
13518 +- /*
13519 +- * Lookup the bfqq that this bio will be queued with. Allow
13520 +- * merge only if rq is queued there.
13521 +- * Queue lock is held here.
13522 +- */
13523 +- bic = bfq_bic_lookup(bfqd, current->io_context);
13524 +- if (bic == NULL)
13525 +- return 0;
13526 +-
13527 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13528 +- return bfqq == RQ_BFQQ(rq);
13529 +-}
13530 +-
13531 +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13532 +- struct bfq_queue *bfqq)
13533 +-{
13534 +- if (bfqq != NULL) {
13535 +- bfq_mark_bfqq_must_alloc(bfqq);
13536 +- bfq_mark_bfqq_budget_new(bfqq);
13537 +- bfq_clear_bfqq_fifo_expire(bfqq);
13538 +-
13539 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13540 +-
13541 +- bfq_log_bfqq(bfqd, bfqq,
13542 +- "set_in_service_queue, cur-budget = %lu",
13543 +- bfqq->entity.budget);
13544 +- }
13545 +-
13546 +- bfqd->in_service_queue = bfqq;
13547 +-}
13548 +-
13549 +-/*
13550 +- * Get and set a new queue for service.
13551 +- */
13552 +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
13553 +- struct bfq_queue *bfqq)
13554 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
13555 + {
13556 +- if (!bfqq)
13557 +- bfqq = bfq_get_next_queue(bfqd);
13558 ++ if (request)
13559 ++ return blk_rq_pos(io_struct);
13560 + else
13561 +- bfq_get_next_queue_forced(bfqd, bfqq);
13562 +-
13563 +- __bfq_set_in_service_queue(bfqd, bfqq);
13564 +- return bfqq;
13565 ++ return ((struct bio *)io_struct)->bi_sector;
13566 + }
13567 +
13568 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
13569 +- struct request *rq)
13570 ++static inline sector_t bfq_dist_from(sector_t pos1,
13571 ++ sector_t pos2)
13572 + {
13573 +- if (blk_rq_pos(rq) >= bfqd->last_position)
13574 +- return blk_rq_pos(rq) - bfqd->last_position;
13575 ++ if (pos1 >= pos2)
13576 ++ return pos1 - pos2;
13577 + else
13578 +- return bfqd->last_position - blk_rq_pos(rq);
13579 ++ return pos2 - pos1;
13580 + }
13581 +
13582 +-/*
13583 +- * Return true if bfqq has no request pending and rq is close enough to
13584 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
13585 +- * bfqq->next_rq
13586 +- */
13587 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
13588 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
13589 ++ sector_t sector)
13590 + {
13591 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
13592 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
13593 ++ BFQQ_SEEK_THR;
13594 + }
13595 +
13596 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13597 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
13598 + {
13599 + struct rb_root *root = &bfqd->rq_pos_tree;
13600 + struct rb_node *parent, *node;
13601 + struct bfq_queue *__bfqq;
13602 +- sector_t sector = bfqd->last_position;
13603 +
13604 + if (RB_EMPTY_ROOT(root))
13605 + return NULL;
13606 +@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13607 + * position).
13608 + */
13609 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
13610 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13611 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13612 + return __bfqq;
13613 +
13614 + if (blk_rq_pos(__bfqq->next_rq) < sector)
13615 +@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13616 + return NULL;
13617 +
13618 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
13619 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13620 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13621 + return __bfqq;
13622 +
13623 + return NULL;
13624 +@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13625 + /*
13626 + * bfqd - obvious
13627 + * cur_bfqq - passed in so that we don't decide that the current queue
13628 +- * is closely cooperating with itself.
13629 +- *
13630 +- * We are assuming that cur_bfqq has dispatched at least one request,
13631 +- * and that bfqd->last_position reflects a position on the disk associated
13632 +- * with the I/O issued by cur_bfqq.
13633 ++ * is closely cooperating with itself
13634 ++ * sector - used as a reference point to search for a close queue
13635 + */
13636 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13637 +- struct bfq_queue *cur_bfqq)
13638 ++ struct bfq_queue *cur_bfqq,
13639 ++ sector_t sector)
13640 + {
13641 + struct bfq_queue *bfqq;
13642 +
13643 +@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13644 + * working closely on the same area of the disk. In that case,
13645 + * we can group them together and don't waste time idling.
13646 + */
13647 +- bfqq = bfqq_close(bfqd);
13648 ++ bfqq = bfqq_close(bfqd, sector);
13649 + if (bfqq == NULL || bfqq == cur_bfqq)
13650 + return NULL;
13651 +
13652 +@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13653 + return bfqq;
13654 + }
13655 +
13656 ++static struct bfq_queue *
13657 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13658 ++{
13659 ++ int process_refs, new_process_refs;
13660 ++ struct bfq_queue *__bfqq;
13661 ++
13662 ++ /*
13663 ++ * If there are no process references on the new_bfqq, then it is
13664 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13665 ++ * may have dropped their last reference (not just their last process
13666 ++ * reference).
13667 ++ */
13668 ++ if (!bfqq_process_refs(new_bfqq))
13669 ++ return NULL;
13670 ++
13671 ++ /* Avoid a circular list and skip interim queue merges. */
13672 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
13673 ++ if (__bfqq == bfqq)
13674 ++ return NULL;
13675 ++ new_bfqq = __bfqq;
13676 ++ }
13677 ++
13678 ++ process_refs = bfqq_process_refs(bfqq);
13679 ++ new_process_refs = bfqq_process_refs(new_bfqq);
13680 ++ /*
13681 ++ * If the process for the bfqq has gone away, there is no
13682 ++ * sense in merging the queues.
13683 ++ */
13684 ++ if (process_refs == 0 || new_process_refs == 0)
13685 ++ return NULL;
13686 ++
13687 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13688 ++ new_bfqq->pid);
13689 ++
13690 ++ /*
13691 ++ * Merging is just a redirection: the requests of the process owning
13692 ++ * one of the two queues are redirected to the other queue. The latter
13693 ++ * queue, in its turn, is set as shared if this is the first time that
13694 ++ * the requests of some process are redirected to it.
13695 ++ *
13696 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
13697 ++ * are in the context of the process owning bfqq, hence we have the
13698 ++ * io_cq of this process. So we can immediately configure this io_cq
13699 ++ * to redirect the requests of the process to new_bfqq.
13700 ++ *
13701 ++ * NOTE, even if new_bfqq coincides with the in-service queue, the
13702 ++ * io_cq of new_bfqq is not available, because, if the in-service queue
13703 ++ * is shared, bfqd->in_service_bic may not point to the io_cq of the
13704 ++ * in-service queue.
13705 ++ * Redirecting the requests of the process owning bfqq to the currently
13706 ++ * in-service queue is in any case the best option, as we feed the
13707 ++ * in-service queue with new requests close to the last request served
13708 ++ * and, by doing so, hopefully increase the throughput.
13709 ++ */
13710 ++ bfqq->new_bfqq = new_bfqq;
13711 ++ atomic_add(process_refs, &new_bfqq->ref);
13712 ++ return new_bfqq;
13713 ++}
13714 ++
13715 ++/*
13716 ++ * Attempt to schedule a merge of bfqq with the currently in-service queue or
13717 ++ * with a close queue among the scheduled queues.
13718 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
13719 ++ * structure otherwise.
13720 ++ */
13721 ++static struct bfq_queue *
13722 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13723 ++ void *io_struct, bool request)
13724 ++{
13725 ++ struct bfq_queue *in_service_bfqq, *new_bfqq;
13726 ++
13727 ++ if (bfqq->new_bfqq)
13728 ++ return bfqq->new_bfqq;
13729 ++
13730 ++ if (!io_struct)
13731 ++ return NULL;
13732 ++
13733 ++ in_service_bfqq = bfqd->in_service_queue;
13734 ++
13735 ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
13736 ++ !bfqd->in_service_bic)
13737 ++ goto check_scheduled;
13738 ++
13739 ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
13740 ++ goto check_scheduled;
13741 ++
13742 ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
13743 ++ goto check_scheduled;
13744 ++
13745 ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
13746 ++ goto check_scheduled;
13747 ++
13748 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
13749 ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
13750 ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
13751 ++ if (new_bfqq != NULL)
13752 ++ return new_bfqq; /* Merge with the in-service queue */
13753 ++ }
13754 ++
13755 ++ /*
13756 ++ * Check whether there is a cooperator among currently scheduled
13757 ++ * queues. The only thing we need is that the bio/request is not
13758 ++ * NULL, as we need it to establish whether a cooperator exists.
13759 ++ */
13760 ++check_scheduled:
13761 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
13762 ++ bfq_io_struct_pos(io_struct, request));
13763 ++ if (new_bfqq)
13764 ++ return bfq_setup_merge(bfqq, new_bfqq);
13765 ++
13766 ++ return NULL;
13767 ++}
13768 ++
13769 ++static inline void
13770 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
13771 ++{
13772 ++ /*
13773 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
13774 ++ * have already been redirected to a shared queue; both idle window
13775 ++ * and weight raising state have already been saved. Do nothing.
13776 ++ */
13777 ++ if (bfqq->bic == NULL)
13778 ++ return;
13779 ++ if (bfqq->bic->raising_time_left)
13780 ++ /*
13781 ++ * This is the queue of a just-started process, and would
13782 ++ * deserve weight raising: we set raising_time_left to the full
13783 ++ * weight-raising duration to trigger weight-raising when and
13784 ++ * if the queue is split and the first request of the queue
13785 ++ * is enqueued.
13786 ++ */
13787 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
13788 ++ else if (bfqq->raising_coeff > 1) {
13789 ++ unsigned long wrais_duration =
13790 ++ jiffies - bfqq->last_rais_start_finish;
13791 ++ /*
13792 ++ * It may happen that a queue's weight raising period lasts
13793 ++ * longer than its raising_cur_max_time, as weight raising is
13794 ++ * handled only when a request is enqueued or dispatched (it
13795 ++ * does not use any timer). If the weight raising period is
13796 ++ * about to end, don't save it.
13797 ++ */
13798 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
13799 ++ bfqq->bic->raising_time_left = 0;
13800 ++ else
13801 ++ bfqq->bic->raising_time_left =
13802 ++ bfqq->raising_cur_max_time - wrais_duration;
13803 ++ /*
13804 ++ * The bfq_queue is becoming shared or the requests of the
13805 ++ * process owning the queue are being redirected to a shared
13806 ++ * queue. Stop the weight raising period of the queue, as in
13807 ++ * both cases it should not be owned by an interactive or soft
13808 ++ * real-time application.
13809 ++ */
13810 ++ bfq_bfqq_end_raising(bfqq);
13811 ++ } else
13812 ++ bfqq->bic->raising_time_left = 0;
13813 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
13814 ++}
13815 ++
13816 ++static inline void
13817 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
13818 ++{
13819 ++ /*
13820 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
13821 ++ * is about to begin using a shared bfq_queue.
13822 ++ */
13823 ++ if (bfqq->bic)
13824 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
13825 ++}
13826 ++
13827 ++static void
13828 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13829 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13830 ++{
13831 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13832 ++ (long unsigned)new_bfqq->pid);
13833 ++ /* Save weight raising and idle window of the merged queues */
13834 ++ bfq_bfqq_save_state(bfqq);
13835 ++ bfq_bfqq_save_state(new_bfqq);
13836 ++ /*
13837 ++ * Grab a reference to the bic, to prevent it from being destroyed
13838 ++ * before being possibly touched by a bfq_split_bfqq().
13839 ++ */
13840 ++ bfq_get_bic_reference(bfqq);
13841 ++ bfq_get_bic_reference(new_bfqq);
13842 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
13843 ++ bic_set_bfqq(bic, new_bfqq, 1);
13844 ++ bfq_mark_bfqq_coop(new_bfqq);
13845 ++ /*
13846 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
13847 ++ * new_bfqq->bic to NULL. bfqq either:
13848 ++ * - does not belong to any bic any more, and hence bfqq->bic must
13849 ++ * be set to NULL, or
13850 ++ * - is a queue whose owning bics have already been redirected to a
13851 ++ * different queue, hence the queue is destined to not belong to any
13852 ++ * bic soon and bfqq->bic is already NULL (therefore the next
13853 ++ * assignment causes no harm).
13854 ++ */
13855 ++ new_bfqq->bic = NULL;
13856 ++ bfqq->bic = NULL;
13857 ++ bfq_put_queue(bfqq);
13858 ++}
13859 ++
13860 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13861 ++ struct bio *bio)
13862 ++{
13863 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
13864 ++ struct bfq_io_cq *bic;
13865 ++ struct bfq_queue *bfqq, *new_bfqq;
13866 ++
13867 ++ /*
13868 ++ * Disallow merge of a sync bio into an async request.
13869 ++ */
13870 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13871 ++ return 0;
13872 ++
13873 ++ /*
13874 ++ * Lookup the bfqq that this bio will be queued with. Allow
13875 ++ * merge only if rq is queued there.
13876 ++ * Queue lock is held here.
13877 ++ */
13878 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
13879 ++ if (bic == NULL)
13880 ++ return 0;
13881 ++
13882 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13883 ++ /*
13884 ++ * We take advantage of this function to perform an early merge
13885 ++ * of the queues of possible cooperating processes.
13886 ++ */
13887 ++ if (bfqq != NULL) {
13888 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
13889 ++ if (new_bfqq != NULL) {
13890 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
13891 ++ /*
13892 ++ * If we get here, the bio will be queued in the shared queue,
13893 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
13894 ++ * rq can be merged.
13895 ++ */
13896 ++ bfqq = new_bfqq;
13897 ++ }
13898 ++ }
13899 ++
13900 ++ return bfqq == RQ_BFQQ(rq);
13901 ++}
13902 ++
13903 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13904 ++ struct bfq_queue *bfqq)
13905 ++{
13906 ++ if (bfqq != NULL) {
13907 ++ bfq_mark_bfqq_must_alloc(bfqq);
13908 ++ bfq_mark_bfqq_budget_new(bfqq);
13909 ++ bfq_clear_bfqq_fifo_expire(bfqq);
13910 ++
13911 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13912 ++
13913 ++ bfq_log_bfqq(bfqd, bfqq,
13914 ++ "set_in_service_queue, cur-budget = %lu",
13915 ++ bfqq->entity.budget);
13916 ++ }
13917 ++
13918 ++ bfqd->in_service_queue = bfqq;
13919 ++}
13920 ++
13921 ++/*
13922 ++ * Get and set a new queue for service.
13923 ++ */
13924 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
13925 ++{
13926 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
13927 ++
13928 ++ __bfq_set_in_service_queue(bfqd, bfqq);
13929 ++ return bfqq;
13930 ++}
13931 ++
13932 + /*
13933 + * If enough samples have been computed, return the current max budget
13934 + * stored in bfqd, which is dynamically updated according to the
13935 +@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
13936 + return rq;
13937 + }
13938 +
13939 +-/*
13940 +- * Must be called with the queue_lock held.
13941 +- */
13942 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
13943 +-{
13944 +- int process_refs, io_refs;
13945 +-
13946 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13947 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13948 +- BUG_ON(process_refs < 0);
13949 +- return process_refs;
13950 +-}
13951 +-
13952 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13953 +-{
13954 +- int process_refs, new_process_refs;
13955 +- struct bfq_queue *__bfqq;
13956 +-
13957 +- /*
13958 +- * If there are no process references on the new_bfqq, then it is
13959 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13960 +- * may have dropped their last reference (not just their last process
13961 +- * reference).
13962 +- */
13963 +- if (!bfqq_process_refs(new_bfqq))
13964 +- return;
13965 +-
13966 +- /* Avoid a circular list and skip interim queue merges. */
13967 +- while ((__bfqq = new_bfqq->new_bfqq)) {
13968 +- if (__bfqq == bfqq)
13969 +- return;
13970 +- new_bfqq = __bfqq;
13971 +- }
13972 +-
13973 +- process_refs = bfqq_process_refs(bfqq);
13974 +- new_process_refs = bfqq_process_refs(new_bfqq);
13975 +- /*
13976 +- * If the process for the bfqq has gone away, there is no
13977 +- * sense in merging the queues.
13978 +- */
13979 +- if (process_refs == 0 || new_process_refs == 0)
13980 +- return;
13981 +-
13982 +- /*
13983 +- * Merge in the direction of the lesser amount of work.
13984 +- */
13985 +- if (new_process_refs >= process_refs) {
13986 +- bfqq->new_bfqq = new_bfqq;
13987 +- atomic_add(process_refs, &new_bfqq->ref);
13988 +- } else {
13989 +- new_bfqq->new_bfqq = bfqq;
13990 +- atomic_add(new_process_refs, &bfqq->ref);
13991 +- }
13992 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13993 +- new_bfqq->pid);
13994 +-}
13995 +-
13996 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
13997 + {
13998 + struct bfq_entity *entity = &bfqq->entity;
13999 +@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
14000 + */
14001 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14002 + {
14003 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
14004 ++ struct bfq_queue *bfqq;
14005 + struct request *next_rq;
14006 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
14007 +
14008 +@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14009 +
14010 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
14011 +
14012 +- /*
14013 +- * If another queue has a request waiting within our mean seek
14014 +- * distance, let it run. The expire code will check for close
14015 +- * cooperators and put the close queue at the front of the
14016 +- * service tree. If possible, merge the expiring queue with the
14017 +- * new bfqq.
14018 +- */
14019 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
14020 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
14021 +- bfq_setup_merge(bfqq, new_bfqq);
14022 +-
14023 + if (bfq_may_expire_for_budg_timeout(bfqq) &&
14024 + !timer_pending(&bfqd->idle_slice_timer) &&
14025 + !bfq_bfqq_must_idle(bfqq))
14026 +@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14027 + bfq_clear_bfqq_wait_request(bfqq);
14028 + del_timer(&bfqd->idle_slice_timer);
14029 + }
14030 +- if (new_bfqq == NULL)
14031 +- goto keep_queue;
14032 +- else
14033 +- goto expire;
14034 ++ goto keep_queue;
14035 + }
14036 + }
14037 +
14038 + /*
14039 +- * No requests pending. If the in-service queue has no cooperator and
14040 +- * still has requests in flight (possibly waiting for a completion)
14041 +- * or is idling for a new request, then keep it.
14042 ++ * No requests pending. If the in-service queue still has requests in
14043 ++ * flight (possibly waiting for a completion) or is idling for a new
14044 ++ * request, then keep it.
14045 + */
14046 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
14047 +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
14048 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
14049 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
14050 + bfqq = NULL;
14051 + goto keep_queue;
14052 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
14053 +- /*
14054 +- * Expiring the queue because there is a close cooperator,
14055 +- * cancel timer.
14056 +- */
14057 +- bfq_clear_bfqq_wait_request(bfqq);
14058 +- del_timer(&bfqd->idle_slice_timer);
14059 + }
14060 +
14061 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
14062 + expire:
14063 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
14064 + new_queue:
14065 +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
14066 ++ bfqq = bfq_set_in_service_queue(bfqd);
14067 + bfq_log(bfqd, "select_queue: new queue %d returned",
14068 + bfqq != NULL ? bfqq->pid : 0);
14069 + keep_queue:
14070 +@@ -1825,9 +2015,8 @@ keep_queue:
14071 + static void bfq_update_raising_data(struct bfq_data *bfqd,
14072 + struct bfq_queue *bfqq)
14073 + {
14074 ++ struct bfq_entity *entity = &bfqq->entity;
14075 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
14076 +- struct bfq_entity *entity = &bfqq->entity;
14077 +-
14078 + bfq_log_bfqq(bfqd, bfqq,
14079 + "raising period dur %u/%u msec, "
14080 + "old raising coeff %u, w %d(%d)",
14081 +@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
14082 + "WARN: pending prio change");
14083 + /*
14084 + * If too much time has elapsed from the beginning
14085 +- * of this weight-raising, stop it.
14086 ++ * of this weight-raising period, stop it.
14087 + */
14088 + if (time_is_before_jiffies(bfqq->last_rais_start_finish +
14089 + bfqq->raising_cur_max_time)) {
14090 +@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
14091 + jiffies_to_msecs(bfqq->
14092 + raising_cur_max_time));
14093 + bfq_bfqq_end_raising(bfqq);
14094 +- __bfq_entity_update_weight_prio(
14095 +- bfq_entity_service_tree(entity),
14096 +- entity);
14097 + }
14098 + }
14099 ++ /* Update weight both if it must be raised and if it must be lowered */
14100 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
14101 ++ __bfq_entity_update_weight_prio(
14102 ++ bfq_entity_service_tree(entity),
14103 ++ entity);
14104 + }
14105 +
14106 + /*
14107 +@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq)
14108 + struct bfq_io_cq *bic = icq_to_bic(icq);
14109 +
14110 + bic->ttime.last_end_request = jiffies;
14111 ++ /*
14112 ++ * A newly created bic indicates that the process has just
14113 ++ * started doing I/O, and is probably mapping into memory its
14114 ++ * executable and libraries: it definitely needs weight raising.
14115 ++ * There is however the possibility that the process performs,
14116 ++ * for a while, I/O close to some other process. EQM intercepts
14117 ++ * this behavior and may merge the queue corresponding to the
14118 ++ * process with some other queue, BEFORE the weight of the queue
14119 ++ * is raised. Merged queues are not weight-raised (they are assumed
14120 ++ * to belong to processes that benefit only from high throughput).
14121 ++ * If the merge is basically the consequence of an accident, then
14122 ++ * the queue will be split soon and will get back its old weight.
14123 ++ * It is then important to write down somewhere that this queue
14124 ++ * does need weight raising, even if it did not manage to get its
14125 ++ * weight raised before being merged. For this purpose, we overload
14126 ++ * the field raising_time_left and assign 1 to it, to mark the queue
14127 ++ * as needing weight raising.
14128 ++ */
14129 ++ bic->raising_time_left = 1;
14130 + }
14131 +
14132 + static void bfq_exit_icq(struct io_cq *icq)
14133 +@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq)
14134 + }
14135 +
14136 + if (bic->bfqq[BLK_RW_SYNC]) {
14137 ++ /*
14138 ++ * If the bic is using a shared queue, put the reference
14139 ++ * taken on the io_context when the bic started using a
14140 ++ * shared bfq_queue.
14141 ++ */
14142 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
14143 ++ put_io_context(icq->ioc);
14144 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
14145 + bic->bfqq[BLK_RW_SYNC] = NULL;
14146 + }
14147 +@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
14148 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
14149 + return;
14150 +
14151 ++ /* Idle window just restored, statistics are meaningless. */
14152 ++ if (bfq_bfqq_just_split(bfqq))
14153 ++ return;
14154 ++
14155 + enable_idle = bfq_bfqq_idle_window(bfqq);
14156 +
14157 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
14158 +@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14159 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
14160 + !BFQQ_SEEKY(bfqq))
14161 + bfq_update_idle_window(bfqd, bfqq, bic);
14162 ++ bfq_clear_bfqq_just_split(bfqq);
14163 +
14164 + bfq_log_bfqq(bfqd, bfqq,
14165 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
14166 +@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14167 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
14168 + {
14169 + struct bfq_data *bfqd = q->elevator->elevator_data;
14170 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
14171 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
14172 +
14173 + assert_spin_locked(bfqd->queue->queue_lock);
14174 ++
14175 ++ /*
14176 ++ * An unplug may trigger a requeue of a request from the device
14177 ++ * driver: make sure we are in process context while trying to
14178 ++ * merge two bfq_queues.
14179 ++ */
14180 ++ if (!in_interrupt()) {
14181 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
14182 ++ if (new_bfqq != NULL) {
14183 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
14184 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
14185 ++ /*
14186 ++ * Release the request's reference to the old bfqq
14187 ++ * and make sure one is taken to the shared queue.
14188 ++ */
14189 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
14190 ++ bfqq->allocated[rq_data_dir(rq)]--;
14191 ++ atomic_inc(&new_bfqq->ref);
14192 ++ bfq_put_queue(bfqq);
14193 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
14194 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
14195 ++ bfqq, new_bfqq);
14196 ++ rq->elv.priv[1] = new_bfqq;
14197 ++ bfqq = new_bfqq;
14198 ++ }
14199 ++ }
14200 ++
14201 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
14202 +
14203 + bfq_add_rq_rb(rq);
14204 +
14205 ++ /*
14206 ++ * Here a newly-created bfq_queue has already started a weight-raising
14207 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
14208 ++ * from assigning it a full weight-raising period. See the detailed
14209 ++ * comments about this field in bfq_init_icq().
14210 ++ */
14211 ++ if (bfqq->bic != NULL)
14212 ++ bfqq->bic->raising_time_left = 0;
14213 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
14214 + list_add_tail(&rq->queuelist, &bfqq->fifo);
14215 +
14216 +@@ -2659,18 +2916,6 @@ static void bfq_put_request(struct request *rq)
14217 + }
14218 + }
14219 +
14220 +-static struct bfq_queue *
14221 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
14222 +- struct bfq_queue *bfqq)
14223 +-{
14224 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
14225 +- (long unsigned)bfqq->new_bfqq->pid);
14226 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
14227 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
14228 +- bfq_put_queue(bfqq);
14229 +- return bic_to_bfqq(bic, 1);
14230 +-}
14231 +-
14232 + /*
14233 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
14234 + * was the last process referring to said bfqq.
14235 +@@ -2679,6 +2924,9 @@ static struct bfq_queue *
14236 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
14237 + {
14238 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
14239 ++
14240 ++ put_io_context(bic->icq.ioc);
14241 ++
14242 + if (bfqq_process_refs(bfqq) == 1) {
14243 + bfqq->pid = current->pid;
14244 + bfq_clear_bfqq_coop(bfqq);
14245 +@@ -2707,6 +2955,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
14246 + struct bfq_queue *bfqq;
14247 + struct bfq_group *bfqg;
14248 + unsigned long flags;
14249 ++ bool split = false;
14250 +
14251 + might_sleep_if(gfp_mask & __GFP_WAIT);
14252 +
14253 +@@ -2725,24 +2974,14 @@ new_queue:
14254 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
14255 + bic_set_bfqq(bic, bfqq, is_sync);
14256 + } else {
14257 +- /*
14258 +- * If the queue was seeky for too long, break it apart.
14259 +- */
14260 ++ /* If the queue was seeky for too long, break it apart. */
14261 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
14262 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
14263 + bfqq = bfq_split_bfqq(bic, bfqq);
14264 ++ split = true;
14265 + if (!bfqq)
14266 + goto new_queue;
14267 + }
14268 +-
14269 +- /*
14270 +- * Check to see if this queue is scheduled to merge with
14271 +- * another closely cooperating queue. The merging of queues
14272 +- * happens here as it must be done in process context.
14273 +- * The reference on new_bfqq was taken in merge_bfqqs.
14274 +- */
14275 +- if (bfqq->new_bfqq != NULL)
14276 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
14277 + }
14278 +
14279 + bfqq->allocated[rw]++;
14280 +@@ -2753,6 +2992,26 @@ new_queue:
14281 + rq->elv.priv[0] = bic;
14282 + rq->elv.priv[1] = bfqq;
14283 +
14284 ++ /*
14285 ++ * If a bfq_queue has only one process reference, it is owned
14286 ++ * by only one bfq_io_cq: we can set the bic field of the
14287 ++ * bfq_queue to the address of that structure. Also, if the
14288 ++ * queue has just been split, mark a flag so that the
14289 ++ * information is available to the other scheduler hooks.
14290 ++ */
14291 ++ if (bfqq_process_refs(bfqq) == 1) {
14292 ++ bfqq->bic = bic;
14293 ++ if (split) {
14294 ++ bfq_mark_bfqq_just_split(bfqq);
14295 ++ /*
14296 ++ * If the queue has just been split from a shared queue,
14297 ++ * restore the idle window and the possible weight
14298 ++ * raising period.
14299 ++ */
14300 ++ bfq_bfqq_resume_state(bfqq, bic);
14301 ++ }
14302 ++ }
14303 ++
14304 + spin_unlock_irqrestore(q->queue_lock, flags);
14305 +
14306 + return 0;
14307 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
14308 +index 999b475..e54ea33 100644
14309 +--- a/block/bfq-sched.c
14310 ++++ b/block/bfq-sched.c
14311 +@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
14312 + return bfqq;
14313 + }
14314 +
14315 +-/*
14316 +- * Forced extraction of the given queue.
14317 +- */
14318 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
14319 +- struct bfq_queue *bfqq)
14320 +-{
14321 +- struct bfq_entity *entity;
14322 +- struct bfq_sched_data *sd;
14323 +-
14324 +- BUG_ON(bfqd->in_service_queue != NULL);
14325 +-
14326 +- entity = &bfqq->entity;
14327 +- /*
14328 +- * Bubble up extraction/update from the leaf to the root.
14329 +- */
14330 +- for_each_entity(entity) {
14331 +- sd = entity->sched_data;
14332 +- bfq_update_budget(entity);
14333 +- bfq_update_vtime(bfq_entity_service_tree(entity));
14334 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
14335 +- sd->active_entity = entity;
14336 +- sd->next_active = NULL;
14337 +- entity->service = 0;
14338 +- }
14339 +-
14340 +- return;
14341 +-}
14342 +-
14343 + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
14344 + {
14345 + if (bfqd->in_service_bic != NULL) {
14346 +diff --git a/block/bfq.h b/block/bfq.h
14347 +index f9b5881..0bfad40 100644
14348 +--- a/block/bfq.h
14349 ++++ b/block/bfq.h
14350 +@@ -192,6 +192,8 @@ struct bfq_group;
14351 + * idle to backlogged
14352 + * @service_from_backlogged: cumulative service received from the @bfq_queue
14353 + * since the last transition from idle to backlogged
14354 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
14355 ++ * queue is shared
14356 + *
14357 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
14358 + * or more (if it is an async one). @cgroup holds a reference to the
14359 +@@ -235,6 +237,7 @@ struct bfq_queue {
14360 + sector_t last_request_pos;
14361 +
14362 + pid_t pid;
14363 ++ struct bfq_io_cq *bic;
14364 +
14365 + /* weight-raising fields */
14366 + unsigned long raising_cur_max_time;
14367 +@@ -264,12 +267,23 @@ struct bfq_ttime {
14368 + * @icq: associated io_cq structure
14369 + * @bfqq: array of two process queues, the sync and the async
14370 + * @ttime: associated @bfq_ttime struct
14371 ++ * @raising_time_left: snapshot of the time left before weight raising ends
14372 ++ * for the sync queue associated to this process; this
14373 ++ * snapshot is taken to remember this value while the weight
14374 ++ * raising is suspended because the queue is merged with a
14375 ++ * shared queue, and is used to set @raising_cur_max_time
14376 ++ * when the queue is split from the shared queue and its
14377 ++ * weight is raised again
14378 ++ * @saved_idle_window: same purpose as the previous field for the idle window
14379 + */
14380 + struct bfq_io_cq {
14381 + struct io_cq icq; /* must be the first member */
14382 + struct bfq_queue *bfqq[2];
14383 + struct bfq_ttime ttime;
14384 + int ioprio;
14385 ++
14386 ++ unsigned int raising_time_left;
14387 ++ unsigned int saved_idle_window;
14388 + };
14389 +
14390 + /**
14391 +@@ -411,6 +425,7 @@ enum bfqq_state_flags {
14392 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
14393 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
14394 + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
14395 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
14396 + BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
14397 + };
14398 +
14399 +@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
14400 + BFQ_BFQQ_FNS(budget_new);
14401 + BFQ_BFQQ_FNS(coop);
14402 + BFQ_BFQQ_FNS(split_coop);
14403 ++BFQ_BFQQ_FNS(just_split);
14404 + BFQ_BFQQ_FNS(softrt_update);
14405 + #undef BFQ_BFQQ_FNS
14406 +
14407 +--
14408 +1.8.5.2
14409 +
14410
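The EQM rework above hinges on saving a queue's weight-raising state in its bfq_io_cq when queues are merged and restoring it when the shared queue is later split: see the long comment on raising_time_left in bfq_init_icq() and the call to bfq_bfqq_resume_state() in bfq_set_request(). The helper below is only a simplified, hypothetical sketch of what such a resume path can look like; the field names follow the patch, but the body is not the code the patch actually adds.

/* Hypothetical, simplified sketch of a split-and-resume helper; not the patch code. */
static void sketch_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
{
	/* Restore the idle window remembered when the queues were merged. */
	if (bic->saved_idle_window)
		bfq_mark_bfqq_idle_window(bfqq);
	else
		bfq_clear_bfqq_idle_window(bfqq);

	/*
	 * A non-zero raising_time_left means the queue still deserves
	 * weight raising (the patch uses the special value 1, set in
	 * bfq_init_icq(), to mark a queue that still needs a full
	 * weight-raising period).
	 */
	if (bic->raising_time_left) {
		bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
		bfqq->raising_cur_max_time = bic->raising_time_left;
		bfqq->last_rais_start_finish = jiffies;
		bic->raising_time_left = 0;
	}
}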
14411 Modified: genpatches-2.6/trunk/3.14/0000_README
14412 ===================================================================
14413 --- genpatches-2.6/trunk/3.14/0000_README 2014-02-07 14:46:59 UTC (rev 2665)
14414 +++ genpatches-2.6/trunk/3.14/0000_README 2014-02-07 15:42:35 UTC (rev 2666)
14415 @@ -83,17 +83,17 @@
14416 From: Tom Wijsman <TomWij@g.o>
14417 Desc: Add Gentoo Linux support config settings and defaults.
14418
14419 -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7-3.13.patch
14420 +Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r1-3.13.patch
14421 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
14422 -Desc: BFQ v7 patch 1 for 3.13: Build, cgroups and kconfig bits
14423 +Desc: BFQ v7r1 patch 1 for 3.13: Build, cgroups and kconfig bits
14424
14425 -Patch: 5000_BFQ-2-block-introduce-the-v7-I-O-sched-for-3.13.patch1
14426 +Patch: 5000_BFQ-2-block-introduce-the-v7r1-I-O-sched-for-3.13.patch1
14427 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
14428 -Desc: BFQ v7 patch 2 for 3.13: BFQ Scheduler
14429 +Desc: BFQ v7r1 patch 2 for 3.13: BFQ Scheduler
14430
14431 -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7-for-3.13.0.patch
14432 +Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r1-for-3.13.0.patch
14433 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
14434 -Desc: BFQ v7 patch 3 for 3.13: Early Queue Merge (EQM)
14435 +Desc: BFQ v7r1 patch 3 for 3.13: Early Queue Merge (EQM)
14436
14437 Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
14438 From: https://github.com/graysky2/kernel_gcc_patch/
14439
14440 Added: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
14441 ===================================================================
14442 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch (rev 0)
14443 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666)
14444 @@ -0,0 +1,104 @@
14445 +From ae1b820a5286601aa9d5426459f8f3de658342b4 Mon Sep 17 00:00:00 2001
14446 +From: Paolo Valente <paolo.valente@×××××××.it>
14447 +Date: Tue, 3 Sep 2013 16:50:42 +0200
14448 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r1-3.13
14449 +
14450 +Update Kconfig.iosched and do the related Makefile changes to include
14451 +kernel configuration options for BFQ. Also add the bfqio controller
14452 +to the cgroups subsystem.
14453 +
14454 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
14455 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
14456 +---
14457 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
14458 + block/Makefile | 1 +
14459 + include/linux/cgroup_subsys.h | 4 ++++
14460 + 3 files changed, 37 insertions(+)
14461 +
14462 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
14463 +index 421bef9..8f552ba 100644
14464 +--- a/block/Kconfig.iosched
14465 ++++ b/block/Kconfig.iosched
14466 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
14467 + ---help---
14468 + Enable group IO scheduling in CFQ.
14469 +
14470 ++config IOSCHED_BFQ
14471 ++ tristate "BFQ I/O scheduler"
14472 ++ default n
14473 ++ ---help---
14474 ++ The BFQ I/O scheduler tries to distribute bandwidth among
14475 ++ all processes according to their weights.
14476 ++ It aims at distributing the bandwidth as desired, independently of
14477 ++ the disk parameters and with any workload. It also tries to
14478 ++ guarantee low latency to interactive and soft real-time
14479 ++ applications. If compiled built-in (saying Y here), BFQ can
14480 ++ be configured to support hierarchical scheduling.
14481 ++
14482 ++config CGROUP_BFQIO
14483 ++ bool "BFQ hierarchical scheduling support"
14484 ++ depends on CGROUPS && IOSCHED_BFQ=y
14485 ++ default n
14486 ++ ---help---
14487 ++ Enable hierarchical scheduling in BFQ, using the cgroups
14488 ++ filesystem interface. The name of the subsystem will be
14489 ++ bfqio.
14490 ++
14491 + choice
14492 + prompt "Default I/O scheduler"
14493 + default DEFAULT_CFQ
14494 +@@ -52,6 +73,16 @@ choice
14495 + config DEFAULT_CFQ
14496 + bool "CFQ" if IOSCHED_CFQ=y
14497 +
14498 ++ config DEFAULT_BFQ
14499 ++ bool "BFQ" if IOSCHED_BFQ=y
14500 ++ help
14501 ++ Selects BFQ as the default I/O scheduler which will be
14502 ++ used by default for all block devices.
14503 ++ The BFQ I/O scheduler aims at distributing the bandwidth
14504 ++ as desired, independently of the disk parameters and with
14505 ++ any workload. It also tries to guarantee low latency to
14506 ++ interactive and soft real-time applications.
14507 ++
14508 + config DEFAULT_NOOP
14509 + bool "No-op"
14510 +
14511 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
14512 + string
14513 + default "deadline" if DEFAULT_DEADLINE
14514 + default "cfq" if DEFAULT_CFQ
14515 ++ default "bfq" if DEFAULT_BFQ
14516 + default "noop" if DEFAULT_NOOP
14517 +
14518 + endmenu
14519 +diff --git a/block/Makefile b/block/Makefile
14520 +index 20645e8..cbd83fb 100644
14521 +--- a/block/Makefile
14522 ++++ b/block/Makefile
14523 +@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
14524 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
14525 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
14526 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
14527 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
14528 +
14529 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
14530 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
14531 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
14532 +index b613ffd..43c5dc9 100644
14533 +--- a/include/linux/cgroup_subsys.h
14534 ++++ b/include/linux/cgroup_subsys.h
14535 +@@ -39,6 +39,10 @@ SUBSYS(net_cls)
14536 + SUBSYS(blkio)
14537 + #endif
14538 +
14539 ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
14540 ++SUBSYS(bfqio)
14541 ++#endif
14542 ++
14543 + #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
14544 + SUBSYS(perf)
14545 + #endif
14546 +--
14547 +1.8.5.2
14548 +
14549
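The cgroup_subsys.h hunk above is what makes the new controller visible to the cgroup core: with CONFIG_CGROUP_BFQIO enabled, SUBSYS(bfqio) is expanded by the cgroup headers into, among other things, a bfqio_subsys_id enumerator, which the bfq-cgroup.c code added by the next patch relies on (for example in task_css(current, bfqio_subsys_id) and in the .subsys_id field of bfqio_subsys). Roughly, and only as an illustration of the mechanism rather than the exact 3.13 header text, the expansion looks like this:

/* Illustration of the SUBSYS() expansion; the real cgroup.h differs in detail. */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>	/* SUBSYS(blkio), SUBSYS(bfqio), ... */
	CGROUP_SUBSYS_COUNT,
};
#undef SUBSYS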
14550 Added: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
14551 ===================================================================
14552 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 (rev 0)
14553 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666)
14554 @@ -0,0 +1,6040 @@
14555 +From be5107dc591f7ae692ca7cceecbba72e4c174c37 Mon Sep 17 00:00:00 2001
14556 +From: Paolo Valente <paolo.valente@×××××××.it>
14557 +Date: Thu, 9 May 2013 19:10:02 +0200
14558 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r1 I/O sched for 3.13
14559 +
14560 +Add the BFQ-v7r1 I/O scheduler to 3.13.
14561 +The general structure is borrowed from CFQ, as much of the code for
14562 +handling I/O contexts. Over time, several useful features have been
14563 +ported from CFQ as well (details in the changelog in README.BFQ). A
14564 +(bfq_)queue is associated to each task doing I/O on a device, and each
14565 +time a scheduling decision has to be made a queue is selected and served
14566 +until it expires.
14567 +
14568 + - Slices are given in the service domain: tasks are assigned
14569 + budgets, measured in number of sectors. Once granted the disk, a task
14570 + must however consume its assigned budget within a configurable
14571 + maximum time (by default, the maximum possible value of the
14572 + budgets is automatically computed to comply with this timeout).
14573 + This allows the desired latency vs "throughput boosting" tradeoff
14574 + to be set.
14575 +
14576 + - Budgets are scheduled according to a variant of WF2Q+, implemented
14577 + using an augmented rb-tree to take eligibility into account while
14578 + preserving an O(log N) overall complexity.
14579 +
14580 + - A low-latency tunable is provided; if enabled, both interactive
14581 + and soft real-time applications are guaranteed a very low latency.
14582 +
14583 + - Latency guarantees are preserved also in the presence of NCQ.
14584 +
14585 + - Also with flash-based devices, a high throughput is achieved
14586 + while still preserving latency guarantees.
14587 +
14588 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
14589 + cooperating-queue-merging and the preemption mechanisms present
14590 + in CFQ. EQM is in fact a unified mechanism that tries to get a
14591 + sequential read pattern, and hence a high throughput, with any
14592 + set of processes performing interleaved I/O over a contiguous
14593 + sequence of sectors.
14594 +
14595 + - BFQ supports full hierarchical scheduling, exporting a cgroups
14596 + interface. Since each node has a full scheduler, each group can
14597 + be assigned its own weight.
14598 +
14599 + - If the cgroups interface is not used, only I/O priorities can be
14600 + assigned to processes, with ioprio values mapped to weights
14601 + with the relation weight = IOPRIO_BE_NR - ioprio.
14602 +
14603 + - ioprio classes are served in strict priority order, i.e., lower
14604 + priority queues are not served as long as there are higher
14605 + priority queues. Among queues in the same class the bandwidth is
14606 + distributed in proportion to the weight of each queue. A very
14607 + thin extra bandwidth is however guaranteed to the Idle class, to
14608 + prevent it from starving.
14609 +
14610 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
14611 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
14612 +---
14613 + block/bfq-cgroup.c | 911 ++++++++++++++
14614 + block/bfq-ioc.c | 36 +
14615 + block/bfq-iosched.c | 3298 +++++++++++++++++++++++++++++++++++++++++++++++++++
14616 + block/bfq-sched.c | 1078 +++++++++++++++++
14617 + block/bfq.h | 614 ++++++++++
14618 + 5 files changed, 5937 insertions(+)
14619 + create mode 100644 block/bfq-cgroup.c
14620 + create mode 100644 block/bfq-ioc.c
14621 + create mode 100644 block/bfq-iosched.c
14622 + create mode 100644 block/bfq-sched.c
14623 + create mode 100644 block/bfq.h
14624 +
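As the changelog above notes, when the cgroups interface is not used the per-process ioprio values are mapped to weights with weight = IOPRIO_BE_NR - ioprio. With IOPRIO_BE_NR equal to 8, as in kernels of this era, the relation amounts to the tiny helper sketched below (illustrative only; the patch defines its own bfq_ioprio_to_weight() helper):

/* Sketch of the ioprio-to-weight relation from the changelog, assuming
 * IOPRIO_BE_NR == 8: ioprio 0 (highest) -> weight 8, ioprio 7 -> weight 1. */
static inline int sketch_ioprio_to_weight(int ioprio)
{
	return 8 /* IOPRIO_BE_NR */ - ioprio;
}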
14625 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
14626 +new file mode 100644
14627 +index 0000000..79a288a
14628 +--- /dev/null
14629 ++++ b/block/bfq-cgroup.c
14630 +@@ -0,0 +1,911 @@
14631 ++/*
14632 ++ * BFQ: CGROUPS support.
14633 ++ *
14634 ++ * Based on ideas and code from CFQ:
14635 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
14636 ++ *
14637 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
14638 ++ * Paolo Valente <paolo.valente@×××××××.it>
14639 ++ *
14640 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
14641 ++ *
14642 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
14643 ++ */
14644 ++
14645 ++#ifdef CONFIG_CGROUP_BFQIO
14646 ++
14647 ++static DEFINE_MUTEX(bfqio_mutex);
14648 ++
14649 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
14650 ++{
14651 ++ return bgrp ? !bgrp->online : false;
14652 ++}
14653 ++
14654 ++static struct bfqio_cgroup bfqio_root_cgroup = {
14655 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
14656 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
14657 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
14658 ++};
14659 ++
14660 ++static inline void bfq_init_entity(struct bfq_entity *entity,
14661 ++ struct bfq_group *bfqg)
14662 ++{
14663 ++ entity->weight = entity->new_weight;
14664 ++ entity->orig_weight = entity->new_weight;
14665 ++ entity->ioprio = entity->new_ioprio;
14666 ++ entity->ioprio_class = entity->new_ioprio_class;
14667 ++ entity->parent = bfqg->my_entity;
14668 ++ entity->sched_data = &bfqg->sched_data;
14669 ++}
14670 ++
14671 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
14672 ++{
14673 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
14674 ++}
14675 ++
14676 ++/*
14677 ++ * Search the hash table (for now just a list) of bgrp for the bfq_group
14678 ++ * associated with bfqd. Must be called under rcu_read_lock().
14679 ++ */
14680 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
14681 ++ struct bfq_data *bfqd)
14682 ++{
14683 ++ struct bfq_group *bfqg;
14684 ++ void *key;
14685 ++
14686 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
14687 ++ key = rcu_dereference(bfqg->bfqd);
14688 ++ if (key == bfqd)
14689 ++ return bfqg;
14690 ++ }
14691 ++
14692 ++ return NULL;
14693 ++}
14694 ++
14695 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
14696 ++ struct bfq_group *bfqg)
14697 ++{
14698 ++ struct bfq_entity *entity = &bfqg->entity;
14699 ++
14700 ++ /*
14701 ++ * If the weight of the entity has never been set via the sysfs
14702 ++ * interface, then bgrp->weight == 0. In this case we initialize
14703 ++ * the weight from the current ioprio value. Otherwise, the group
14704 ++ * weight, if set, has priority over the ioprio value.
14705 ++ */
14706 ++ if (bgrp->weight == 0) {
14707 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
14708 ++ entity->new_ioprio = bgrp->ioprio;
14709 ++ } else {
14710 ++ entity->new_weight = bgrp->weight;
14711 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
14712 ++ }
14713 ++ entity->orig_weight = entity->weight = entity->new_weight;
14714 ++ entity->ioprio = entity->new_ioprio;
14715 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
14716 ++ entity->my_sched_data = &bfqg->sched_data;
14717 ++}
14718 ++
14719 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
14720 ++ struct bfq_group *parent)
14721 ++{
14722 ++ struct bfq_entity *entity;
14723 ++
14724 ++ BUG_ON(parent == NULL);
14725 ++ BUG_ON(bfqg == NULL);
14726 ++
14727 ++ entity = &bfqg->entity;
14728 ++ entity->parent = parent->my_entity;
14729 ++ entity->sched_data = &parent->sched_data;
14730 ++}
14731 ++
14732 ++/**
14733 ++ * bfq_group_chain_alloc - allocate a chain of groups.
14734 ++ * @bfqd: queue descriptor.
14735 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
14736 ++ *
14737 ++ * Allocate a chain of groups starting from the one belonging to
14738 ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
14739 ++ * to the root already has an allocated group on @bfqd.
14740 ++ */
14741 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
14742 ++ struct cgroup_subsys_state *css)
14743 ++{
14744 ++ struct bfqio_cgroup *bgrp;
14745 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
14746 ++
14747 ++ for (; css != NULL; css = css->parent) {
14748 ++ bgrp = css_to_bfqio(css);
14749 ++
14750 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
14751 ++ if (bfqg != NULL) {
14752 ++ /*
14753 ++ * All the cgroups in the path from there to the
14754 ++ * root must have a bfq_group for bfqd, so we don't
14755 ++ * need any more allocations.
14756 ++ */
14757 ++ break;
14758 ++ }
14759 ++
14760 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
14761 ++ if (bfqg == NULL)
14762 ++ goto cleanup;
14763 ++
14764 ++ bfq_group_init_entity(bgrp, bfqg);
14765 ++ bfqg->my_entity = &bfqg->entity;
14766 ++
14767 ++ if (leaf == NULL) {
14768 ++ leaf = bfqg;
14769 ++ prev = leaf;
14770 ++ } else {
14771 ++ bfq_group_set_parent(prev, bfqg);
14772 ++ /*
14773 ++ * Build a list of allocated nodes using the bfqd
14774 ++ * field, which is still unused and will be initialized
14775 ++ * only after the node is connected.
14776 ++ */
14777 ++ prev->bfqd = bfqg;
14778 ++ prev = bfqg;
14779 ++ }
14780 ++ }
14781 ++
14782 ++ return leaf;
14783 ++
14784 ++cleanup:
14785 ++ while (leaf != NULL) {
14786 ++ prev = leaf;
14787 ++ leaf = leaf->bfqd;
14788 ++ kfree(prev);
14789 ++ }
14790 ++
14791 ++ return NULL;
14792 ++}
14793 ++
14794 ++/**
14795 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
14796 ++ * @bfqd: the queue descriptor.
14797 ++ * @css: the leaf cgroup_subsys_state to start from.
14798 ++ * @leaf: the leaf group (to be associated to @cgroup).
14799 ++ *
14800 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
14801 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
14802 ++ * hierarchy that already as a group associated to @bfqd all the nodes
14803 ++ * hierarchy that already has a group associated to @bfqd, all the nodes
14804 ++ *
14805 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
14806 ++ * per device) while the bfqio_cgroup lock protects the list of groups
14807 ++ * belonging to the same cgroup.
14808 ++ */
14809 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
14810 ++ struct cgroup_subsys_state *css,
14811 ++ struct bfq_group *leaf)
14812 ++{
14813 ++ struct bfqio_cgroup *bgrp;
14814 ++ struct bfq_group *bfqg, *next, *prev = NULL;
14815 ++ unsigned long flags;
14816 ++
14817 ++ assert_spin_locked(bfqd->queue->queue_lock);
14818 ++
14819 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
14820 ++ bgrp = css_to_bfqio(css);
14821 ++ next = leaf->bfqd;
14822 ++
14823 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
14824 ++ BUG_ON(bfqg != NULL);
14825 ++
14826 ++ spin_lock_irqsave(&bgrp->lock, flags);
14827 ++
14828 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
14829 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
14830 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
14831 ++
14832 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
14833 ++
14834 ++ prev = leaf;
14835 ++ leaf = next;
14836 ++ }
14837 ++
14838 ++ BUG_ON(css == NULL && leaf != NULL);
14839 ++ if (css != NULL && prev != NULL) {
14840 ++ bgrp = css_to_bfqio(css);
14841 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
14842 ++ bfq_group_set_parent(prev, bfqg);
14843 ++ }
14844 ++}
14845 ++
14846 ++/**
14847 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
14848 ++ * @bfqd: queue descriptor.
14849 ++ * @cgroup: cgroup being searched for.
14850 ++ *
14851 ++ * Return a group associated to @bfqd in @cgroup, allocating one if
14852 ++ * necessary. When a group is returned all the cgroups in the path
14853 ++ * to the root have a group associated to @bfqd.
14854 ++ *
14855 ++ * If the allocation fails, return the root group: this breaks guarantees
14856 ++ * but is a safe fallback. If this loss becomes a problem it can be
14857 ++ * mitigated using the equivalent weight (given by the product of the
14858 ++ * weights of the groups in the path from @group to the root) in the
14859 ++ * root scheduler.
14860 ++ *
14861 ++ * We allocate all the missing nodes in the path from the leaf cgroup
14862 ++ * to the root and we connect the nodes only after all the allocations
14863 ++ * have been successful.
14864 ++ */
14865 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
14866 ++ struct cgroup_subsys_state *css)
14867 ++{
14868 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14869 ++ struct bfq_group *bfqg;
14870 ++
14871 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
14872 ++ if (bfqg != NULL)
14873 ++ return bfqg;
14874 ++
14875 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
14876 ++ if (bfqg != NULL)
14877 ++ bfq_group_chain_link(bfqd, css, bfqg);
14878 ++ else
14879 ++ bfqg = bfqd->root_group;
14880 ++
14881 ++ return bfqg;
14882 ++}
14883 ++
14884 ++/**
14885 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
14886 ++ * @bfqd: queue descriptor.
14887 ++ * @bfqq: the queue to move.
14888 ++ * @entity: @bfqq's entity.
14889 ++ * @bfqg: the group to move to.
14890 ++ *
14891 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
14892 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
14893 ++ *
14894 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
14895 ++ * not disappear (by now this just means that we are called under
14896 ++ * rcu_read_lock()).
14897 ++ */
14898 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14899 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
14900 ++{
14901 ++ int busy, resume;
14902 ++
14903 ++ busy = bfq_bfqq_busy(bfqq);
14904 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
14905 ++
14906 ++ BUG_ON(resume && !entity->on_st);
14907 ++ BUG_ON(busy && !resume && entity->on_st &&
14908 ++ bfqq != bfqd->in_service_queue);
14909 ++
14910 ++ if (busy) {
14911 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
14912 ++
14913 ++ if (!resume)
14914 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
14915 ++ else
14916 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
14917 ++ } else if (entity->on_st)
14918 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
14919 ++
14920 ++ /*
14921 ++ * Here we use a reference to bfqg. We don't need a refcounter
14922 ++ * as the cgroup reference will not be dropped, so that its
14923 ++ * destroy() callback will not be invoked.
14924 ++ */
14925 ++ entity->parent = bfqg->my_entity;
14926 ++ entity->sched_data = &bfqg->sched_data;
14927 ++
14928 ++ if (busy && resume)
14929 ++ bfq_activate_bfqq(bfqd, bfqq);
14930 ++
14931 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
14932 ++ bfq_schedule_dispatch(bfqd);
14933 ++}
14934 ++
14935 ++/**
14936 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
14937 ++ * @bfqd: the queue descriptor.
14938 ++ * @bic: the bic to move.
14939 ++ * @cgroup: the cgroup to move to.
14940 ++ *
14941 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
14942 ++ * has to make sure that the reference to cgroup is valid across the call.
14943 ++ *
14944 ++ * NOTE: an alternative approach might have been to store the current
14945 ++ * cgroup in bfqq and getting a reference to it, reducing the lookup
14946 ++ * time here, at the price of slightly more complex code.
14947 ++ */
14948 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14949 ++ struct bfq_io_cq *bic,
14950 ++ struct cgroup_subsys_state *css)
14951 ++{
14952 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
14953 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
14954 ++ struct bfq_entity *entity;
14955 ++ struct bfq_group *bfqg;
14956 ++ struct bfqio_cgroup *bgrp;
14957 ++
14958 ++ bgrp = css_to_bfqio(css);
14959 ++
14960 ++ bfqg = bfq_find_alloc_group(bfqd, css);
14961 ++ if (async_bfqq != NULL) {
14962 ++ entity = &async_bfqq->entity;
14963 ++
14964 ++ if (entity->sched_data != &bfqg->sched_data) {
14965 ++ bic_set_bfqq(bic, NULL, 0);
14966 ++ bfq_log_bfqq(bfqd, async_bfqq,
14967 ++ "bic_change_group: %p %d",
14968 ++ async_bfqq, atomic_read(&async_bfqq->ref));
14969 ++ bfq_put_queue(async_bfqq);
14970 ++ }
14971 ++ }
14972 ++
14973 ++ if (sync_bfqq != NULL) {
14974 ++ entity = &sync_bfqq->entity;
14975 ++ if (entity->sched_data != &bfqg->sched_data)
14976 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
14977 ++ }
14978 ++
14979 ++ return bfqg;
14980 ++}
14981 ++
14982 ++/**
14983 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
14984 ++ * @bic: the bic being migrated.
14985 ++ * @cgroup: the destination cgroup.
14986 ++ *
14987 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
14988 ++ * moved into its new parent group.
14989 ++ */
14990 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
14991 ++ struct cgroup_subsys_state *css)
14992 ++{
14993 ++ struct bfq_data *bfqd;
14994 ++ unsigned long uninitialized_var(flags);
14995 ++
14996 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
14997 ++ &flags);
14998 ++ if (bfqd != NULL) {
14999 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
15000 ++ bfq_put_bfqd_unlock(bfqd, &flags);
15001 ++ }
15002 ++}
15003 ++
15004 ++/**
15005 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
15006 ++ * @bic: the @bic to update.
15007 ++ *
15008 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
15009 ++ * We need this in addition to moving bics during the cgroup attach
15010 ++ * phase because the task owning @bic could be at its first disk
15011 ++ * access or we may end up in the root cgroup as the result of a
15012 ++ * memory allocation failure and here we try to move to the right
15013 ++ * group.
15014 ++ *
15015 ++ * Must be called under the queue lock. It is safe to use the returned
15016 ++ * value even after the rcu_read_unlock() as the migration/destruction
15017 ++ * paths act under the queue lock too. IOW it is impossible to race with
15018 ++ * group migration/destruction and end up with an invalid group as:
15019 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
15020 ++ * has started execution, as current holds a reference to it,
15021 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
15022 ++ * migrated to a different cgroup] its attach() callback will have
15023 ++ * taken care of removing all the references to the old cgroup data.
15024 ++ */
15025 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
15026 ++{
15027 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
15028 ++ struct bfq_group *bfqg;
15029 ++ struct cgroup_subsys_state *css;
15030 ++
15031 ++ BUG_ON(bfqd == NULL);
15032 ++
15033 ++ rcu_read_lock();
15034 ++ css = task_css(current, bfqio_subsys_id);
15035 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
15036 ++ rcu_read_unlock();
15037 ++
15038 ++ return bfqg;
15039 ++}
15040 ++
15041 ++/**
15042 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
15043 ++ * @st: the service tree being flushed.
15044 ++ */
15045 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
15046 ++{
15047 ++ struct bfq_entity *entity = st->first_idle;
15048 ++
15049 ++ for (; entity != NULL; entity = st->first_idle)
15050 ++ __bfq_deactivate_entity(entity, 0);
15051 ++}
15052 ++
15053 ++/**
15054 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
15055 ++ * @bfqd: the device data structure with the root group.
15056 ++ * @entity: the entity to move.
15057 ++ */
15058 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
15059 ++ struct bfq_entity *entity)
15060 ++{
15061 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
15062 ++
15063 ++ BUG_ON(bfqq == NULL);
15064 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
15065 ++ return;
15066 ++}
15067 ++
15068 ++/**
15069 ++ * bfq_reparent_active_entities - move to the root group all active entities.
15070 ++ * @bfqd: the device data structure with the root group.
15071 ++ * @bfqg: the group to move from.
15072 ++ * @st: the service tree with the entities.
15073 ++ *
15074 ++ * Needs queue_lock to be taken and reference to be valid over the call.
15075 ++ */
15076 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
15077 ++ struct bfq_group *bfqg,
15078 ++ struct bfq_service_tree *st)
15079 ++{
15080 ++ struct rb_root *active = &st->active;
15081 ++ struct bfq_entity *entity = NULL;
15082 ++
15083 ++ if (!RB_EMPTY_ROOT(&st->active))
15084 ++ entity = bfq_entity_of(rb_first(active));
15085 ++
15086 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
15087 ++ bfq_reparent_leaf_entity(bfqd, entity);
15088 ++
15089 ++ if (bfqg->sched_data.in_service_entity != NULL)
15090 ++ bfq_reparent_leaf_entity(bfqd,
15091 ++ bfqg->sched_data.in_service_entity);
15092 ++
15093 ++ return;
15094 ++}
15095 ++
15096 ++/**
15097 ++ * bfq_destroy_group - destroy @bfqg.
15098 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
15099 ++ * @bfqg: the group being destroyed.
15100 ++ *
15101 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
15102 ++ */
15103 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
15104 ++{
15105 ++ struct bfq_data *bfqd;
15106 ++ struct bfq_service_tree *st;
15107 ++ struct bfq_entity *entity = bfqg->my_entity;
15108 ++ unsigned long uninitialized_var(flags);
15109 ++ int i;
15110 ++
15111 ++ hlist_del(&bfqg->group_node);
15112 ++
15113 ++ /*
15114 ++ * Empty all service_trees belonging to this group before deactivating
15115 ++ * the group itself.
15116 ++ */
15117 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
15118 ++ st = bfqg->sched_data.service_tree + i;
15119 ++
15120 ++ /*
15121 ++ * The idle tree may still contain bfq_queues belonging
15122 ++ * to exited tasks because they never migrated to a different
15123 ++ * cgroup from the one being destroyed now. No one else
15124 ++ * can access them so it's safe to act without any lock.
15125 ++ */
15126 ++ bfq_flush_idle_tree(st);
15127 ++
15128 ++ /*
15129 ++ * It may happen that some queues are still active
15130 ++ * (busy) upon group destruction (if the corresponding
15131 ++ * processes have been forced to terminate). We move
15132 ++ * all the leaf entities corresponding to these queues
15133 ++ * to the root_group.
15134 ++ * Also, it may happen that the group has an entity
15135 ++ * under service, which is disconnected from the active
15136 ++ * tree: it must be moved, too.
15137 ++ * There is no need to put the sync queues, as the
15138 ++ * scheduler has taken no reference.
15139 ++ */
15140 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
15141 ++ if (bfqd != NULL) {
15142 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
15143 ++ bfq_put_bfqd_unlock(bfqd, &flags);
15144 ++ }
15145 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
15146 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
15147 ++ }
15148 ++ BUG_ON(bfqg->sched_data.next_in_service != NULL);
15149 ++ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
15150 ++
15151 ++ /*
15152 ++ * We may race with device destruction, take extra care when
15153 ++ * dereferencing bfqg->bfqd.
15154 ++ */
15155 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
15156 ++ if (bfqd != NULL) {
15157 ++ hlist_del(&bfqg->bfqd_node);
15158 ++ __bfq_deactivate_entity(entity, 0);
15159 ++ bfq_put_async_queues(bfqd, bfqg);
15160 ++ bfq_put_bfqd_unlock(bfqd, &flags);
15161 ++ }
15162 ++ BUG_ON(entity->tree != NULL);
15163 ++
15164 ++ /*
15165 ++ * No need to defer the kfree() to the end of the RCU grace
15166 ++ * period: we are called from the destroy() callback of our
15167 ++ * cgroup, so we can be sure that no one is a) still using
15168 ++ * this cgroup or b) doing lookups in it.
15169 ++ */
15170 ++ kfree(bfqg);
15171 ++}
15172 ++
15173 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
15174 ++{
15175 ++ struct hlist_node *tmp;
15176 ++ struct bfq_group *bfqg;
15177 ++
15178 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
15179 ++ bfq_end_raising_async_queues(bfqd, bfqg);
15180 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
15181 ++}
15182 ++
15183 ++/**
15184 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
15185 ++ * @bfqd: the device descriptor being exited.
15186 ++ *
15187 ++ * When the device exits we just make sure that no lookup can return
15188 ++ * the now unused group structures. They will be deallocated on cgroup
15189 ++ * destruction.
15190 ++ */
15191 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
15192 ++{
15193 ++ struct hlist_node *tmp;
15194 ++ struct bfq_group *bfqg;
15195 ++
15196 ++ bfq_log(bfqd, "disconnect_groups beginning");
15197 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
15198 ++ hlist_del(&bfqg->bfqd_node);
15199 ++
15200 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
15201 ++
15202 ++ /*
15203 ++ * Don't remove from the group hash, just set an
15204 ++ * invalid key. No lookups can race with the
15205 ++ * assignment as bfqd is being destroyed; this
15206 ++ * implies also that new elements cannot be added
15207 ++ * to the list.
15208 ++ */
15209 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
15210 ++
15211 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
15212 ++ bfqg);
15213 ++ bfq_put_async_queues(bfqd, bfqg);
15214 ++ }
15215 ++}
15216 ++
15217 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
15218 ++{
15219 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
15220 ++ struct bfq_group *bfqg = bfqd->root_group;
15221 ++
15222 ++ bfq_put_async_queues(bfqd, bfqg);
15223 ++
15224 ++ spin_lock_irq(&bgrp->lock);
15225 ++ hlist_del_rcu(&bfqg->group_node);
15226 ++ spin_unlock_irq(&bgrp->lock);
15227 ++
15228 ++ /*
15229 ++ * No need to synchronize_rcu() here: since the device is gone
15230 ++ * there cannot be any read-side access to its root_group.
15231 ++ */
15232 ++ kfree(bfqg);
15233 ++}
15234 ++
15235 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
15236 ++{
15237 ++ struct bfq_group *bfqg;
15238 ++ struct bfqio_cgroup *bgrp;
15239 ++ int i;
15240 ++
15241 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
15242 ++ if (bfqg == NULL)
15243 ++ return NULL;
15244 ++
15245 ++ bfqg->entity.parent = NULL;
15246 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
15247 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
15248 ++
15249 ++ bgrp = &bfqio_root_cgroup;
15250 ++ spin_lock_irq(&bgrp->lock);
15251 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
15252 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
15253 ++ spin_unlock_irq(&bgrp->lock);
15254 ++
15255 ++ return bfqg;
15256 ++}
15257 ++
15258 ++#define SHOW_FUNCTION(__VAR) \
15259 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
15260 ++ struct cftype *cftype) \
15261 ++{ \
15262 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
15263 ++ u64 ret = -ENODEV; \
15264 ++ \
15265 ++ mutex_lock(&bfqio_mutex); \
15266 ++ if (bfqio_is_removed(bgrp)) \
15267 ++ goto out_unlock; \
15268 ++ \
15269 ++ spin_lock_irq(&bgrp->lock); \
15270 ++ ret = bgrp->__VAR; \
15271 ++ spin_unlock_irq(&bgrp->lock); \
15272 ++ \
15273 ++out_unlock: \
15274 ++ mutex_unlock(&bfqio_mutex); \
15275 ++ return ret; \
15276 ++}
15277 ++
15278 ++SHOW_FUNCTION(weight);
15279 ++SHOW_FUNCTION(ioprio);
15280 ++SHOW_FUNCTION(ioprio_class);
15281 ++#undef SHOW_FUNCTION
15282 ++
15283 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
15284 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
15285 ++ struct cftype *cftype, \
15286 ++ u64 val) \
15287 ++{ \
15288 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
15289 ++ struct bfq_group *bfqg; \
15290 ++ int ret = -EINVAL; \
15291 ++ \
15292 ++ if (val < (__MIN) || val > (__MAX)) \
15293 ++ return ret; \
15294 ++ \
15295 ++ ret = -ENODEV; \
15296 ++ mutex_lock(&bfqio_mutex); \
15297 ++ if (bfqio_is_removed(bgrp)) \
15298 ++ goto out_unlock; \
15299 ++ ret = 0; \
15300 ++ \
15301 ++ spin_lock_irq(&bgrp->lock); \
15302 ++ bgrp->__VAR = (unsigned short)val; \
15303 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
15304 ++ /* \
15305 ++ * Setting the ioprio_changed flag of the entity \
15306 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
15307 ++ * the value of the weight to its ioprio mapping. \
15308 ++ * Set the flag only if necessary. \
15309 ++ */ \
15310 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
15311 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
15312 ++ smp_wmb(); \
15313 ++ bfqg->entity.ioprio_changed = 1; \
15314 ++ } \
15315 ++ } \
15316 ++ spin_unlock_irq(&bgrp->lock); \
15317 ++ \
15318 ++out_unlock: \
15319 ++ mutex_unlock(&bfqio_mutex); \
15320 ++ return ret; \
15321 ++}
15322 ++
15323 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
15324 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
15325 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
15326 ++#undef STORE_FUNCTION
15327 ++
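To make the macro above easier to follow: STORE_FUNCTION() stamps out one write handler per cgroup attribute, so STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT) expands to roughly the function below (hand-expanded for illustration, with whitespace adjusted and comments added; it is not additional code in the patch):

static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
				     struct cftype *cftype, u64 val)
{
	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
	struct bfq_group *bfqg;
	int ret = -EINVAL;

	/* Reject values outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT]. */
	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
		return ret;

	ret = -ENODEV;
	mutex_lock(&bfqio_mutex);
	if (bfqio_is_removed(bgrp))
		goto out_unlock;
	ret = 0;

	spin_lock_irq(&bgrp->lock);
	bgrp->weight = (unsigned short)val;
	hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) {
		/* Flag the change only when the value actually differs. */
		if ((unsigned short)val != bfqg->entity.new_weight) {
			bfqg->entity.new_weight = (unsigned short)val;
			smp_wmb();
			bfqg->entity.ioprio_changed = 1;
		}
	}
	spin_unlock_irq(&bgrp->lock);

out_unlock:
	mutex_unlock(&bfqio_mutex);
	return ret;
}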
15328 ++static struct cftype bfqio_files[] = {
15329 ++ {
15330 ++ .name = "weight",
15331 ++ .read_u64 = bfqio_cgroup_weight_read,
15332 ++ .write_u64 = bfqio_cgroup_weight_write,
15333 ++ },
15334 ++ {
15335 ++ .name = "ioprio",
15336 ++ .read_u64 = bfqio_cgroup_ioprio_read,
15337 ++ .write_u64 = bfqio_cgroup_ioprio_write,
15338 ++ },
15339 ++ {
15340 ++ .name = "ioprio_class",
15341 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
15342 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
15343 ++ },
15344 ++ { }, /* terminate */
15345 ++};
15346 ++
15347 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
15348 ++ *parent_css)
15349 ++{
15350 ++ struct bfqio_cgroup *bgrp;
15351 ++
15352 ++ if (parent_css != NULL) {
15353 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
15354 ++ if (bgrp == NULL)
15355 ++ return ERR_PTR(-ENOMEM);
15356 ++ } else
15357 ++ bgrp = &bfqio_root_cgroup;
15358 ++
15359 ++ spin_lock_init(&bgrp->lock);
15360 ++ INIT_HLIST_HEAD(&bgrp->group_data);
15361 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
15362 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
15363 ++
15364 ++ return &bgrp->css;
15365 ++}
15366 ++
15367 ++/*
15368 ++ * We cannot support shared io contexts, as we have no means to support
15369 ++ * two tasks with the same ioc in two different groups without major rework
15370 ++ * of the main bic/bfqq data structures. For now we allow a task to change
15371 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
15372 ++ * behavior is that a group containing a task that forked using CLONE_IO
15373 ++ * will not be destroyed until the tasks sharing the ioc die.
15374 ++ */
15375 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
15376 ++ struct cgroup_taskset *tset)
15377 ++{
15378 ++ struct task_struct *task;
15379 ++ struct io_context *ioc;
15380 ++ int ret = 0;
15381 ++
15382 ++ cgroup_taskset_for_each(task, css, tset) {
15383 ++ /*
15384 ++ * task_lock() is needed to avoid races with
15385 ++ * exit_io_context()
15386 ++ */
15387 ++ task_lock(task);
15388 ++ ioc = task->io_context;
15389 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
15390 ++ /*
15391 ++ * ioc == NULL means that the task is either too young
15392 ++ * or exiting: if it has still no ioc the ioc can't be
15393 ++ * shared, if the task is exiting the attach will fail
15394 ++ * anyway, no matter what we return here.
15395 ++ */
15396 ++ ret = -EINVAL;
15397 ++ task_unlock(task);
15398 ++ if (ret)
15399 ++ break;
15400 ++ }
15401 ++
15402 ++ return ret;
15403 ++}
15404 ++
15405 ++static void bfqio_attach(struct cgroup_subsys_state *css,
15406 ++ struct cgroup_taskset *tset)
15407 ++{
15408 ++ struct task_struct *task;
15409 ++ struct io_context *ioc;
15410 ++ struct io_cq *icq;
15411 ++
15412 ++ /*
15413 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
15414 ++ * new group has not yet been tested.
15415 ++ */
15416 ++ cgroup_taskset_for_each(task, css, tset) {
15417 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
15418 ++ if (ioc) {
15419 ++ /*
15420 ++ * Handle cgroup change here.
15421 ++ */
15422 ++ rcu_read_lock();
15423 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
15424 ++ if (!strncmp(
15425 ++ icq->q->elevator->type->elevator_name,
15426 ++ "bfq", ELV_NAME_MAX))
15427 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
15428 ++ css);
15429 ++ rcu_read_unlock();
15430 ++ put_io_context(ioc);
15431 ++ }
15432 ++ }
15433 ++}
15434 ++
15435 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
15436 ++{
15437 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
15438 ++ struct hlist_node *tmp;
15439 ++ struct bfq_group *bfqg;
15440 ++
15441 ++ /*
15442 ++ * Since we are destroying the cgroup, there are no more tasks
15443 ++ * referencing it, and all the RCU grace periods that may have
15444 ++ * referenced it are ended (as the destruction of the parent
15445 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
15446 ++ * anything else and we don't need any synchronization.
15447 ++ */
15448 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
15449 ++ bfq_destroy_group(bgrp, bfqg);
15450 ++
15451 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
15452 ++
15453 ++ kfree(bgrp);
15454 ++}
15455 ++
15456 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
15457 ++{
15458 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
15459 ++
15460 ++ mutex_lock(&bfqio_mutex);
15461 ++ bgrp->online = true;
15462 ++ mutex_unlock(&bfqio_mutex);
15463 ++
15464 ++ return 0;
15465 ++}
15466 ++
15467 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
15468 ++{
15469 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
15470 ++
15471 ++ mutex_lock(&bfqio_mutex);
15472 ++ bgrp->online = false;
15473 ++ mutex_unlock(&bfqio_mutex);
15474 ++}
15475 ++
15476 ++struct cgroup_subsys bfqio_subsys = {
15477 ++ .name = "bfqio",
15478 ++ .css_alloc = bfqio_create,
15479 ++ .css_online = bfqio_css_online,
15480 ++ .css_offline = bfqio_css_offline,
15481 ++ .can_attach = bfqio_can_attach,
15482 ++ .attach = bfqio_attach,
15483 ++ .css_free = bfqio_destroy,
15484 ++ .subsys_id = bfqio_subsys_id,
15485 ++ .base_cftypes = bfqio_files,
15486 ++};
15487 ++#else
15488 ++static inline void bfq_init_entity(struct bfq_entity *entity,
15489 ++ struct bfq_group *bfqg)
15490 ++{
15491 ++ entity->weight = entity->new_weight;
15492 ++ entity->orig_weight = entity->new_weight;
15493 ++ entity->ioprio = entity->new_ioprio;
15494 ++ entity->ioprio_class = entity->new_ioprio_class;
15495 ++ entity->sched_data = &bfqg->sched_data;
15496 ++}
15497 ++
15498 ++static inline struct bfq_group *
15499 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
15500 ++{
15501 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
15502 ++ return bfqd->root_group;
15503 ++}
15504 ++
15505 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
15506 ++ struct bfq_queue *bfqq,
15507 ++ struct bfq_entity *entity,
15508 ++ struct bfq_group *bfqg)
15509 ++{
15510 ++}
15511 ++
15512 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
15513 ++{
15514 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
15515 ++}
15516 ++
15517 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
15518 ++{
15519 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
15520 ++}
15521 ++
15522 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
15523 ++{
15524 ++ kfree(bfqd->root_group);
15525 ++}
15526 ++
15527 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
15528 ++{
15529 ++ struct bfq_group *bfqg;
15530 ++ int i;
15531 ++
15532 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
15533 ++ if (bfqg == NULL)
15534 ++ return NULL;
15535 ++
15536 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
15537 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
15538 ++
15539 ++ return bfqg;
15540 ++}
15541 ++#endif
15542 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
15543 +new file mode 100644
15544 +index 0000000..7f6b000
15545 +--- /dev/null
15546 ++++ b/block/bfq-ioc.c
15547 +@@ -0,0 +1,36 @@
15548 ++/*
15549 ++ * BFQ: I/O context handling.
15550 ++ *
15551 ++ * Based on ideas and code from CFQ:
15552 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
15553 ++ *
15554 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
15555 ++ * Paolo Valente <paolo.valente@×××××××.it>
15556 ++ *
15557 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
15558 ++ */
15559 ++
15560 ++/**
15561 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
15562 ++ * @icq: the iocontext queue.
15563 ++ */
15564 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
15565 ++{
15566 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
15567 ++ return container_of(icq, struct bfq_io_cq, icq);
15568 ++}
15569 ++
15570 ++/**
15571 ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
15572 ++ * @bfqd: the lookup key.
15573 ++ * @ioc: the io_context of the process doing I/O.
15574 ++ *
15575 ++ * Queue lock must be held.
15576 ++ */
15577 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
15578 ++ struct io_context *ioc)
15579 ++{
15580 ++ if (ioc)
15581 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
15582 ++ return NULL;
15583 ++}
15584 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
15585 +new file mode 100644
15586 +index 0000000..eb760de
15587 +--- /dev/null
15588 ++++ b/block/bfq-iosched.c
15589 +@@ -0,0 +1,3298 @@
15590 ++/*
15591 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
15592 ++ *
15593 ++ * Based on ideas and code from CFQ:
15594 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
15595 ++ *
15596 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
15597 ++ * Paolo Valente <paolo.valente@×××××××.it>
15598 ++ *
15599 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
15600 ++ *
15601 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
15602 ++ *
15603 ++ * BFQ is a proportional share disk scheduling algorithm based on the
15604 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
15605 ++ * number of sectors, to tasks instead of time slices. The disk is not granted
15606 ++ * to the in-service task for a given time slice, but until it has exhausted
15607 ++ * its assigned budget. This change from the time to the service domain allows
15608 ++ * BFQ to distribute the disk bandwidth among tasks as desired, without any
15609 ++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
15610 ++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
15611 ++ * their budgets (more precisely BFQ schedules queues associated to tasks).
15612 ++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
15613 ++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
15614 ++ * latencies to interactive and soft real-time applications.
15615 ++ *
15616 ++ * BFQ is described in [1], where also a reference to the initial, more
15617 ++ * theoretical paper on BFQ can be found. The interested reader can find in
15618 ++ * the latter paper full details on the main algorithm as well as formulas of
15619 ++ * the guarantees, plus formal proofs of all the properties. With respect to
15620 ++ * the version of BFQ presented in these papers, this implementation adds a
15621 ++ * few more heuristics, such as the one that guarantees a low latency to soft
15622 ++ * real-time applications, and a hierarchical extension based on H-WF2Q+.
15623 ++ *
15624 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
15625 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
15626 ++ * complexity derives from the one introduced with EEVDF in [3].
15627 ++ *
15628 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
15629 ++ * with the BFQ Disk I/O Scheduler'',
15630 ++ * Proceedings of the 5th Annual International Systems and Storage
15631 ++ * Conference (SYSTOR '12), June 2012.
15632 ++ *
15633 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
15634 ++ *
15635 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
15636 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
15637 ++ * Oct 1997.
15638 ++ *
15639 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
15640 ++ *
15641 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
15642 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
15643 ++ * Resource Allocation,'' technical report.
15644 ++ *
15645 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
15646 ++ */
15647 ++#include <linux/module.h>
15648 ++#include <linux/slab.h>
15649 ++#include <linux/blkdev.h>
15650 ++#include <linux/cgroup.h>
15651 ++#include <linux/elevator.h>
15652 ++#include <linux/jiffies.h>
15653 ++#include <linux/rbtree.h>
15654 ++#include <linux/ioprio.h>
15655 ++#include "bfq.h"
15656 ++#include "blk.h"
15657 ++
15658 ++/* Max number of dispatches in one round of service. */
15659 ++static const int bfq_quantum = 4;
15660 ++
15661 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
15662 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
15663 ++
15664 ++/* Maximum backwards seek, in KiB. */
15665 ++static const int bfq_back_max = 16 * 1024;
15666 ++
15667 ++/* Penalty of a backwards seek, in number of sectors. */
15668 ++static const int bfq_back_penalty = 2;
15669 ++
15670 ++/* Idling period duration, in jiffies. */
15671 ++static int bfq_slice_idle = HZ / 125;
15672 ++
15673 ++/* Default maximum budget values, in sectors and number of requests. */
15674 ++static const int bfq_default_max_budget = 16 * 1024;
15675 ++static const int bfq_max_budget_async_rq = 4;
15676 ++
15677 ++/*
15678 ++ * Async to sync throughput distribution is controlled as follows:
15679 ++ * when an async request is served, the entity is charged the number
15680 ++ * of sectors of the request, multiplied by the factor below.
15681 ++ */
15682 ++static const int bfq_async_charge_factor = 10;
15683 ++
15684 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
15685 ++static const int bfq_timeout_sync = HZ / 8;
15686 ++static int bfq_timeout_async = HZ / 25;
15687 ++
15688 ++struct kmem_cache *bfq_pool;
15689 ++
15690 ++/* Below this threshold (in ms), we consider thinktime immediate. */
15691 ++#define BFQ_MIN_TT 2
15692 ++
15693 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
15694 ++#define BFQ_HW_QUEUE_THRESHOLD 4
15695 ++#define BFQ_HW_QUEUE_SAMPLES 32
15696 ++
15697 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
15698 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
15699 ++
15700 ++/* Min samples used for peak rate estimation (for autotuning). */
15701 ++#define BFQ_PEAK_RATE_SAMPLES 32
15702 ++
15703 ++/* Shift used for peak rate fixed precision calculations. */
15704 ++#define BFQ_RATE_SHIFT 16
15705 ++
15706 ++/*
15707 ++ * The duration of the weight raising for interactive applications is
15708 ++ * computed automatically (as default behaviour), using the following
15709 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
15710 ++ * disk, and R and T are two reference parameters. In particular, R is
15711 ++ * the peak rate of a reference disk, and T is about the maximum time
15712 ++ * for starting popular large applications on that disk, under BFQ and
15713 ++ * while reading two files in parallel. Finally, BFQ uses two
15714 ++ * different pairs (R, T) depending on whether the disk is rotational
15715 ++ * or non-rotational.
15716 ++ */
15717 ++#define T_rot (msecs_to_jiffies(5500))
15718 ++#define T_nonrot (msecs_to_jiffies(2000))
15719 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
15720 ++#define R_rot 17415
15721 ++#define R_nonrot 34791
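++
++/*
++ * As a rough, purely illustrative application of the formula above
++ * (figures assumed, not measured): if the estimated peak rate r of a
++ * rotational drive happens to equal the reference rate R_rot, then
++ * duration = (R_rot / r) * T_rot = T_rot, i.e. about 5.5 seconds of
++ * weight raising; a drive twice as fast as the reference one would
++ * get about half of that, a slower drive proportionally more.
++ */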
15722 ++
15723 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
15724 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
15725 ++
15726 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
15727 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
15728 ++
15729 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
15730 ++
15731 ++#include "bfq-ioc.c"
15732 ++#include "bfq-sched.c"
15733 ++#include "bfq-cgroup.c"
15734 ++
15735 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
15736 ++ IOPRIO_CLASS_IDLE)
15737 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
15738 ++ IOPRIO_CLASS_RT)
15739 ++
15740 ++#define bfq_sample_valid(samples) ((samples) > 80)
15741 ++
15742 ++/*
15743 ++ * We regard a request as SYNC if it is either a read or has the SYNC bit
15744 ++ * set (in which case it could also be a direct WRITE).
15745 ++ */
15746 ++static inline int bfq_bio_sync(struct bio *bio)
15747 ++{
15748 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
15749 ++ return 1;
15750 ++
15751 ++ return 0;
15752 ++}
15753 ++
15754 ++/*
15755 ++ * Schedule a run of the queue if there are requests pending and there is
15756 ++ * nothing in the driver that will restart queueing.
15757 ++ */
15758 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
15759 ++{
15760 ++ if (bfqd->queued != 0) {
15761 ++ bfq_log(bfqd, "schedule dispatch");
15762 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
15763 ++ }
15764 ++}
15765 ++
15766 ++/*
15767 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
15768 ++ * We choose the request that is closest to the head right now. Distance
15769 ++ * behind the head is penalized and only allowed to a certain extent.
15770 ++ */
15771 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
15772 ++ struct request *rq1,
15773 ++ struct request *rq2,
15774 ++ sector_t last)
15775 ++{
15776 ++ sector_t s1, s2, d1 = 0, d2 = 0;
15777 ++ unsigned long back_max;
15778 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
15779 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
15780 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
15781 ++
15782 ++ if (rq1 == NULL || rq1 == rq2)
15783 ++ return rq2;
15784 ++ if (rq2 == NULL)
15785 ++ return rq1;
15786 ++
15787 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
15788 ++ return rq1;
15789 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
15790 ++ return rq2;
15791 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
15792 ++ return rq1;
15793 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
15794 ++ return rq2;
15795 ++
15796 ++ s1 = blk_rq_pos(rq1);
15797 ++ s2 = blk_rq_pos(rq2);
15798 ++
15799 ++ /*
15800 ++ * By definition, 1KiB is 2 sectors.
15801 ++ */
15802 ++ back_max = bfqd->bfq_back_max * 2;
15803 ++
15804 ++ /*
15805 ++ * Strict one way elevator _except_ in the case where we allow
15806 ++ * short backward seeks which are biased as twice the cost of a
15807 ++ * similar forward seek.
15808 ++ */
15809 ++ if (s1 >= last)
15810 ++ d1 = s1 - last;
15811 ++ else if (s1 + back_max >= last)
15812 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
15813 ++ else
15814 ++ wrap |= BFQ_RQ1_WRAP;
15815 ++
15816 ++ if (s2 >= last)
15817 ++ d2 = s2 - last;
15818 ++ else if (s2 + back_max >= last)
15819 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
15820 ++ else
15821 ++ wrap |= BFQ_RQ2_WRAP;
15822 ++
15823 ++ /* Found required data */
15824 ++
15825 ++ /*
15826 ++ * By doing switch() on the bit mask "wrap" we avoid having to
15827 ++ * check two variables for all permutations: --> faster!
15828 ++ */
15829 ++ switch (wrap) {
15830 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
15831 ++ if (d1 < d2)
15832 ++ return rq1;
15833 ++ else if (d2 < d1)
15834 ++ return rq2;
15835 ++ else {
15836 ++ if (s1 >= s2)
15837 ++ return rq1;
15838 ++ else
15839 ++ return rq2;
15840 ++ }
15841 ++
15842 ++ case BFQ_RQ2_WRAP:
15843 ++ return rq1;
15844 ++ case BFQ_RQ1_WRAP:
15845 ++ return rq2;
15846 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
15847 ++ default:
15848 ++ /*
15849 ++ * Since both rqs are wrapped,
15850 ++ * start with the one that's further behind head
15851 ++ * (--> only *one* back seek required),
15852 ++ * since back seek takes more time than forward.
15853 ++ */
15854 ++ if (s1 <= s2)
15855 ++ return rq1;
15856 ++ else
15857 ++ return rq2;
15858 ++ }
15859 ++}
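++
++/*
++ * Worked example for the distances computed above, using the default
++ * bfq_back_max (16384 KiB, i.e. back_max = 32768 sectors) and
++ * bfq_back_penalty (2), with hypothetical positions: if rq1 lies 1000
++ * sectors behind the head and rq2 lies 1500 sectors ahead, then
++ * d1 = 1000 * 2 = 2000 and d2 = 1500, so rq2 is chosen; had rq2 been
++ * 2500 sectors ahead, d2 = 2500 > d1 and the short backward seek to
++ * rq1 would have won instead.
++ */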
15860 ++
15861 ++static struct bfq_queue *
15862 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
15863 ++ sector_t sector, struct rb_node **ret_parent,
15864 ++ struct rb_node ***rb_link)
15865 ++{
15866 ++ struct rb_node **p, *parent;
15867 ++ struct bfq_queue *bfqq = NULL;
15868 ++
15869 ++ parent = NULL;
15870 ++ p = &root->rb_node;
15871 ++ while (*p) {
15872 ++ struct rb_node **n;
15873 ++
15874 ++ parent = *p;
15875 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
15876 ++
15877 ++ /*
15878 ++ * Sort strictly based on sector. Smallest to the left,
15879 ++ * largest to the right.
15880 ++ */
15881 ++ if (sector > blk_rq_pos(bfqq->next_rq))
15882 ++ n = &(*p)->rb_right;
15883 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
15884 ++ n = &(*p)->rb_left;
15885 ++ else
15886 ++ break;
15887 ++ p = n;
15888 ++ bfqq = NULL;
15889 ++ }
15890 ++
15891 ++ *ret_parent = parent;
15892 ++ if (rb_link)
15893 ++ *rb_link = p;
15894 ++
15895 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
15896 ++ (long long unsigned)sector,
15897 ++ bfqq != NULL ? bfqq->pid : 0);
15898 ++
15899 ++ return bfqq;
15900 ++}
15901 ++
15902 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
15903 ++{
15904 ++ struct rb_node **p, *parent;
15905 ++ struct bfq_queue *__bfqq;
15906 ++
15907 ++ if (bfqq->pos_root != NULL) {
15908 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
15909 ++ bfqq->pos_root = NULL;
15910 ++ }
15911 ++
15912 ++ if (bfq_class_idle(bfqq))
15913 ++ return;
15914 ++ if (!bfqq->next_rq)
15915 ++ return;
15916 ++
15917 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
15918 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
15919 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
15920 ++ if (__bfqq == NULL) {
15921 ++ rb_link_node(&bfqq->pos_node, parent, p);
15922 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
15923 ++ } else
15924 ++ bfqq->pos_root = NULL;
15925 ++}
15926 ++
15927 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
15928 ++ struct bfq_queue *bfqq,
15929 ++ struct request *last)
15930 ++{
15931 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
15932 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
15933 ++ struct request *next = NULL, *prev = NULL;
15934 ++
15935 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
15936 ++
15937 ++ if (rbprev != NULL)
15938 ++ prev = rb_entry_rq(rbprev);
15939 ++
15940 ++ if (rbnext != NULL)
15941 ++ next = rb_entry_rq(rbnext);
15942 ++ else {
15943 ++ rbnext = rb_first(&bfqq->sort_list);
15944 ++ if (rbnext && rbnext != &last->rb_node)
15945 ++ next = rb_entry_rq(rbnext);
15946 ++ }
15947 ++
15948 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
15949 ++}
15950 ++
15951 ++static void bfq_del_rq_rb(struct request *rq)
15952 ++{
15953 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
15954 ++ struct bfq_data *bfqd = bfqq->bfqd;
15955 ++ const int sync = rq_is_sync(rq);
15956 ++
15957 ++ BUG_ON(bfqq->queued[sync] == 0);
15958 ++ bfqq->queued[sync]--;
15959 ++ bfqd->queued--;
15960 ++
15961 ++ elv_rb_del(&bfqq->sort_list, rq);
15962 ++
15963 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
15964 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
15965 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
15966 ++ /*
15967 ++ * Remove queue from request-position tree as it is empty.
15968 ++ */
15969 ++ if (bfqq->pos_root != NULL) {
15970 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
15971 ++ bfqq->pos_root = NULL;
15972 ++ }
15973 ++ }
15974 ++}
15975 ++
15976 ++/* see the definition of bfq_async_charge_factor for details */
15977 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
15978 ++ struct bfq_queue *bfqq)
15979 ++{
15980 ++ return blk_rq_sectors(rq) *
15981 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
15982 ++ bfq_async_charge_factor));
15983 ++}
15984 ++
15985 ++/**
15986 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
15987 ++ * @bfqd: the device data the queue belongs to.
15988 ++ * @bfqq: the queue to update.
15989 ++ *
15990 ++ * If the first request of a queue changes we make sure that the queue
15991 ++ * has enough budget to serve at least its first request (if the
15992 ++ * request has grown). We do this because if the queue has not enough
15993 ++ * budget for its first request, it has to go through two dispatch
15994 ++ * rounds to actually get it dispatched.
15995 ++ */
15996 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
15997 ++ struct bfq_queue *bfqq)
15998 ++{
15999 ++ struct bfq_entity *entity = &bfqq->entity;
16000 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
16001 ++ struct request *next_rq = bfqq->next_rq;
16002 ++ unsigned long new_budget;
16003 ++
16004 ++ if (next_rq == NULL)
16005 ++ return;
16006 ++
16007 ++ if (bfqq == bfqd->in_service_queue)
16008 ++ /*
16009 ++ * In order not to break guarantees, budgets cannot be
16010 ++ * changed after an entity has been selected.
16011 ++ */
16012 ++ return;
16013 ++
16014 ++ BUG_ON(entity->tree != &st->active);
16015 ++ BUG_ON(entity == entity->sched_data->in_service_entity);
16016 ++
16017 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
16018 ++ bfq_serv_to_charge(next_rq, bfqq));
16019 ++ entity->budget = new_budget;
16020 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
16021 ++ bfq_activate_bfqq(bfqd, bfqq);
16022 ++}
16023 ++
16024 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
16025 ++{
16026 ++ u64 dur;
16027 ++
16028 ++ if (bfqd->bfq_raising_max_time > 0)
16029 ++ return bfqd->bfq_raising_max_time;
16030 ++
16031 ++ dur = bfqd->RT_prod;
16032 ++ do_div(dur, bfqd->peak_rate);
16033 ++
16034 ++ return dur;
16035 ++}
16036 ++
16037 ++static void bfq_add_rq_rb(struct request *rq)
16038 ++{
16039 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
16040 ++ struct bfq_entity *entity = &bfqq->entity;
16041 ++ struct bfq_data *bfqd = bfqq->bfqd;
16042 ++ struct request *next_rq, *prev;
16043 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
16044 ++ int idle_for_long_time = 0;
16045 ++
16046 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
16047 ++ bfqq->queued[rq_is_sync(rq)]++;
16048 ++ bfqd->queued++;
16049 ++
16050 ++ elv_rb_add(&bfqq->sort_list, rq);
16051 ++
16052 ++ /*
16053 ++ * Check if this request is a better next-serve candidate.
16054 ++ */
16055 ++ prev = bfqq->next_rq;
16056 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
16057 ++ BUG_ON(next_rq == NULL);
16058 ++ bfqq->next_rq = next_rq;
16059 ++
16060 ++ /*
16061 ++ * Adjust priority tree position, if next_rq changes.
16062 ++ */
16063 ++ if (prev != bfqq->next_rq)
16064 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
16065 ++
16066 ++ if (!bfq_bfqq_busy(bfqq)) {
16067 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
16068 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
16069 ++ idle_for_long_time = time_is_before_jiffies(
16070 ++ bfqq->budget_timeout +
16071 ++ bfqd->bfq_raising_min_idle_time);
16072 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
16073 ++ bfq_serv_to_charge(next_rq, bfqq));
16074 ++
16075 ++ if (!bfqd->low_latency)
16076 ++ goto add_bfqq_busy;
16077 ++
16078 ++ /*
16079 ++ * If the queue is not being boosted and has been idle
16080 ++ * for enough time, start a weight-raising period
16081 ++ */
16082 ++ if (old_raising_coeff == 1 &&
16083 ++ (idle_for_long_time || soft_rt)) {
16084 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
16085 ++ if (idle_for_long_time)
16086 ++ bfqq->raising_cur_max_time =
16087 ++ bfq_wrais_duration(bfqd);
16088 ++ else
16089 ++ bfqq->raising_cur_max_time =
16090 ++ bfqd->bfq_raising_rt_max_time;
16091 ++ bfq_log_bfqq(bfqd, bfqq,
16092 ++ "wrais starting at %lu, "
16093 ++ "rais_max_time %u",
16094 ++ jiffies,
16095 ++ jiffies_to_msecs(bfqq->
16096 ++ raising_cur_max_time));
16097 ++ } else if (old_raising_coeff > 1) {
16098 ++ if (idle_for_long_time)
16099 ++ bfqq->raising_cur_max_time =
16100 ++ bfq_wrais_duration(bfqd);
16101 ++ else if (bfqq->raising_cur_max_time ==
16102 ++ bfqd->bfq_raising_rt_max_time &&
16103 ++ !soft_rt) {
16104 ++ bfqq->raising_coeff = 1;
16105 ++ bfq_log_bfqq(bfqd, bfqq,
16106 ++ "wrais ending at %lu, "
16107 ++ "rais_max_time %u",
16108 ++ jiffies,
16109 ++ jiffies_to_msecs(bfqq->
16110 ++ raising_cur_max_time));
16111 ++ } else if (time_before(
16112 ++ bfqq->last_rais_start_finish +
16113 ++ bfqq->raising_cur_max_time,
16114 ++ jiffies +
16115 ++ bfqd->bfq_raising_rt_max_time) &&
16116 ++ soft_rt) {
16117 ++ /*
16119 ++ * The remaining weight-raising time is lower
16120 ++ * than bfqd->bfq_raising_rt_max_time, which
16121 ++ * means that the application is enjoying
16122 ++ * weight raising either because deemed soft rt
16123 ++ * in the near past, or because deemed
16124 ++ * interactive long ago. In both cases,
16125 ++ * resetting now the current remaining weight-
16126 ++ * raising time for the application to the
16127 ++ * weight-raising duration for soft rt
16128 ++ * applications would not cause any latency
16129 ++ * increase for the application (as the new
16130 ++ * duration would be higher than the remaining
16131 ++ * time).
16132 ++ *
16133 ++ * In addition, the application is now meeting
16134 ++ * the requirements for being deemed soft rt.
16135 ++ * In the end we can correctly and safely
16136 ++ * (re)charge the weight-raising duration for
16137 ++ * the application with the weight-raising
16138 ++ * duration for soft rt applications.
16139 ++ *
16140 ++ * In particular, doing this recharge now, i.e.,
16141 ++ * before the weight-raising period for the
16142 ++ * application finishes, reduces the probability
16143 ++ * of the following negative scenario:
16144 ++ * 1) the weight of a soft rt application is
16145 ++ * raised at startup (as for any newly
16146 ++ * created application),
16147 ++ * 2) since the application is not interactive,
16148 ++ * at a certain time weight-raising is
16149 ++ * stopped for the application,
16150 ++ * 3) at that time the application happens to
16151 ++ * still have pending requests, and hence
16152 ++ * is destined to not have a chance to be
16153 ++ * deemed soft rt before these requests are
16154 ++ * completed (see the comments to the
16155 ++ * function bfq_bfqq_softrt_next_start()
16156 ++ * for details on soft rt detection),
16157 ++ * 4) these pending requests experience a high
16158 ++ * latency because the application is not
16159 ++ * weight-raised while they are pending.
16160 ++ */
16161 ++ bfqq->last_rais_start_finish = jiffies;
16162 ++ bfqq->raising_cur_max_time =
16163 ++ bfqd->bfq_raising_rt_max_time;
16164 ++ }
16165 ++ }
16166 ++ if (old_raising_coeff != bfqq->raising_coeff)
16167 ++ entity->ioprio_changed = 1;
16168 ++add_bfqq_busy:
16169 ++ bfqq->last_idle_bklogged = jiffies;
16170 ++ bfqq->service_from_backlogged = 0;
16171 ++ bfq_clear_bfqq_softrt_update(bfqq);
16172 ++ bfq_add_bfqq_busy(bfqd, bfqq);
16173 ++ } else {
16174 ++ if (bfqd->low_latency && old_raising_coeff == 1 &&
16175 ++ !rq_is_sync(rq) &&
16176 ++ time_is_before_jiffies(
16177 ++ bfqq->last_rais_start_finish +
16178 ++ bfqd->bfq_raising_min_inter_arr_async)) {
16179 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
16180 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
16181 ++
16182 ++ bfqd->raised_busy_queues++;
16183 ++ entity->ioprio_changed = 1;
16184 ++ bfq_log_bfqq(bfqd, bfqq,
16185 ++ "non-idle wrais starting at %lu, "
16186 ++ "rais_max_time %u",
16187 ++ jiffies,
16188 ++ jiffies_to_msecs(bfqq->
16189 ++ raising_cur_max_time));
16190 ++ }
16191 ++ bfq_updated_next_req(bfqd, bfqq);
16192 ++ }
16193 ++
16194 ++ if (bfqd->low_latency &&
16195 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
16196 ++ idle_for_long_time))
16197 ++ bfqq->last_rais_start_finish = jiffies;
16198 ++}
16199 ++
16200 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
16201 ++{
16202 ++ elv_rb_del(&bfqq->sort_list, rq);
16203 ++ bfqq->queued[rq_is_sync(rq)]--;
16204 ++ bfqq->bfqd->queued--;
16205 ++ bfq_add_rq_rb(rq);
16206 ++}
16207 ++
16208 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
16209 ++ struct bio *bio)
16210 ++{
16211 ++ struct task_struct *tsk = current;
16212 ++ struct bfq_io_cq *bic;
16213 ++ struct bfq_queue *bfqq;
16214 ++
16215 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
16216 ++ if (bic == NULL)
16217 ++ return NULL;
16218 ++
16219 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
16220 ++ if (bfqq != NULL)
16221 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
16222 ++
16223 ++ return NULL;
16224 ++}
16225 ++
16226 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
16227 ++{
16228 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16229 ++
16230 ++ bfqd->rq_in_driver++;
16231 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
16232 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
16233 ++ (long long unsigned)bfqd->last_position);
16234 ++}
16235 ++
16236 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
16237 ++{
16238 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16239 ++
16240 ++ WARN_ON(bfqd->rq_in_driver == 0);
16241 ++ bfqd->rq_in_driver--;
16242 ++}
16243 ++
16244 ++static void bfq_remove_request(struct request *rq)
16245 ++{
16246 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
16247 ++ struct bfq_data *bfqd = bfqq->bfqd;
16248 ++
16249 ++ if (bfqq->next_rq == rq) {
16250 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
16251 ++ bfq_updated_next_req(bfqd, bfqq);
16252 ++ }
16253 ++
16254 ++ list_del_init(&rq->queuelist);
16255 ++ bfq_del_rq_rb(rq);
16256 ++
16257 ++ if (rq->cmd_flags & REQ_META) {
16258 ++ WARN_ON(bfqq->meta_pending == 0);
16259 ++ bfqq->meta_pending--;
16260 ++ }
16261 ++}
16262 ++
16263 ++static int bfq_merge(struct request_queue *q, struct request **req,
16264 ++ struct bio *bio)
16265 ++{
16266 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16267 ++ struct request *__rq;
16268 ++
16269 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
16270 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
16271 ++ *req = __rq;
16272 ++ return ELEVATOR_FRONT_MERGE;
16273 ++ }
16274 ++
16275 ++ return ELEVATOR_NO_MERGE;
16276 ++}
16277 ++
16278 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
16279 ++ int type)
16280 ++{
16281 ++ if (type == ELEVATOR_FRONT_MERGE) {
16282 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
16283 ++
16284 ++ bfq_reposition_rq_rb(bfqq, req);
16285 ++ }
16286 ++}
16287 ++
16288 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
16289 ++ struct request *next)
16290 ++{
16291 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
16292 ++
16293 ++ /*
16294 ++ * Reposition in fifo if next is older than rq.
16295 ++ */
16296 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
16297 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
16298 ++ list_move(&rq->queuelist, &next->queuelist);
16299 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
16300 ++ }
16301 ++
16302 ++ if (bfqq->next_rq == next)
16303 ++ bfqq->next_rq = rq;
16304 ++
16305 ++ bfq_remove_request(next);
16306 ++}
16307 ++
16308 ++/* Must be called with bfqq != NULL */
16309 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
16310 ++{
16311 ++ BUG_ON(bfqq == NULL);
16312 ++ if (bfq_bfqq_busy(bfqq))
16313 ++ bfqq->bfqd->raised_busy_queues--;
16314 ++ bfqq->raising_coeff = 1;
16315 ++ bfqq->raising_cur_max_time = 0;
16316 ++ /* Trigger a weight change on the next activation of the queue */
16317 ++ bfqq->entity.ioprio_changed = 1;
16318 ++}
16319 ++
16320 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
16321 ++ struct bfq_group *bfqg)
16322 ++{
16323 ++ int i, j;
16324 ++
16325 ++ for (i = 0; i < 2; i++)
16326 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
16327 ++ if (bfqg->async_bfqq[i][j] != NULL)
16328 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
16329 ++ if (bfqg->async_idle_bfqq != NULL)
16330 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
16331 ++}
16332 ++
16333 ++static void bfq_end_raising(struct bfq_data *bfqd)
16334 ++{
16335 ++ struct bfq_queue *bfqq;
16336 ++
16337 ++ spin_lock_irq(bfqd->queue->queue_lock);
16338 ++
16339 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
16340 ++ bfq_bfqq_end_raising(bfqq);
16341 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
16342 ++ bfq_bfqq_end_raising(bfqq);
16343 ++ bfq_end_raising_async(bfqd);
16344 ++
16345 ++ spin_unlock_irq(bfqd->queue->queue_lock);
16346 ++}
16347 ++
16348 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
16349 ++ struct bio *bio)
16350 ++{
16351 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16352 ++ struct bfq_io_cq *bic;
16353 ++ struct bfq_queue *bfqq;
16354 ++
16355 ++ /*
16356 ++ * Disallow merge of a sync bio into an async request.
16357 ++ */
16358 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
16359 ++ return 0;
16360 ++
16361 ++ /*
16362 ++ * Lookup the bfqq that this bio will be queued with. Allow
16363 ++ * merge only if rq is queued there.
16364 ++ * Queue lock is held here.
16365 ++ */
16366 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
16367 ++ if (bic == NULL)
16368 ++ return 0;
16369 ++
16370 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
16371 ++ return bfqq == RQ_BFQQ(rq);
16372 ++}
16373 ++
16374 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
16375 ++ struct bfq_queue *bfqq)
16376 ++{
16377 ++ if (bfqq != NULL) {
16378 ++ bfq_mark_bfqq_must_alloc(bfqq);
16379 ++ bfq_mark_bfqq_budget_new(bfqq);
16380 ++ bfq_clear_bfqq_fifo_expire(bfqq);
16381 ++
16382 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
16383 ++
16384 ++ bfq_log_bfqq(bfqd, bfqq,
16385 ++ "set_in_service_queue, cur-budget = %lu",
16386 ++ bfqq->entity.budget);
16387 ++ }
16388 ++
16389 ++ bfqd->in_service_queue = bfqq;
16390 ++}
16391 ++
16392 ++/*
16393 ++ * Get and set a new queue for service.
16394 ++ */
16395 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
16396 ++ struct bfq_queue *bfqq)
16397 ++{
16398 ++ if (!bfqq)
16399 ++ bfqq = bfq_get_next_queue(bfqd);
16400 ++ else
16401 ++ bfq_get_next_queue_forced(bfqd, bfqq);
16402 ++
16403 ++ __bfq_set_in_service_queue(bfqd, bfqq);
16404 ++ return bfqq;
16405 ++}
16406 ++
16407 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
16408 ++ struct request *rq)
16409 ++{
16410 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
16411 ++ return blk_rq_pos(rq) - bfqd->last_position;
16412 ++ else
16413 ++ return bfqd->last_position - blk_rq_pos(rq);
16414 ++}
16415 ++
16416 ++/*
16417 ++ * Return true if bfqq has no request pending and rq is close enough to
16418 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
16419 ++ * bfqq->next_rq
16420 ++ */
16421 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
16422 ++{
16423 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
16424 ++}
16425 ++
16426 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
16427 ++{
16428 ++ struct rb_root *root = &bfqd->rq_pos_tree;
16429 ++ struct rb_node *parent, *node;
16430 ++ struct bfq_queue *__bfqq;
16431 ++ sector_t sector = bfqd->last_position;
16432 ++
16433 ++ if (RB_EMPTY_ROOT(root))
16434 ++ return NULL;
16435 ++
16436 ++ /*
16437 ++ * First, if we find a request starting at the end of the last
16438 ++ * request, choose it.
16439 ++ */
16440 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
16441 ++ if (__bfqq != NULL)
16442 ++ return __bfqq;
16443 ++
16444 ++ /*
16445 ++ * If the exact sector wasn't found, the parent of the NULL leaf
16446 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
16447 ++ * position).
16448 ++ */
16449 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
16450 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
16451 ++ return __bfqq;
16452 ++
16453 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
16454 ++ node = rb_next(&__bfqq->pos_node);
16455 ++ else
16456 ++ node = rb_prev(&__bfqq->pos_node);
16457 ++ if (node == NULL)
16458 ++ return NULL;
16459 ++
16460 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
16461 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
16462 ++ return __bfqq;
16463 ++
16464 ++ return NULL;
16465 ++}
16466 ++
16467 ++/*
16468 ++ * bfqd - obvious
16469 ++ * cur_bfqq - passed in so that we don't decide that the current queue
16470 ++ * is closely cooperating with itself.
16471 ++ *
16472 ++ * We are assuming that cur_bfqq has dispatched at least one request,
16473 ++ * and that bfqd->last_position reflects a position on the disk associated
16474 ++ * with the I/O issued by cur_bfqq.
16475 ++ */
16476 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
16477 ++ struct bfq_queue *cur_bfqq)
16478 ++{
16479 ++ struct bfq_queue *bfqq;
16480 ++
16481 ++ if (bfq_class_idle(cur_bfqq))
16482 ++ return NULL;
16483 ++ if (!bfq_bfqq_sync(cur_bfqq))
16484 ++ return NULL;
16485 ++ if (BFQQ_SEEKY(cur_bfqq))
16486 ++ return NULL;
16487 ++
16488 ++ /* If device has only one backlogged bfq_queue, don't search. */
16489 ++ if (bfqd->busy_queues == 1)
16490 ++ return NULL;
16491 ++
16492 ++ /*
16493 ++ * We should notice if some of the queues are cooperating, e.g.
16494 ++ * working closely on the same area of the disk. In that case,
16495 ++ * we can group them together and not waste time idling.
16496 ++ */
16497 ++ bfqq = bfqq_close(bfqd);
16498 ++ if (bfqq == NULL || bfqq == cur_bfqq)
16499 ++ return NULL;
16500 ++
16501 ++ /*
16502 ++ * Do not merge queues from different bfq_groups.
16503 ++ */
16504 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
16505 ++ return NULL;
16506 ++
16507 ++ /*
16508 ++ * It only makes sense to merge sync queues.
16509 ++ */
16510 ++ if (!bfq_bfqq_sync(bfqq))
16511 ++ return NULL;
16512 ++ if (BFQQ_SEEKY(bfqq))
16513 ++ return NULL;
16514 ++
16515 ++ /*
16516 ++ * Do not merge queues of different priority classes.
16517 ++ */
16518 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
16519 ++ return NULL;
16520 ++
16521 ++ return bfqq;
16522 ++}
16523 ++
16524 ++/*
16525 ++ * If enough samples have been computed, return the current max budget
16526 ++ * stored in bfqd, which is dynamically updated according to the
16527 ++ * estimated disk peak rate; otherwise return the default max budget
16528 ++ */
16529 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
16530 ++{
16531 ++ if (bfqd->budgets_assigned < 194)
16532 ++ return bfq_default_max_budget;
16533 ++ else
16534 ++ return bfqd->bfq_max_budget;
16535 ++}
16536 ++
16537 ++/*
16538 ++ * Return min budget, which is a fraction of the current or default
16539 ++ * max budget (trying with 1/32)
16540 ++ */
16541 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
16542 ++{
16543 ++ if (bfqd->budgets_assigned < 194)
16544 ++ return bfq_default_max_budget / 32;
16545 ++ else
16546 ++ return bfqd->bfq_max_budget / 32;
16547 ++}
16548 ++
16549 ++/*
16550 ++ * Decides whether idling should be done for the given device and
16551 ++ * in-service queue.
16552 ++ */
16553 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
16554 ++ struct bfq_queue *in_service_bfqq)
16555 ++{
16556 ++ if (in_service_bfqq == NULL)
16557 ++ return false;
16558 ++ /*
16559 ++ * If the device is non-rotational, and hence has no seek penalty,
16560 ++ * disable idling; but do so only if:
16561 ++ * - device does not support queuing, otherwise we still have
16562 ++ * a problem with sync vs async workloads;
16563 ++ * - the queue is not weight-raised, to preserve guarantees.
16564 ++ */
16565 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
16566 ++ in_service_bfqq->raising_coeff == 1);
16567 ++}
16568 ++
16569 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
16570 ++{
16571 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
16572 ++ struct bfq_io_cq *bic;
16573 ++ unsigned long sl;
16574 ++
16575 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
16576 ++
16577 ++ /* Tasks have exited, don't wait. */
16578 ++ bic = bfqd->in_service_bic;
16579 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
16580 ++ return;
16581 ++
16582 ++ bfq_mark_bfqq_wait_request(bfqq);
16583 ++
16584 ++ /*
16585 ++ * We don't want to idle for seeks, but we do want to allow
16586 ++ * fair distribution of slice time for a process doing back-to-back
16587 ++ * seeks. So allow a little bit of time for it to submit a new rq.
16588 ++ *
16589 ++ * To prevent processes with (partly) seeky workloads from
16590 ++ * being too ill-treated, grant them a small fraction of the
16591 ++ * assigned budget before reducing the waiting time to
16592 ++ * BFQ_MIN_TT. This happened to help reduce latency.
16593 ++ */
16594 ++ sl = bfqd->bfq_slice_idle;
16595 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
16596 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
16597 ++ bfqq->raising_coeff == 1)
16598 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
16599 ++ else if (bfqq->raising_coeff > 1)
16600 ++ sl = sl * 3;
16601 ++ bfqd->last_idling_start = ktime_get();
16602 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
16603 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
16604 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
16605 ++}
16606 ++
16607 ++/*
16608 ++ * Set the maximum time for the in-service queue to consume its
16609 ++ * budget. This prevents seeky processes from lowering the disk
16610 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
16611 ++ */
16612 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
16613 ++{
16614 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
16615 ++ unsigned int timeout_coeff;
16616 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
16617 ++ timeout_coeff = 1;
16618 ++ else
16619 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
16620 ++
16621 ++ bfqd->last_budget_start = ktime_get();
16622 ++
16623 ++ bfq_clear_bfqq_budget_new(bfqq);
16624 ++ bfqq->budget_timeout = jiffies +
16625 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
16626 ++
16627 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
16628 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
16629 ++ timeout_coeff));
16630 ++}
16631 ++
16632 ++/*
16633 ++ * Move request from internal lists to the request queue dispatch list.
16634 ++ */
16635 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
16636 ++{
16637 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
16638 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
16639 ++
16640 ++ bfq_remove_request(rq);
16641 ++ bfqq->dispatched++;
16642 ++ elv_dispatch_sort(q, rq);
16643 ++
16644 ++ if (bfq_bfqq_sync(bfqq))
16645 ++ bfqd->sync_flight++;
16646 ++}
16647 ++
16648 ++/*
16649 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
16650 ++ */
16651 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
16652 ++{
16653 ++ struct request *rq = NULL;
16654 ++
16655 ++ if (bfq_bfqq_fifo_expire(bfqq))
16656 ++ return NULL;
16657 ++
16658 ++ bfq_mark_bfqq_fifo_expire(bfqq);
16659 ++
16660 ++ if (list_empty(&bfqq->fifo))
16661 ++ return NULL;
16662 ++
16663 ++ rq = rq_entry_fifo(bfqq->fifo.next);
16664 ++
16665 ++ if (time_before(jiffies, rq_fifo_time(rq)))
16666 ++ return NULL;
16667 ++
16668 ++ return rq;
16669 ++}
16670 ++
16671 ++/*
16672 ++ * Must be called with the queue_lock held.
16673 ++ */
16674 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
16675 ++{
16676 ++ int process_refs, io_refs;
16677 ++
16678 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
16679 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
16680 ++ BUG_ON(process_refs < 0);
16681 ++ return process_refs;
16682 ++}
16683 ++
16684 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
16685 ++{
16686 ++ int process_refs, new_process_refs;
16687 ++ struct bfq_queue *__bfqq;
16688 ++
16689 ++ /*
16690 ++ * If there are no process references on the new_bfqq, then it is
16691 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
16692 ++ * may have dropped their last reference (not just their last process
16693 ++ * reference).
16694 ++ */
16695 ++ if (!bfqq_process_refs(new_bfqq))
16696 ++ return;
16697 ++
16698 ++ /* Avoid a circular list and skip interim queue merges. */
16699 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
16700 ++ if (__bfqq == bfqq)
16701 ++ return;
16702 ++ new_bfqq = __bfqq;
16703 ++ }
16704 ++
16705 ++ process_refs = bfqq_process_refs(bfqq);
16706 ++ new_process_refs = bfqq_process_refs(new_bfqq);
16707 ++ /*
16708 ++ * If the process for the bfqq has gone away, there is no
16709 ++ * sense in merging the queues.
16710 ++ */
16711 ++ if (process_refs == 0 || new_process_refs == 0)
16712 ++ return;
16713 ++
16714 ++ /*
16715 ++ * Merge in the direction of the lesser amount of work.
16716 ++ */
16717 ++ if (new_process_refs >= process_refs) {
16718 ++ bfqq->new_bfqq = new_bfqq;
16719 ++ atomic_add(process_refs, &new_bfqq->ref);
16720 ++ } else {
16721 ++ new_bfqq->new_bfqq = bfqq;
16722 ++ atomic_add(new_process_refs, &bfqq->ref);
16723 ++ }
16724 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
16725 ++ new_bfqq->pid);
16726 ++}
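++
++/*
++ * For instance (hypothetical reference counts): if bfqq has one
++ * process reference and new_bfqq has three, the code above redirects
++ * bfqq (bfqq->new_bfqq = new_bfqq) and adds bfqq's single process
++ * reference to new_bfqq's refcount, so the process owning bfqq is
++ * later merged into the queue that already represents the larger
++ * amount of work.
++ */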
16727 ++
16728 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
16729 ++{
16730 ++ struct bfq_entity *entity = &bfqq->entity;
16731 ++ return entity->budget - entity->service;
16732 ++}
16733 ++
16734 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
16735 ++{
16736 ++ BUG_ON(bfqq != bfqd->in_service_queue);
16737 ++
16738 ++ __bfq_bfqd_reset_in_service(bfqd);
16739 ++
16740 ++ /*
16741 ++ * If this bfqq is shared between multiple processes, check
16742 ++ * to make sure that those processes are still issuing I/Os
16743 ++ * within the mean seek distance. If not, it may be time to
16744 ++ * break the queues apart again.
16745 ++ */
16746 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
16747 ++ bfq_mark_bfqq_split_coop(bfqq);
16748 ++
16749 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
16750 ++ /*
16751 ++ * Overload the budget_timeout field to store the time
16752 ++ * at which the queue was left with no backlog; it is
16753 ++ * used by the weight-raising mechanism.
16754 ++ */
16755 ++ bfqq->budget_timeout = jiffies;
16756 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
16757 ++ } else {
16758 ++ bfq_activate_bfqq(bfqd, bfqq);
16759 ++ /*
16760 ++ * Resort priority tree of potential close cooperators.
16761 ++ */
16762 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
16763 ++ }
16764 ++}
16765 ++
16766 ++/**
16767 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
16768 ++ * @bfqd: device data.
16769 ++ * @bfqq: queue to update.
16770 ++ * @reason: reason for expiration.
16771 ++ *
16772 ++ * Handle the feedback on @bfqq budget. See the body for detailed
16773 ++ * comments.
16774 ++ */
16775 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
16776 ++ struct bfq_queue *bfqq,
16777 ++ enum bfqq_expiration reason)
16778 ++{
16779 ++ struct request *next_rq;
16780 ++ unsigned long budget, min_budget;
16781 ++
16782 ++ budget = bfqq->max_budget;
16783 ++ min_budget = bfq_min_budget(bfqd);
16784 ++
16785 ++ BUG_ON(bfqq != bfqd->in_service_queue);
16786 ++
16787 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
16788 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
16789 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
16790 ++ budget, bfq_min_budget(bfqd));
16791 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
16792 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
16793 ++
16794 ++ if (bfq_bfqq_sync(bfqq)) {
16795 ++ switch (reason) {
16796 ++ /*
16797 ++ * Caveat: in all the following cases we trade latency
16798 ++ * for throughput.
16799 ++ */
16800 ++ case BFQ_BFQQ_TOO_IDLE:
16801 ++ /*
16802 ++ * This is the only case where we may reduce
16803 ++ * the budget: if there are no requests of the
16804 ++ * process still waiting for completion, then
16805 ++ * we assume (tentatively) that the timer has
16806 ++ * expired because the batch of requests of
16807 ++ * the process could have been served with a
16808 ++ * smaller budget. Hence, betting that
16809 ++ * process will behave in the same way when it
16810 ++ * becomes backlogged again, we reduce its
16811 ++ * next budget. As long as we guess right,
16812 ++ * this budget cut reduces the latency
16813 ++ * experienced by the process.
16814 ++ *
16815 ++ * However, if there are still outstanding
16816 ++ * requests, then the process may have not yet
16817 ++ * issued its next request just because it is
16818 ++ * still waiting for the completion of some of
16819 ++ * the still outstanding ones. So in this
16820 ++ * subcase we do not reduce its budget, on the
16821 ++ * contrary we increase it to possibly boost
16822 ++ * the throughput, as discussed in the
16823 ++ * comments to the BUDGET_TIMEOUT case.
16824 ++ */
16825 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
16826 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
16827 ++ else {
16828 ++ if (budget > 5 * min_budget)
16829 ++ budget -= 4 * min_budget;
16830 ++ else
16831 ++ budget = min_budget;
16832 ++ }
16833 ++ break;
16834 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
16835 ++ /*
16836 ++ * We double the budget here because: 1) it
16837 ++ * gives the chance to boost the throughput if
16838 ++ * this is not a seeky process (which may have
16839 ++ * bumped into this timeout because of, e.g.,
16840 ++ * ZBR), 2) together with charge_full_budget
16841 ++ * it helps give seeky processes higher
16842 ++ * timestamps, and hence be served less
16843 ++ * frequently.
16844 ++ */
16845 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
16846 ++ break;
16847 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
16848 ++ /*
16849 ++ * The process still has backlog, and did not
16850 ++ * let either the budget timeout or the disk
16851 ++ * idling timeout expire. Hence it is not
16852 ++ * seeky, has a short thinktime and may be
16853 ++ * happy with a higher budget too. So
16854 ++ * definitely increase the budget of this good
16855 ++ * candidate to boost the disk throughput.
16856 ++ */
16857 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
16858 ++ break;
16859 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
16860 ++ /*
16861 ++ * Leave the budget unchanged.
16862 ++ */
16863 ++ default:
16864 ++ return;
16865 ++ }
16866 ++ } else /* async queue */
16867 ++ /* async queues always get the maximum possible budget
16868 ++ * (their ability to dispatch is limited by
16869 ++ * @bfqd->bfq_max_budget_async_rq).
16870 ++ */
16871 ++ budget = bfqd->bfq_max_budget;
16872 ++
16873 ++ bfqq->max_budget = budget;
16874 ++
16875 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
16876 ++ bfqq->max_budget > bfqd->bfq_max_budget)
16877 ++ bfqq->max_budget = bfqd->bfq_max_budget;
16878 ++
16879 ++ /*
16880 ++ * Make sure that we have enough budget for the next request.
16881 ++ * Since the finish time of the bfqq must be kept in sync with
16882 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
16883 ++ * update.
16884 ++ */
16885 ++ next_rq = bfqq->next_rq;
16886 ++ if (next_rq != NULL)
16887 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
16888 ++ bfq_serv_to_charge(next_rq, bfqq));
16889 ++ else
16890 ++ bfqq->entity.budget = bfqq->max_budget;
16891 ++
16892 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
16893 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
16894 ++ bfqq->entity.budget);
16895 ++}
16896 ++
16897 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
16898 ++{
16899 ++ unsigned long max_budget;
16900 ++
16901 ++ /*
16902 ++ * The max_budget calculated when autotuning is equal to the
16903 ++ * number of sectors transferred in timeout_sync at the
16904 ++ * estimated peak rate.
16905 ++ */
16906 ++ max_budget = (unsigned long)(peak_rate * 1000 *
16907 ++ timeout >> BFQ_RATE_SHIFT);
16908 ++
16909 ++ return max_budget;
16910 ++}
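++
++/*
++ * Illustrative numbers (assumed, not measured): a peak rate of about
++ * 0.2 sectors/usec (~100 MB/s) is stored as 0.2 << BFQ_RATE_SHIFT,
++ * roughly 13107; with the default sync timeout of HZ / 8 jiffies,
++ * i.e. about 125 ms, the formula above yields
++ * 13107 * 1000 * 125 >> 16, about 25000 sectors (~12 MiB), which is
++ * indeed the amount of data such a disk transfers in 125 ms.
++ */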
16911 ++
16912 ++/*
16913 ++ * In addition to updating the peak rate, checks whether the process
16914 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
16915 ++ * to the budget timeout, to reduce the amount of service provided to
16916 ++ * seeky processes, and hence reduce their chances to lower the
16917 ++ * throughput. See the code for more details.
16918 ++ */
16919 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
16920 ++ int compensate, enum bfqq_expiration reason)
16921 ++{
16922 ++ u64 bw, usecs, expected, timeout;
16923 ++ ktime_t delta;
16924 ++ int update = 0;
16925 ++
16926 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
16927 ++ return 0;
16928 ++
16929 ++ if (compensate)
16930 ++ delta = bfqd->last_idling_start;
16931 ++ else
16932 ++ delta = ktime_get();
16933 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
16934 ++ usecs = ktime_to_us(delta);
16935 ++
16936 ++ /* Don't trust short/unrealistic values. */
16937 ++ if (usecs < 100 || usecs >= LONG_MAX)
16938 ++ return 0;
16939 ++
16940 ++ /*
16941 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
16942 ++ * value to store the peak rate, in sectors per usec in fixed
16943 ++ * point math. We do so to have enough precision in the estimate
16944 ++ * and to avoid overflows.
16945 ++ */
16946 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
16947 ++ do_div(bw, (unsigned long)usecs);
16948 ++
16949 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
16950 ++
16951 ++ /*
16952 ++ * Use only long (> 20ms) intervals to filter out spikes for
16953 ++ * the peak rate estimation.
16954 ++ */
16955 ++ if (usecs > 20000) {
16956 ++ if (bw > bfqd->peak_rate ||
16957 ++ (!BFQQ_SEEKY(bfqq) &&
16958 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
16959 ++ bfq_log(bfqd, "measured bw =%llu", bw);
16960 ++ /*
16961 ++ * To smooth oscillations use a low-pass filter with
16962 ++ * alpha=7/8, i.e.,
16963 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
16964 ++ */
16965 ++ do_div(bw, 8);
16966 ++ if (bw == 0)
16967 ++ return 0;
16968 ++ bfqd->peak_rate *= 7;
16969 ++ do_div(bfqd->peak_rate, 8);
16970 ++ bfqd->peak_rate += bw;
16971 ++ update = 1;
16972 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
16973 ++ }
16974 ++
16975 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
16976 ++
16977 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
16978 ++ bfqd->peak_rate_samples++;
16979 ++
16980 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
16981 ++ update && bfqd->bfq_user_max_budget == 0) {
16982 ++ bfqd->bfq_max_budget =
16983 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
16984 ++ bfq_log(bfqd, "new max_budget=%lu",
16985 ++ bfqd->bfq_max_budget);
16986 ++ }
16987 ++ }
16988 ++
16989 ++ /*
16990 ++ * If the process has been served for too short a time
16991 ++ * interval to let its possible sequential accesses prevail over
16992 ++ * the initial seek time needed to move the disk head to the
16993 ++ * first sector it requested, then give the process a chance
16994 ++ * and for the moment return false.
16995 ++ */
16996 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
16997 ++ return 0;
16998 ++
16999 ++ /*
17000 ++ * A process is considered ``slow'' (i.e., seeky, so that we
17001 ++ * cannot treat it fairly in the service domain, as it would
17002 ++ * slow down the other processes too much) if, when a slice
17003 ++ * ends for whatever reason, it has received service at a
17004 ++ * rate that would not be high enough to complete the budget
17005 ++ * before the budget timeout expiration.
17006 ++ */
17007 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
17008 ++
17009 ++ /*
17010 ++ * Caveat: processes doing IO in the slower disk zones will
17011 ++ * tend to be slow(er) even if not seeky. And the estimated
17012 ++ * peak rate will actually be an average over the disk
17013 ++ * surface. Hence, to not be too harsh with unlucky processes,
17014 ++ * we keep a budget/3 margin of safety before declaring a
17015 ++ * process slow.
17016 ++ */
17017 ++ return expected > (4 * bfqq->entity.budget) / 3;
17018 ++}
17019 ++
17020 ++/*
17021 ++ * To be deemed as soft real-time, an application must meet two requirements.
17022 ++ * The first is that the application must not require an average bandwidth
17023 ++ * higher than the approximate bandwidth required to play back or record a
17024 ++ * compressed high-definition video.
17025 ++ * The next function is invoked on the completion of the last request of a
17026 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
17027 ++ * that, if the next request of the application does not arrive before
17028 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
17029 ++ *
17030 ++ * The second requirement is that the request pattern of the application is
17031 ++ * isochronous, i.e., that, after issuing a request or a batch of requests, the
17032 ++ * application stops for a while, then issues a new batch, and so on. For this
17033 ++ * reason the next function is invoked to compute soft_rt_next_start only for
17034 ++ * applications that meet this requirement, whereas soft_rt_next_start is set
17035 ++ * to infinity for applications that do not.
17036 ++ *
17037 ++ * Unfortunately, even a greedy application may happen to behave in an
17038 ++ * isochronous way if several processes are competing for the CPUs. In fact,
17039 ++ * in this scenario the application stops issuing requests while the CPUs are
17040 ++ * busy serving other processes, then restarts, then stops again for a while,
17041 ++ * and so on. In addition, if the disk achieves a low enough throughput with
17042 ++ * the request pattern issued by the application (e.g., because the request
17043 ++ * pattern is random and/or the device is slow), then the above bandwidth
17044 ++ * requirement may happen to be met too. To prevent such a greedy application
17045 ++ * from being deemed soft real-time, a further rule is used in the computation
17046 ++ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
17047 ++ * time plus the maximum time for which the arrival of a request is waited
17048 ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
17049 ++ * filters out greedy applications, as the latter issue instead their next
17050 ++ * request as soon as possible after the last one has been completed (in
17051 ++ * contrast, when a batch of requests is completed, a soft real-time
17052 ++ * application spends some time processing data).
17053 ++ *
17054 ++ * Actually, the last filter may easily generate false positives if only
17055 ++ * bfqd->bfq_slice_idle is used as a reference time interval and one or
17056 ++ * both of the following two cases occur:
17057 ++ * 1) HZ is so low that the duration of a jiffie is comparable to or higher
17058 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
17059 ++ * HZ=100.
17060 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
17061 ++ * for a while, then suddenly 'jump' by several units to recover the lost
17062 ++ * increments. This seems to happen, e.g., inside virtual machines.
17063 ++ * To address this issue, we do not use as a reference time interval just
17064 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
17065 ++ * particular we add the minimum number of jiffies for which the filter seems
17066 ++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
17067 ++ */
17068 ++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
17069 ++ struct bfq_queue *bfqq)
17070 ++{
17071 ++ return max(bfqq->last_idle_bklogged +
17072 ++ HZ * bfqq->service_from_backlogged /
17073 ++ bfqd->bfq_raising_max_softrt_rate,
17074 ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
17075 ++}
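++
++/*
++ * A rough worked example of the rule above (purely illustrative values,
++ * not taken from this patch): with HZ=250, bfq_raising_max_softrt_rate set
++ * to 7000 sectors/sec and 14000 sectors of service received since the
++ * queue last became idle and backlogged, the first term evaluates to
++ *
++ *   last_idle_bklogged + 250 * 14000 / 7000 = last_idle_bklogged + 500
++ *
++ * jiffies, i.e., two seconds after the start of the batch; the max() with
++ * jiffies + bfq_slice_idle + 4 then enforces the greedy-application filter
++ * described in the comment above.
++ */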
17076 ++
17077 ++/*
17078 ++ * Largest-possible time instant such that, for as long as possible, the
17079 ++ * current time will be lower than this time instant according to the macro
17080 ++ * time_is_before_jiffies().
17081 ++ */
17082 ++static inline unsigned long bfq_infinity_from_now(unsigned long now)
17083 ++{
17084 ++ return now + ULONG_MAX / 2;
17085 ++}
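++
++/*
++ * Note: time_is_before_jiffies() relies on wrap-safe signed comparisons,
++ * which order two instants correctly only while they are less than
++ * ULONG_MAX/2 ticks apart. Hence now + ULONG_MAX/2 is, roughly, the
++ * farthest point in the future that the macro can still report as not
++ * yet reached.
++ */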
17086 ++
17087 ++/**
17088 ++ * bfq_bfqq_expire - expire a queue.
17089 ++ * @bfqd: device owning the queue.
17090 ++ * @bfqq: the queue to expire.
17091 ++ * @compensate: if true, compensate for the time spent idling.
17092 ++ * @reason: the reason causing the expiration.
17093 ++ *
17094 ++ *
17095 ++ * If the process associated with the queue is slow (i.e., seeky), or in
17096 ++ * case of budget timeout, or, finally, if it is async, we
17097 ++ * artificially charge it an entire budget (independently of the
17098 ++ * actual service it received). As a consequence, the queue will get
17099 ++ * higher timestamps than the correct ones upon reactivation, and
17100 ++ * hence it will be rescheduled as if it had received more service
17101 ++ * than what it actually received. In the end, this class of processes
17102 ++ * will receive less service in proportion to how slowly they consume
17103 ++ * their budgets (and hence how seriously they tend to lower the
17104 ++ * throughput).
17105 ++ *
17106 ++ * In contrast, when a queue expires because it has been idling for
17107 ++ * too long or because it exhausted its budget, we do not touch the
17108 ++ * amount of service it has received. Hence, when the queue is
17109 ++ * reactivated and its timestamps updated, the latter will be in sync
17110 ++ * with the actual service received by the queue until expiration.
17111 ++ *
17112 ++ * Charging a full budget to the first type of queues and the exact
17113 ++ * service to the others has the effect of using the WF2Q+ policy to
17114 ++ * schedule the former on a timeslice basis, without violating the
17115 ++ * service domain guarantees of the latter.
17116 ++ */
17117 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
17118 ++ struct bfq_queue *bfqq,
17119 ++ int compensate,
17120 ++ enum bfqq_expiration reason)
17121 ++{
17122 ++ int slow;
17123 ++ BUG_ON(bfqq != bfqd->in_service_queue);
17124 ++
17125 ++ /* Update disk peak rate for autotuning and check whether the
17126 ++ * process is slow (see bfq_update_peak_rate).
17127 ++ */
17128 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
17129 ++
17130 ++ /*
17131 ++	 * As explained above, 'punish' slow (i.e., seeky), timed-out
17132 ++ * and async queues, to favor sequential sync workloads.
17133 ++ *
17134 ++ * Processes doing IO in the slower disk zones will tend to be
17135 ++ * slow(er) even if not seeky. Hence, since the estimated peak
17136 ++ * rate is actually an average over the disk surface, these
17137 ++	 * processes may time out just through bad luck. To avoid punishing
17138 ++ * them we do not charge a full budget to a process that
17139 ++ * succeeded in consuming at least 2/3 of its budget.
17140 ++ */
17141 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
17142 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
17143 ++ bfq_bfqq_charge_full_budget(bfqq);
17144 ++
17145 ++ bfqq->service_from_backlogged += bfqq->entity.service;
17146 ++
17147 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
17148 ++ bfqq->last_rais_start_finish = jiffies;
17149 ++
17150 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
17151 ++ RB_EMPTY_ROOT(&bfqq->sort_list)) {
17152 ++ /*
17153 ++ * If we get here, then the request pattern is
17154 ++ * isochronous (see the comments to the function
17155 ++ * bfq_bfqq_softrt_next_start()). However, if the
17156 ++ * queue still has in-flight requests, then it is
17157 ++ * better to postpone the computation of next_start
17158 ++ * to the next request completion. In fact, if we
17159 ++ * computed it now, then the application might pass
17160 ++ * the greedy-application filter improperly, because
17161 ++ * the arrival of its next request may happen to be
17162 ++ * higher than (jiffies + bfqq->bfqd->bfq_slice_idle)
17163 ++ * not because the application is truly soft real-
17164 ++ * time, but just because the application is currently
17165 ++ * waiting for the completion of some request before
17166 ++ * issuing, as quickly as possible, its next request.
17167 ++ */
17168 ++ if (bfqq->dispatched > 0) {
17169 ++ /*
17170 ++ * The application is still waiting for the
17171 ++ * completion of one or more requests:
17172 ++ * prevent it from possibly being incorrectly
17173 ++ * deemed as soft real-time by setting its
17174 ++ * soft_rt_next_start to infinity. In fact,
17175 ++ * without this assignment, the application
17176 ++ * would be incorrectly deemed as soft
17177 ++ * real-time if:
17178 ++ * 1) it issued a new request before the
17179 ++ * completion of all its in-flight
17180 ++ * requests, and
17181 ++ * 2) at that time, its soft_rt_next_start
17182 ++ * happened to be in the past.
17183 ++ */
17184 ++ bfqq->soft_rt_next_start =
17185 ++ bfq_infinity_from_now(jiffies);
17186 ++ bfq_mark_bfqq_softrt_update(bfqq);
17187 ++ } else
17188 ++ bfqq->soft_rt_next_start =
17189 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
17190 ++ }
17191 ++
17192 ++ bfq_log_bfqq(bfqd, bfqq,
17193 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
17194 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
17195 ++
17196 ++ /* Increase, decrease or leave budget unchanged according to reason */
17197 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
17198 ++ __bfq_bfqq_expire(bfqd, bfqq);
17199 ++}
17200 ++
17201 ++/*
17202 ++ * Budget timeout is not implemented through a dedicated timer, but
17203 ++ * just checked on request arrivals and completions, as well as on
17204 ++ * idle timer expirations.
17205 ++ */
17206 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
17207 ++{
17208 ++ if (bfq_bfqq_budget_new(bfqq))
17209 ++ return 0;
17210 ++
17211 ++ if (time_before(jiffies, bfqq->budget_timeout))
17212 ++ return 0;
17213 ++
17214 ++ return 1;
17215 ++}
17216 ++
17217 ++/*
17218 ++ * If we expire a queue that is waiting for the arrival of a new
17219 ++ * request, we may prevent the fictitious timestamp backshifting that
17220 ++ * allows the guarantees of the queue to be preserved (see [1] for
17221 ++ * this tricky aspect). Hence we return true only if this condition
17222 ++ * does not hold, or if the queue is slow enough to deserve only to be
17223 ++ * kicked off to preserve a high throughput.
17224 ++ */
17225 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
17226 ++{
17227 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
17228 ++ "may_budget_timeout: wr %d left %d timeout %d",
17229 ++ bfq_bfqq_wait_request(bfqq),
17230 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
17231 ++ bfq_bfqq_budget_timeout(bfqq));
17232 ++
17233 ++ return (!bfq_bfqq_wait_request(bfqq) ||
17234 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
17235 ++ &&
17236 ++ bfq_bfqq_budget_timeout(bfqq);
17237 ++}
17238 ++
17239 ++/*
17240 ++ * For weight-raised queues issuing sync requests, idling is always performed,
17241 ++ * as this is instrumental in guaranteeing a high fraction of the throughput
17242 ++ * to these queues, and hence in guaranteeing a lower latency for their
17243 ++ * requests. See [1] for details.
17244 ++ *
17245 ++ * For non-weight-raised queues, idling is instead disabled if the device is
17246 ++ * NCQ-enabled and non-rotational, as this boosts the throughput on such
17247 ++ * devices.
17248 ++ */
17249 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
17250 ++{
17251 ++ struct bfq_data *bfqd = bfqq->bfqd;
17252 ++
17253 ++ return bfq_bfqq_sync(bfqq) && (
17254 ++ bfqq->raising_coeff > 1 ||
17255 ++ (bfq_bfqq_idle_window(bfqq) &&
17256 ++ !(bfqd->hw_tag &&
17257 ++ (blk_queue_nonrot(bfqd->queue) ||
17258 ++ /*
17259 ++ * If there are weight-raised busy queues, then do not idle
17260 ++ * the disk for a sync non-weight-raised queue, and hence
17261 ++ * expire the queue immediately if empty. Combined with the
17262 ++ * timestamping rules of BFQ (see [1] for details), this
17263 ++ * causes sync non-weight-raised queues to get a lower
17264 ++ * fraction of the disk throughput, and hence reduces the rate
17265 ++ * at which the processes associated to these queues ask for
17266 ++ * requests from the request pool.
17267 ++ *
17268 ++ * This is beneficial for weight-raised processes, when the
17269 ++ * system operates in request-pool saturation conditions
17270 ++ * (e.g., in the presence of write hogs). In fact, if
17271 ++ * non-weight-raised processes ask for requests at a lower
17272 ++ * rate, then weight-raised processes have a higher
17273 ++ * probability to get a request from the pool immediately
17274 ++ * (or at least soon) when they need one. Hence they have a
17275 ++ * higher probability to actually get a fraction of the disk
17276 ++ * throughput proportional to their high weight. This is
17277 ++ * especially true with NCQ-enabled drives, which enqueue
17278 ++ * several requests in advance and further reorder
17279 ++ * internally-queued requests.
17280 ++ *
17281 ++ * Mistreating non-weight-raised queues in the above-described
17282 ++ * way, when there are busy weight-raised queues, seems to
17283 ++ * mitigate starvation problems in the presence of heavy write
17284 ++ * workloads and NCQ, and hence to guarantee a higher
17285 ++ * application and system responsiveness in these hostile
17286 ++ * scenarios.
17287 ++ */
17288 ++ bfqd->raised_busy_queues > 0)
17289 ++ )
17290 ++ )
17291 ++ );
17292 ++}
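++
++/*
++ * In short, the return value above can be read (same logic as the code,
++ * just condensed) as:
++ *
++ *   sync && (weight-raised ||
++ *            (idle_window && !(NCQ && (non-rotational || raised busy))))
++ *
++ * i.e., idling is always kept for weight-raised sync queues, and for the
++ * other sync queues only when it is expected to pay off.
++ */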
17293 ++
17294 ++/*
17295 ++ * If the in-service queue is empty, but it is sync and either of the following
17296 ++ * conditions holds, then: 1) the queue must remain in service and cannot be
17297 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
17298 ++ * of a new request for the queue. The conditions are:
17299 ++ * - the device is rotational and not performing NCQ, and the queue has its
17300 ++ * idle window set (in this case, waiting for a new request for the queue
17301 ++ * is likely to boost the disk throughput);
17302 ++ * - the queue is weight-raised (waiting for the request is necessary to
17303 ++ * provide the queue with fairness and latency guarantees, see [1] for
17304 ++ * details).
17305 ++ */
17306 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
17307 ++{
17308 ++ struct bfq_data *bfqd = bfqq->bfqd;
17309 ++
17310 ++ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
17311 ++ bfq_bfqq_must_not_expire(bfqq) &&
17312 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
17313 ++}
17314 ++
17315 ++/*
17316 ++ * Select a queue for service. If we have a current queue in service,
17317 ++ * check whether to continue servicing it, or retrieve and set a new one.
17318 ++ */
17319 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
17320 ++{
17321 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
17322 ++ struct request *next_rq;
17323 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
17324 ++
17325 ++ bfqq = bfqd->in_service_queue;
17326 ++ if (bfqq == NULL)
17327 ++ goto new_queue;
17328 ++
17329 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
17330 ++
17331 ++ /*
17332 ++ * If another queue has a request waiting within our mean seek
17333 ++ * distance, let it run. The expire code will check for close
17334 ++ * cooperators and put the close queue at the front of the
17335 ++ * service tree. If possible, merge the expiring queue with the
17336 ++ * new bfqq.
17337 ++ */
17338 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
17339 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
17340 ++ bfq_setup_merge(bfqq, new_bfqq);
17341 ++
17342 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
17343 ++ !timer_pending(&bfqd->idle_slice_timer) &&
17344 ++ !bfq_bfqq_must_idle(bfqq))
17345 ++ goto expire;
17346 ++
17347 ++ next_rq = bfqq->next_rq;
17348 ++ /*
17349 ++ * If bfqq has requests queued and it has enough budget left to
17350 ++ * serve them, keep the queue, otherwise expire it.
17351 ++ */
17352 ++ if (next_rq != NULL) {
17353 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
17354 ++ bfq_bfqq_budget_left(bfqq)) {
17355 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
17356 ++ goto expire;
17357 ++ } else {
17358 ++ /*
17359 ++ * The idle timer may be pending because we may not
17360 ++ * disable disk idling even when a new request arrives
17361 ++ */
17362 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
17363 ++ /*
17364 ++ * If we get here: 1) at least a new request
17365 ++ * has arrived but we have not disabled the
17366 ++ * timer because the request was too small,
17367 ++ * 2) then the block layer has unplugged the
17368 ++ * device, causing the dispatch to be invoked.
17369 ++ *
17370 ++ * Since the device is unplugged, now the
17371 ++ * requests are probably large enough to
17372 ++ * provide a reasonable throughput.
17373 ++ * So we disable idling.
17374 ++ */
17375 ++ bfq_clear_bfqq_wait_request(bfqq);
17376 ++ del_timer(&bfqd->idle_slice_timer);
17377 ++ }
17378 ++ if (new_bfqq == NULL)
17379 ++ goto keep_queue;
17380 ++ else
17381 ++ goto expire;
17382 ++ }
17383 ++ }
17384 ++
17385 ++ /*
17386 ++ * No requests pending. If the in-service queue has no cooperator and
17387 ++ * still has requests in flight (possibly waiting for a completion)
17388 ++ * or is idling for a new request, then keep it.
17389 ++ */
17390 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
17391 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
17392 ++ bfqq = NULL;
17393 ++ goto keep_queue;
17394 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
17395 ++ /*
17396 ++ * Expiring the queue because there is a close cooperator,
17397 ++ * cancel timer.
17398 ++ */
17399 ++ bfq_clear_bfqq_wait_request(bfqq);
17400 ++ del_timer(&bfqd->idle_slice_timer);
17401 ++ }
17402 ++
17403 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
17404 ++expire:
17405 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
17406 ++new_queue:
17407 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
17408 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
17409 ++ bfqq != NULL ? bfqq->pid : 0);
17410 ++keep_queue:
17411 ++ return bfqq;
17412 ++}
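++
++/*
++ * Summarizing the flow above (sketch): the in-service queue is kept if it
++ * has enough budget for its next request or is legitimately idling; it is
++ * expired on budget timeout, budget exhaustion or lack of further
++ * requests; and, when a close cooperator has been found, the current
++ * queue is expired so that the two can be merged and served together.
++ */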
17413 ++
17414 ++static void bfq_update_raising_data(struct bfq_data *bfqd,
17415 ++ struct bfq_queue *bfqq)
17416 ++{
17417 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
17418 ++ struct bfq_entity *entity = &bfqq->entity;
17419 ++
17420 ++ bfq_log_bfqq(bfqd, bfqq,
17421 ++ "raising period dur %u/%u msec, "
17422 ++ "old raising coeff %u, w %d(%d)",
17423 ++ jiffies_to_msecs(jiffies -
17424 ++ bfqq->last_rais_start_finish),
17425 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
17426 ++ bfqq->raising_coeff,
17427 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
17428 ++
17429 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
17430 ++ entity->orig_weight * bfqq->raising_coeff);
17431 ++ if (entity->ioprio_changed)
17432 ++ bfq_log_bfqq(bfqd, bfqq,
17433 ++ "WARN: pending prio change");
17434 ++ /*
17435 ++ * If too much time has elapsed from the beginning
17436 ++ * of this weight-raising, stop it.
17437 ++ */
17438 ++ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
17439 ++ bfqq->raising_cur_max_time)) {
17440 ++ bfqq->last_rais_start_finish = jiffies;
17441 ++ bfq_log_bfqq(bfqd, bfqq,
17442 ++ "wrais ending at %lu, "
17443 ++ "rais_max_time %u",
17444 ++ bfqq->last_rais_start_finish,
17445 ++ jiffies_to_msecs(bfqq->
17446 ++ raising_cur_max_time));
17447 ++ bfq_bfqq_end_raising(bfqq);
17448 ++ __bfq_entity_update_weight_prio(
17449 ++ bfq_entity_service_tree(entity),
17450 ++ entity);
17451 ++ }
17452 ++ }
17453 ++}
17454 ++
17455 ++/*
17456 ++ * Dispatch one request from bfqq, moving it to the request queue
17457 ++ * dispatch list.
17458 ++ */
17459 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
17460 ++ struct bfq_queue *bfqq)
17461 ++{
17462 ++ int dispatched = 0;
17463 ++ struct request *rq;
17464 ++ unsigned long service_to_charge;
17465 ++
17466 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
17467 ++
17468 ++ /* Follow expired path, else get first next available. */
17469 ++ rq = bfq_check_fifo(bfqq);
17470 ++ if (rq == NULL)
17471 ++ rq = bfqq->next_rq;
17472 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
17473 ++
17474 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
17475 ++ /*
17476 ++ * This may happen if the next rq is chosen
17477 ++ * in fifo order instead of sector order.
17478 ++ * The budget is properly dimensioned
17479 ++ * to be always sufficient to serve the next request
17480 ++ * only if it is chosen in sector order. The reason is
17481 ++		 * that it would be quite inefficient and of little use
17482 ++ * to always make sure that the budget is large enough
17483 ++ * to serve even the possible next rq in fifo order.
17484 ++ * In fact, requests are seldom served in fifo order.
17485 ++ *
17486 ++ * Expire the queue for budget exhaustion, and
17487 ++ * make sure that the next act_budget is enough
17488 ++ * to serve the next request, even if it comes
17489 ++ * from the fifo expired path.
17490 ++ */
17491 ++ bfqq->next_rq = rq;
17492 ++ /*
17493 ++		 * Since this dispatch failed, make sure that
17494 ++ * a new one will be performed
17495 ++ */
17496 ++ if (!bfqd->rq_in_driver)
17497 ++ bfq_schedule_dispatch(bfqd);
17498 ++ goto expire;
17499 ++ }
17500 ++
17501 ++ /* Finally, insert request into driver dispatch list. */
17502 ++ bfq_bfqq_served(bfqq, service_to_charge);
17503 ++ bfq_dispatch_insert(bfqd->queue, rq);
17504 ++
17505 ++ bfq_update_raising_data(bfqd, bfqq);
17506 ++
17507 ++ bfq_log_bfqq(bfqd, bfqq,
17508 ++ "dispatched %u sec req (%llu), budg left %lu",
17509 ++ blk_rq_sectors(rq),
17510 ++ (long long unsigned)blk_rq_pos(rq),
17511 ++ bfq_bfqq_budget_left(bfqq));
17512 ++
17513 ++ dispatched++;
17514 ++
17515 ++ if (bfqd->in_service_bic == NULL) {
17516 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
17517 ++ bfqd->in_service_bic = RQ_BIC(rq);
17518 ++ }
17519 ++
17520 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
17521 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
17522 ++ bfq_class_idle(bfqq)))
17523 ++ goto expire;
17524 ++
17525 ++ return dispatched;
17526 ++
17527 ++expire:
17528 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
17529 ++ return dispatched;
17530 ++}
17531 ++
17532 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
17533 ++{
17534 ++ int dispatched = 0;
17535 ++
17536 ++ while (bfqq->next_rq != NULL) {
17537 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
17538 ++ dispatched++;
17539 ++ }
17540 ++
17541 ++ BUG_ON(!list_empty(&bfqq->fifo));
17542 ++ return dispatched;
17543 ++}
17544 ++
17545 ++/*
17546 ++ * Drain our current requests. Used for barriers and when switching
17547 ++ * io schedulers on-the-fly.
17548 ++ */
17549 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
17550 ++{
17551 ++ struct bfq_queue *bfqq, *n;
17552 ++ struct bfq_service_tree *st;
17553 ++ int dispatched = 0;
17554 ++
17555 ++ bfqq = bfqd->in_service_queue;
17556 ++ if (bfqq != NULL)
17557 ++ __bfq_bfqq_expire(bfqd, bfqq);
17558 ++
17559 ++ /*
17560 ++ * Loop through classes, and be careful to leave the scheduler
17561 ++ * in a consistent state, as feedback mechanisms and vtime
17562 ++ * updates cannot be disabled during the process.
17563 ++ */
17564 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
17565 ++ st = bfq_entity_service_tree(&bfqq->entity);
17566 ++
17567 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
17568 ++ bfqq->max_budget = bfq_max_budget(bfqd);
17569 ++
17570 ++ bfq_forget_idle(st);
17571 ++ }
17572 ++
17573 ++ BUG_ON(bfqd->busy_queues != 0);
17574 ++
17575 ++ return dispatched;
17576 ++}
17577 ++
17578 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
17579 ++{
17580 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
17581 ++ struct bfq_queue *bfqq;
17582 ++ int max_dispatch;
17583 ++
17584 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
17585 ++ if (bfqd->busy_queues == 0)
17586 ++ return 0;
17587 ++
17588 ++ if (unlikely(force))
17589 ++ return bfq_forced_dispatch(bfqd);
17590 ++
17591 ++ bfqq = bfq_select_queue(bfqd);
17592 ++ if (bfqq == NULL)
17593 ++ return 0;
17594 ++
17595 ++ max_dispatch = bfqd->bfq_quantum;
17596 ++ if (bfq_class_idle(bfqq))
17597 ++ max_dispatch = 1;
17598 ++
17599 ++ if (!bfq_bfqq_sync(bfqq))
17600 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
17601 ++
17602 ++ if (bfqq->dispatched >= max_dispatch) {
17603 ++ if (bfqd->busy_queues > 1)
17604 ++ return 0;
17605 ++ if (bfqq->dispatched >= 4 * max_dispatch)
17606 ++ return 0;
17607 ++ }
17608 ++
17609 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
17610 ++ return 0;
17611 ++
17612 ++ bfq_clear_bfqq_wait_request(bfqq);
17613 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
17614 ++
17615 ++ if (!bfq_dispatch_request(bfqd, bfqq))
17616 ++ return 0;
17617 ++
17618 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
17619 ++ bfqq->pid, max_dispatch);
17620 ++
17621 ++ return 1;
17622 ++}
17623 ++
17624 ++/*
17625 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
17626 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
17627 ++ *
17628 ++ * Queue lock must be held here.
17629 ++ */
17630 ++static void bfq_put_queue(struct bfq_queue *bfqq)
17631 ++{
17632 ++ struct bfq_data *bfqd = bfqq->bfqd;
17633 ++
17634 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
17635 ++
17636 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
17637 ++ atomic_read(&bfqq->ref));
17638 ++ if (!atomic_dec_and_test(&bfqq->ref))
17639 ++ return;
17640 ++
17641 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
17642 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
17643 ++ BUG_ON(bfqq->entity.tree != NULL);
17644 ++ BUG_ON(bfq_bfqq_busy(bfqq));
17645 ++ BUG_ON(bfqd->in_service_queue == bfqq);
17646 ++
17647 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
17648 ++
17649 ++ kmem_cache_free(bfq_pool, bfqq);
17650 ++}
17651 ++
17652 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
17653 ++{
17654 ++ struct bfq_queue *__bfqq, *next;
17655 ++
17656 ++ /*
17657 ++ * If this queue was scheduled to merge with another queue, be
17658 ++ * sure to drop the reference taken on that queue (and others in
17659 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
17660 ++ */
17661 ++ __bfqq = bfqq->new_bfqq;
17662 ++ while (__bfqq) {
17663 ++ if (__bfqq == bfqq) {
17664 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
17665 ++ break;
17666 ++ }
17667 ++ next = __bfqq->new_bfqq;
17668 ++ bfq_put_queue(__bfqq);
17669 ++ __bfqq = next;
17670 ++ }
17671 ++}
17672 ++
17673 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
17674 ++{
17675 ++ if (bfqq == bfqd->in_service_queue) {
17676 ++ __bfq_bfqq_expire(bfqd, bfqq);
17677 ++ bfq_schedule_dispatch(bfqd);
17678 ++ }
17679 ++
17680 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
17681 ++ atomic_read(&bfqq->ref));
17682 ++
17683 ++ bfq_put_cooperator(bfqq);
17684 ++
17685 ++ bfq_put_queue(bfqq);
17686 ++}
17687 ++
17688 ++static void bfq_init_icq(struct io_cq *icq)
17689 ++{
17690 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
17691 ++
17692 ++ bic->ttime.last_end_request = jiffies;
17693 ++}
17694 ++
17695 ++static void bfq_exit_icq(struct io_cq *icq)
17696 ++{
17697 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
17698 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
17699 ++
17700 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
17701 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
17702 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
17703 ++ }
17704 ++
17705 ++ if (bic->bfqq[BLK_RW_SYNC]) {
17706 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
17707 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
17708 ++ }
17709 ++}
17710 ++
17711 ++/*
17712 ++ * Update the entity prio values; note that the new values will not
17713 ++ * be used until the next (re)activation.
17714 ++ */
17715 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
17716 ++{
17717 ++ struct task_struct *tsk = current;
17718 ++ int ioprio_class;
17719 ++
17720 ++ if (!bfq_bfqq_prio_changed(bfqq))
17721 ++ return;
17722 ++
17723 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
17724 ++ switch (ioprio_class) {
17725 ++ default:
17726 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
17727 ++ "bfq: bad prio %x\n", ioprio_class);
17728 ++ case IOPRIO_CLASS_NONE:
17729 ++ /*
17730 ++ * No prio set, inherit CPU scheduling settings.
17731 ++ */
17732 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
17733 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
17734 ++ break;
17735 ++ case IOPRIO_CLASS_RT:
17736 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
17737 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
17738 ++ break;
17739 ++ case IOPRIO_CLASS_BE:
17740 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
17741 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
17742 ++ break;
17743 ++ case IOPRIO_CLASS_IDLE:
17744 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
17745 ++ bfqq->entity.new_ioprio = 7;
17746 ++ bfq_clear_bfqq_idle_window(bfqq);
17747 ++ break;
17748 ++ }
17749 ++
17750 ++ bfqq->entity.ioprio_changed = 1;
17751 ++
17752 ++ /*
17753 ++ * Keep track of original prio settings in case we have to temporarily
17754 ++ * elevate the priority of this queue.
17755 ++ */
17756 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
17757 ++ bfq_clear_bfqq_prio_changed(bfqq);
17758 ++}
17759 ++
17760 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
17761 ++{
17762 ++ struct bfq_data *bfqd;
17763 ++ struct bfq_queue *bfqq, *new_bfqq;
17764 ++ struct bfq_group *bfqg;
17765 ++ unsigned long uninitialized_var(flags);
17766 ++ int ioprio = bic->icq.ioc->ioprio;
17767 ++
17768 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
17769 ++ &flags);
17770 ++ /*
17771 ++ * This condition may trigger on a newly created bic, be sure to drop
17772 ++ * the lock before returning.
17773 ++ */
17774 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
17775 ++ goto out;
17776 ++
17777 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
17778 ++ if (bfqq != NULL) {
17779 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
17780 ++ sched_data);
17781 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
17782 ++ GFP_ATOMIC);
17783 ++ if (new_bfqq != NULL) {
17784 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
17785 ++ bfq_log_bfqq(bfqd, bfqq,
17786 ++ "changed_ioprio: bfqq %p %d",
17787 ++ bfqq, atomic_read(&bfqq->ref));
17788 ++ bfq_put_queue(bfqq);
17789 ++ }
17790 ++ }
17791 ++
17792 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
17793 ++ if (bfqq != NULL)
17794 ++ bfq_mark_bfqq_prio_changed(bfqq);
17795 ++
17796 ++ bic->ioprio = ioprio;
17797 ++
17798 ++out:
17799 ++ bfq_put_bfqd_unlock(bfqd, &flags);
17800 ++}
17801 ++
17802 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
17803 ++ pid_t pid, int is_sync)
17804 ++{
17805 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
17806 ++ INIT_LIST_HEAD(&bfqq->fifo);
17807 ++
17808 ++ atomic_set(&bfqq->ref, 0);
17809 ++ bfqq->bfqd = bfqd;
17810 ++
17811 ++ bfq_mark_bfqq_prio_changed(bfqq);
17812 ++
17813 ++ if (is_sync) {
17814 ++ if (!bfq_class_idle(bfqq))
17815 ++ bfq_mark_bfqq_idle_window(bfqq);
17816 ++ bfq_mark_bfqq_sync(bfqq);
17817 ++ }
17818 ++
17819 ++	/* Tentative initial value to trade off between throughput and latency */
17820 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
17821 ++ bfqq->pid = pid;
17822 ++
17823 ++ bfqq->raising_coeff = 1;
17824 ++ bfqq->last_rais_start_finish = 0;
17825 ++ /*
17826 ++ * Set to the value for which bfqq will not be deemed as
17827 ++ * soft rt when it becomes backlogged.
17828 ++ */
17829 ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
17830 ++}
17831 ++
17832 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
17833 ++ struct bfq_group *bfqg,
17834 ++ int is_sync,
17835 ++ struct bfq_io_cq *bic,
17836 ++ gfp_t gfp_mask)
17837 ++{
17838 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
17839 ++
17840 ++retry:
17841 ++ /* bic always exists here */
17842 ++ bfqq = bic_to_bfqq(bic, is_sync);
17843 ++
17844 ++ /*
17845 ++ * Always try a new alloc if we fall back to the OOM bfqq
17846 ++ * originally, since it should just be a temporary situation.
17847 ++ */
17848 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
17849 ++ bfqq = NULL;
17850 ++ if (new_bfqq != NULL) {
17851 ++ bfqq = new_bfqq;
17852 ++ new_bfqq = NULL;
17853 ++ } else if (gfp_mask & __GFP_WAIT) {
17854 ++ spin_unlock_irq(bfqd->queue->queue_lock);
17855 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
17856 ++ gfp_mask | __GFP_ZERO,
17857 ++ bfqd->queue->node);
17858 ++ spin_lock_irq(bfqd->queue->queue_lock);
17859 ++ if (new_bfqq != NULL)
17860 ++ goto retry;
17861 ++ } else {
17862 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
17863 ++ gfp_mask | __GFP_ZERO,
17864 ++ bfqd->queue->node);
17865 ++ }
17866 ++
17867 ++ if (bfqq != NULL) {
17868 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
17869 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
17870 ++ } else {
17871 ++ bfqq = &bfqd->oom_bfqq;
17872 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
17873 ++ }
17874 ++
17875 ++ bfq_init_prio_data(bfqq, bic);
17876 ++ bfq_init_entity(&bfqq->entity, bfqg);
17877 ++ }
17878 ++
17879 ++ if (new_bfqq != NULL)
17880 ++ kmem_cache_free(bfq_pool, new_bfqq);
17881 ++
17882 ++ return bfqq;
17883 ++}
17884 ++
17885 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
17886 ++ struct bfq_group *bfqg,
17887 ++ int ioprio_class, int ioprio)
17888 ++{
17889 ++ switch (ioprio_class) {
17890 ++ case IOPRIO_CLASS_RT:
17891 ++ return &bfqg->async_bfqq[0][ioprio];
17892 ++ case IOPRIO_CLASS_NONE:
17893 ++ ioprio = IOPRIO_NORM;
17894 ++ /* fall through */
17895 ++ case IOPRIO_CLASS_BE:
17896 ++ return &bfqg->async_bfqq[1][ioprio];
17897 ++ case IOPRIO_CLASS_IDLE:
17898 ++ return &bfqg->async_idle_bfqq;
17899 ++ default:
17900 ++ BUG();
17901 ++ }
17902 ++}
17903 ++
17904 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
17905 ++ struct bfq_group *bfqg, int is_sync,
17906 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
17907 ++{
17908 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
17909 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
17910 ++ struct bfq_queue **async_bfqq = NULL;
17911 ++ struct bfq_queue *bfqq = NULL;
17912 ++
17913 ++ if (!is_sync) {
17914 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
17915 ++ ioprio);
17916 ++ bfqq = *async_bfqq;
17917 ++ }
17918 ++
17919 ++ if (bfqq == NULL)
17920 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
17921 ++
17922 ++ /*
17923 ++ * Pin the queue now that it's allocated, scheduler exit will prune it.
17924 ++ */
17925 ++ if (!is_sync && *async_bfqq == NULL) {
17926 ++ atomic_inc(&bfqq->ref);
17927 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
17928 ++ bfqq, atomic_read(&bfqq->ref));
17929 ++ *async_bfqq = bfqq;
17930 ++ }
17931 ++
17932 ++ atomic_inc(&bfqq->ref);
17933 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
17934 ++ atomic_read(&bfqq->ref));
17935 ++ return bfqq;
17936 ++}
17937 ++
17938 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
17939 ++ struct bfq_io_cq *bic)
17940 ++{
17941 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
17942 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
17943 ++
17944 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
17945 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
17946 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
17947 ++ bic->ttime.ttime_samples;
17948 ++}
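++
++/*
++ * The two updates above form a fixed-point exponentially weighted moving
++ * average: each new sample is scaled by 256, the old value keeps weight
++ * 7/8 and the new sample gets weight 1/8, and the +128 roughly rounds the
++ * final division to the nearest integer. For example (illustrative
++ * numbers), with ttime_total = 25600 and ttime_samples = 256 the mean
++ * think time is (25600 + 128) / 256 = 100 jiffies.
++ */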
17949 ++
17950 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
17951 ++ struct bfq_queue *bfqq,
17952 ++ struct request *rq)
17953 ++{
17954 ++ sector_t sdist;
17955 ++ u64 total;
17956 ++
17957 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
17958 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
17959 ++ else
17960 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
17961 ++
17962 ++ /*
17963 ++ * Don't allow the seek distance to get too large from the
17964 ++ * odd fragment, pagein, etc.
17965 ++ */
17966 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
17967 ++ sdist = 0;
17968 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
17969 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
17970 ++ else
17971 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
17972 ++
17973 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
17974 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
17975 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
17976 ++ do_div(total, bfqq->seek_samples);
17977 ++ bfqq->seek_mean = (sector_t)total;
17978 ++
17979 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
17980 ++ (u64)bfqq->seek_mean);
17981 ++}
17982 ++
17983 ++/*
17984 ++ * Disable idle window if the process thinks too long or seeks so much that
17985 ++ * it doesn't matter.
17986 ++ */
17987 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
17988 ++ struct bfq_queue *bfqq,
17989 ++ struct bfq_io_cq *bic)
17990 ++{
17991 ++ int enable_idle;
17992 ++
17993 ++ /* Don't idle for async or idle io prio class. */
17994 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
17995 ++ return;
17996 ++
17997 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
17998 ++
17999 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
18000 ++ bfqd->bfq_slice_idle == 0 ||
18001 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
18002 ++ bfqq->raising_coeff == 1))
18003 ++ enable_idle = 0;
18004 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
18005 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
18006 ++ bfqq->raising_coeff == 1)
18007 ++ enable_idle = 0;
18008 ++ else
18009 ++ enable_idle = 1;
18010 ++ }
18011 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
18012 ++ enable_idle);
18013 ++
18014 ++ if (enable_idle)
18015 ++ bfq_mark_bfqq_idle_window(bfqq);
18016 ++ else
18017 ++ bfq_clear_bfqq_idle_window(bfqq);
18018 ++}
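++
++/*
++ * Put differently (same logic as above): idling is switched off when the
++ * task no longer has an active io_context, when slice_idle is zero, or
++ * when a non-weight-raised queue is seeky on an NCQ-capable device;
++ * otherwise, once enough samples exist, the window is kept only if the
++ * queue is weight-raised or its mean think time fits within
++ * bfq_slice_idle.
++ */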
18019 ++
18020 ++/*
18021 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
18022 ++ * something we should do about it.
18023 ++ */
18024 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
18025 ++ struct request *rq)
18026 ++{
18027 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
18028 ++
18029 ++ if (rq->cmd_flags & REQ_META)
18030 ++ bfqq->meta_pending++;
18031 ++
18032 ++ bfq_update_io_thinktime(bfqd, bic);
18033 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
18034 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
18035 ++ !BFQQ_SEEKY(bfqq))
18036 ++ bfq_update_idle_window(bfqd, bfqq, bic);
18037 ++
18038 ++ bfq_log_bfqq(bfqd, bfqq,
18039 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
18040 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
18041 ++ (long long unsigned)bfqq->seek_mean);
18042 ++
18043 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
18044 ++
18045 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
18046 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
18047 ++ blk_rq_sectors(rq) < 32;
18048 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
18049 ++
18050 ++ /*
18051 ++ * There is just this request queued: if the request
18052 ++ * is small and the queue is not to be expired, then
18053 ++ * just exit.
18054 ++ *
18055 ++ * In this way, if the disk is being idled to wait for
18056 ++ * a new request from the in-service queue, we avoid
18057 ++ * unplugging the device and committing the disk to serve
18058 ++ * just a small request. On the contrary, we wait for
18059 ++ * the block layer to decide when to unplug the device:
18060 ++ * hopefully, new requests will be merged to this one
18061 ++ * quickly, then the device will be unplugged and
18062 ++ * larger requests will be dispatched.
18063 ++ */
18064 ++ if (small_req && !budget_timeout)
18065 ++ return;
18066 ++
18067 ++ /*
18068 ++ * A large enough request arrived, or the queue is to
18069 ++ * be expired: in both cases disk idling is to be
18070 ++ * stopped, so clear wait_request flag and reset
18071 ++ * timer.
18072 ++ */
18073 ++ bfq_clear_bfqq_wait_request(bfqq);
18074 ++ del_timer(&bfqd->idle_slice_timer);
18075 ++
18076 ++ /*
18077 ++ * The queue is not empty, because a new request just
18078 ++ * arrived. Hence we can safely expire the queue, in
18079 ++ * case of budget timeout, without risking that the
18080 ++ * timestamps of the queue are not updated correctly.
18081 ++ * See [1] for more details.
18082 ++ */
18083 ++ if (budget_timeout)
18084 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
18085 ++
18086 ++ /*
18087 ++ * Let the request rip immediately, or let a new queue be
18088 ++ * selected if bfqq has just been expired.
18089 ++ */
18090 ++ __blk_run_queue(bfqd->queue);
18091 ++ }
18092 ++}
18093 ++
18094 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
18095 ++{
18096 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
18097 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
18098 ++
18099 ++ assert_spin_locked(bfqd->queue->queue_lock);
18100 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
18101 ++
18102 ++ bfq_add_rq_rb(rq);
18103 ++
18104 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
18105 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
18106 ++
18107 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
18108 ++}
18109 ++
18110 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
18111 ++{
18112 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
18113 ++ bfqd->rq_in_driver);
18114 ++
18115 ++ if (bfqd->hw_tag == 1)
18116 ++ return;
18117 ++
18118 ++ /*
18119 ++ * This sample is valid if the number of outstanding requests
18120 ++ * is large enough to allow a queueing behavior. Note that the
18121 ++ * sum is not exact, as it's not taking into account deactivated
18122 ++ * requests.
18123 ++ */
18124 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
18125 ++ return;
18126 ++
18127 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
18128 ++ return;
18129 ++
18130 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
18131 ++ bfqd->max_rq_in_driver = 0;
18132 ++ bfqd->hw_tag_samples = 0;
18133 ++}
18134 ++
18135 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
18136 ++{
18137 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
18138 ++ struct bfq_data *bfqd = bfqq->bfqd;
18139 ++ const int sync = rq_is_sync(rq);
18140 ++
18141 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
18142 ++ blk_rq_sectors(rq), sync);
18143 ++
18144 ++ bfq_update_hw_tag(bfqd);
18145 ++
18146 ++ WARN_ON(!bfqd->rq_in_driver);
18147 ++ WARN_ON(!bfqq->dispatched);
18148 ++ bfqd->rq_in_driver--;
18149 ++ bfqq->dispatched--;
18150 ++
18151 ++ if (bfq_bfqq_sync(bfqq))
18152 ++ bfqd->sync_flight--;
18153 ++
18154 ++ if (sync)
18155 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
18156 ++
18157 ++ /*
18158 ++ * The computation of softrt_next_start was scheduled for the next
18159 ++ * request completion: it is now time to compute it.
18160 ++ */
18161 ++ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
18162 ++ bfqq->soft_rt_next_start =
18163 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
18164 ++
18165 ++ /*
18166 ++ * If this is the in-service queue, check if it needs to be expired,
18167 ++ * or if we want to idle in case it has no pending requests.
18168 ++ */
18169 ++ if (bfqd->in_service_queue == bfqq) {
18170 ++ if (bfq_bfqq_budget_new(bfqq))
18171 ++ bfq_set_budget_timeout(bfqd);
18172 ++
18173 ++ if (bfq_bfqq_must_idle(bfqq)) {
18174 ++ bfq_arm_slice_timer(bfqd);
18175 ++ goto out;
18176 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
18177 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
18178 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
18179 ++ (bfqq->dispatched == 0 ||
18180 ++ !bfq_bfqq_must_not_expire(bfqq)))
18181 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
18182 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
18183 ++ }
18184 ++
18185 ++ if (!bfqd->rq_in_driver)
18186 ++ bfq_schedule_dispatch(bfqd);
18187 ++
18188 ++out:
18189 ++ return;
18190 ++}
18191 ++
18192 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
18193 ++{
18194 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
18195 ++ bfq_clear_bfqq_must_alloc(bfqq);
18196 ++ return ELV_MQUEUE_MUST;
18197 ++ }
18198 ++
18199 ++ return ELV_MQUEUE_MAY;
18200 ++}
18201 ++
18202 ++static int bfq_may_queue(struct request_queue *q, int rw)
18203 ++{
18204 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
18205 ++ struct task_struct *tsk = current;
18206 ++ struct bfq_io_cq *bic;
18207 ++ struct bfq_queue *bfqq;
18208 ++
18209 ++ /*
18210 ++ * Don't force setup of a queue from here, as a call to may_queue
18211 ++ * does not necessarily imply that a request actually will be queued.
18212 ++ * So just lookup a possibly existing queue, or return 'may queue'
18213 ++ * if that fails.
18214 ++ */
18215 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
18216 ++ if (bic == NULL)
18217 ++ return ELV_MQUEUE_MAY;
18218 ++
18219 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
18220 ++ if (bfqq != NULL) {
18221 ++ bfq_init_prio_data(bfqq, bic);
18222 ++
18223 ++ return __bfq_may_queue(bfqq);
18224 ++ }
18225 ++
18226 ++ return ELV_MQUEUE_MAY;
18227 ++}
18228 ++
18229 ++/*
18230 ++ * Queue lock held here.
18231 ++ */
18232 ++static void bfq_put_request(struct request *rq)
18233 ++{
18234 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
18235 ++
18236 ++ if (bfqq != NULL) {
18237 ++ const int rw = rq_data_dir(rq);
18238 ++
18239 ++ BUG_ON(!bfqq->allocated[rw]);
18240 ++ bfqq->allocated[rw]--;
18241 ++
18242 ++ rq->elv.priv[0] = NULL;
18243 ++ rq->elv.priv[1] = NULL;
18244 ++
18245 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
18246 ++ bfqq, atomic_read(&bfqq->ref));
18247 ++ bfq_put_queue(bfqq);
18248 ++ }
18249 ++}
18250 ++
18251 ++static struct bfq_queue *
18252 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
18253 ++ struct bfq_queue *bfqq)
18254 ++{
18255 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
18256 ++ (long unsigned)bfqq->new_bfqq->pid);
18257 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
18258 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
18259 ++ bfq_put_queue(bfqq);
18260 ++ return bic_to_bfqq(bic, 1);
18261 ++}
18262 ++
18263 ++/*
18264 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
18265 ++ * was the last process referring to said bfqq.
18266 ++ */
18267 ++static struct bfq_queue *
18268 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
18269 ++{
18270 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
18271 ++ if (bfqq_process_refs(bfqq) == 1) {
18272 ++ bfqq->pid = current->pid;
18273 ++ bfq_clear_bfqq_coop(bfqq);
18274 ++ bfq_clear_bfqq_split_coop(bfqq);
18275 ++ return bfqq;
18276 ++ }
18277 ++
18278 ++ bic_set_bfqq(bic, NULL, 1);
18279 ++
18280 ++ bfq_put_cooperator(bfqq);
18281 ++
18282 ++ bfq_put_queue(bfqq);
18283 ++ return NULL;
18284 ++}
18285 ++
18286 ++/*
18287 ++ * Allocate bfq data structures associated with this request.
18288 ++ */
18289 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
18290 ++ struct bio *bio, gfp_t gfp_mask)
18291 ++{
18292 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
18293 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
18294 ++ const int rw = rq_data_dir(rq);
18295 ++ const int is_sync = rq_is_sync(rq);
18296 ++ struct bfq_queue *bfqq;
18297 ++ struct bfq_group *bfqg;
18298 ++ unsigned long flags;
18299 ++
18300 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
18301 ++
18302 ++ bfq_changed_ioprio(bic);
18303 ++
18304 ++ spin_lock_irqsave(q->queue_lock, flags);
18305 ++
18306 ++ if (bic == NULL)
18307 ++ goto queue_fail;
18308 ++
18309 ++ bfqg = bfq_bic_update_cgroup(bic);
18310 ++
18311 ++new_queue:
18312 ++ bfqq = bic_to_bfqq(bic, is_sync);
18313 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
18314 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
18315 ++ bic_set_bfqq(bic, bfqq, is_sync);
18316 ++ } else {
18317 ++ /*
18318 ++ * If the queue was seeky for too long, break it apart.
18319 ++ */
18320 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
18321 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
18322 ++ bfqq = bfq_split_bfqq(bic, bfqq);
18323 ++ if (!bfqq)
18324 ++ goto new_queue;
18325 ++ }
18326 ++
18327 ++ /*
18328 ++ * Check to see if this queue is scheduled to merge with
18329 ++ * another closely cooperating queue. The merging of queues
18330 ++ * happens here as it must be done in process context.
18331 ++ * The reference on new_bfqq was taken in merge_bfqqs.
18332 ++ */
18333 ++ if (bfqq->new_bfqq != NULL)
18334 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
18335 ++ }
18336 ++
18337 ++ bfqq->allocated[rw]++;
18338 ++ atomic_inc(&bfqq->ref);
18339 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
18340 ++ atomic_read(&bfqq->ref));
18341 ++
18342 ++ rq->elv.priv[0] = bic;
18343 ++ rq->elv.priv[1] = bfqq;
18344 ++
18345 ++ spin_unlock_irqrestore(q->queue_lock, flags);
18346 ++
18347 ++ return 0;
18348 ++
18349 ++queue_fail:
18350 ++ bfq_schedule_dispatch(bfqd);
18351 ++ spin_unlock_irqrestore(q->queue_lock, flags);
18352 ++
18353 ++ return 1;
18354 ++}
18355 ++
18356 ++static void bfq_kick_queue(struct work_struct *work)
18357 ++{
18358 ++ struct bfq_data *bfqd =
18359 ++ container_of(work, struct bfq_data, unplug_work);
18360 ++ struct request_queue *q = bfqd->queue;
18361 ++
18362 ++ spin_lock_irq(q->queue_lock);
18363 ++ __blk_run_queue(q);
18364 ++ spin_unlock_irq(q->queue_lock);
18365 ++}
18366 ++
18367 ++/*
18368 ++ * Handler of the expiration of the timer running if the in-service queue
18369 ++ * is idling inside its time slice.
18370 ++ */
18371 ++static void bfq_idle_slice_timer(unsigned long data)
18372 ++{
18373 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
18374 ++ struct bfq_queue *bfqq;
18375 ++ unsigned long flags;
18376 ++ enum bfqq_expiration reason;
18377 ++
18378 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
18379 ++
18380 ++ bfqq = bfqd->in_service_queue;
18381 ++ /*
18382 ++ * Theoretical race here: the in-service queue can be NULL or different
18383 ++ * from the queue that was idling if the timer handler spins on
18384 ++ * the queue_lock and a new request arrives for the current
18385 ++ * queue and there is a full dispatch cycle that changes the
18386 ++ * in-service queue. This can hardly happen, but in the worst case
18387 ++ * we just expire a queue too early.
18388 ++ */
18389 ++ if (bfqq != NULL) {
18390 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
18391 ++ if (bfq_bfqq_budget_timeout(bfqq))
18392 ++ /*
18393 ++ * Also here the queue can be safely expired
18394 ++ * for budget timeout without wasting
18395 ++ * guarantees
18396 ++ */
18397 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
18398 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
18399 ++ /*
18400 ++ * The queue may not be empty upon timer expiration,
18401 ++ * because we may not disable the timer when the first
18402 ++ * request of the in-service queue arrives during
18403 ++ * disk idling
18404 ++ */
18405 ++ reason = BFQ_BFQQ_TOO_IDLE;
18406 ++ else
18407 ++ goto schedule_dispatch;
18408 ++
18409 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
18410 ++ }
18411 ++
18412 ++schedule_dispatch:
18413 ++ bfq_schedule_dispatch(bfqd);
18414 ++
18415 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
18416 ++}
18417 ++
18418 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
18419 ++{
18420 ++ del_timer_sync(&bfqd->idle_slice_timer);
18421 ++ cancel_work_sync(&bfqd->unplug_work);
18422 ++}
18423 ++
18424 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
18425 ++ struct bfq_queue **bfqq_ptr)
18426 ++{
18427 ++ struct bfq_group *root_group = bfqd->root_group;
18428 ++ struct bfq_queue *bfqq = *bfqq_ptr;
18429 ++
18430 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
18431 ++ if (bfqq != NULL) {
18432 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
18433 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
18434 ++ bfqq, atomic_read(&bfqq->ref));
18435 ++ bfq_put_queue(bfqq);
18436 ++ *bfqq_ptr = NULL;
18437 ++ }
18438 ++}
18439 ++
18440 ++/*
18441 ++ * Release all the bfqg references to its async queues. If we are
18442 ++ * deallocating the group, these queues may still contain requests, so
18443 ++ * we reparent them to the root cgroup (i.e., the only one that will
18444 ++ * exist for sure until all the requests on a device are gone).
18445 ++ */
18446 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
18447 ++{
18448 ++ int i, j;
18449 ++
18450 ++ for (i = 0; i < 2; i++)
18451 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
18452 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
18453 ++
18454 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
18455 ++}
18456 ++
18457 ++static void bfq_exit_queue(struct elevator_queue *e)
18458 ++{
18459 ++ struct bfq_data *bfqd = e->elevator_data;
18460 ++ struct request_queue *q = bfqd->queue;
18461 ++ struct bfq_queue *bfqq, *n;
18462 ++
18463 ++ bfq_shutdown_timer_wq(bfqd);
18464 ++
18465 ++ spin_lock_irq(q->queue_lock);
18466 ++
18467 ++ BUG_ON(bfqd->in_service_queue != NULL);
18468 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
18469 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
18470 ++
18471 ++ bfq_disconnect_groups(bfqd);
18472 ++ spin_unlock_irq(q->queue_lock);
18473 ++
18474 ++ bfq_shutdown_timer_wq(bfqd);
18475 ++
18476 ++ synchronize_rcu();
18477 ++
18478 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
18479 ++
18480 ++ bfq_free_root_group(bfqd);
18481 ++ kfree(bfqd);
18482 ++}
18483 ++
18484 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
18485 ++{
18486 ++ struct bfq_group *bfqg;
18487 ++ struct bfq_data *bfqd;
18488 ++ struct elevator_queue *eq;
18489 ++
18490 ++ eq = elevator_alloc(q, e);
18491 ++ if (eq == NULL)
18492 ++ return -ENOMEM;
18493 ++
18494 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
18495 ++ if (bfqd == NULL) {
18496 ++ kobject_put(&eq->kobj);
18497 ++ return -ENOMEM;
18498 ++ }
18499 ++ eq->elevator_data = bfqd;
18500 ++
18501 ++ /*
18502 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
18503 ++ * Grab a permanent reference to it, so that the normal code flow
18504 ++ * will not attempt to free it.
18505 ++ */
18506 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
18507 ++ atomic_inc(&bfqd->oom_bfqq.ref);
18508 ++
18509 ++ bfqd->queue = q;
18510 ++
18511 ++ spin_lock_irq(q->queue_lock);
18512 ++ q->elevator = eq;
18513 ++ spin_unlock_irq(q->queue_lock);
18514 ++
18515 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
18516 ++ if (bfqg == NULL) {
18517 ++ kfree(bfqd);
18518 ++ kobject_put(&eq->kobj);
18519 ++ return -ENOMEM;
18520 ++ }
18521 ++
18522 ++ bfqd->root_group = bfqg;
18523 ++
18524 ++ init_timer(&bfqd->idle_slice_timer);
18525 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
18526 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
18527 ++
18528 ++ bfqd->rq_pos_tree = RB_ROOT;
18529 ++
18530 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
18531 ++
18532 ++ INIT_LIST_HEAD(&bfqd->active_list);
18533 ++ INIT_LIST_HEAD(&bfqd->idle_list);
18534 ++
18535 ++ bfqd->hw_tag = -1;
18536 ++
18537 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
18538 ++
18539 ++ bfqd->bfq_quantum = bfq_quantum;
18540 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
18541 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
18542 ++ bfqd->bfq_back_max = bfq_back_max;
18543 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
18544 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
18545 ++ bfqd->bfq_class_idle_last_service = 0;
18546 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
18547 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
18548 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
18549 ++
18550 ++ bfqd->low_latency = true;
18551 ++
18552 ++ bfqd->bfq_raising_coeff = 20;
18553 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
18554 ++ bfqd->bfq_raising_max_time = 0;
18555 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
18556 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
18557 ++ bfqd->bfq_raising_max_softrt_rate = 7000; /*
18558 ++ * Approximate rate required
18559 ++ * to playback or record a
18560 ++ * high-definition compressed
18561 ++ * video.
18562 ++ */
18563 ++ bfqd->raised_busy_queues = 0;
18564 ++
18565 ++ /* Initially estimate the device's peak rate as the reference rate */
18566 ++ if (blk_queue_nonrot(bfqd->queue)) {
18567 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
18568 ++ bfqd->peak_rate = R_nonrot;
18569 ++ } else {
18570 ++ bfqd->RT_prod = R_rot * T_rot;
18571 ++ bfqd->peak_rate = R_rot;
18572 ++ }
18573 ++
18574 ++ return 0;
18575 ++}
18576 ++
18577 ++static void bfq_slab_kill(void)
18578 ++{
18579 ++ if (bfq_pool != NULL)
18580 ++ kmem_cache_destroy(bfq_pool);
18581 ++}
18582 ++
18583 ++static int __init bfq_slab_setup(void)
18584 ++{
18585 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
18586 ++ if (bfq_pool == NULL)
18587 ++ return -ENOMEM;
18588 ++ return 0;
18589 ++}
18590 ++
18591 ++static ssize_t bfq_var_show(unsigned int var, char *page)
18592 ++{
18593 ++ return sprintf(page, "%d\n", var);
18594 ++}
18595 ++
18596 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
18597 ++{
18598 ++ unsigned long new_val;
18599 ++ int ret = kstrtoul(page, 10, &new_val);
18600 ++
18601 ++ if (ret == 0)
18602 ++ *var = new_val;
18603 ++
18604 ++ return count;
18605 ++}
18606 ++
18607 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
18608 ++{
18609 ++ struct bfq_data *bfqd = e->elevator_data;
18610 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
18611 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
18612 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
18613 ++}
18614 ++
18615 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
18616 ++{
18617 ++ struct bfq_queue *bfqq;
18618 ++ struct bfq_data *bfqd = e->elevator_data;
18619 ++ ssize_t num_char = 0;
18620 ++
18621 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
18622 ++ bfqd->queued);
18623 ++
18624 ++ spin_lock_irq(bfqd->queue->queue_lock);
18625 ++
18626 ++ num_char += sprintf(page + num_char, "Active:\n");
18627 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
18628 ++ num_char += sprintf(page + num_char,
18629 ++ "pid%d: weight %hu, nr_queued %d %d,"
18630 ++ " dur %d/%u\n",
18631 ++ bfqq->pid,
18632 ++ bfqq->entity.weight,
18633 ++ bfqq->queued[0],
18634 ++ bfqq->queued[1],
18635 ++ jiffies_to_msecs(jiffies -
18636 ++ bfqq->last_rais_start_finish),
18637 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
18638 ++ }
18639 ++
18640 ++ num_char += sprintf(page + num_char, "Idle:\n");
18641 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
18642 ++ num_char += sprintf(page + num_char,
18643 ++ "pid%d: weight %hu, dur %d/%u\n",
18644 ++ bfqq->pid,
18645 ++ bfqq->entity.weight,
18646 ++ jiffies_to_msecs(jiffies -
18647 ++ bfqq->last_rais_start_finish),
18648 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
18649 ++ }
18650 ++
18651 ++ spin_unlock_irq(bfqd->queue->queue_lock);
18652 ++
18653 ++ return num_char;
18654 ++}
18655 ++
18656 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
18657 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
18658 ++{ \
18659 ++ struct bfq_data *bfqd = e->elevator_data; \
18660 ++ unsigned int __data = __VAR; \
18661 ++ if (__CONV) \
18662 ++ __data = jiffies_to_msecs(__data); \
18663 ++ return bfq_var_show(__data, (page)); \
18664 ++}
18665 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
18666 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
18667 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
18668 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
18669 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
18670 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
18671 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
18672 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
18673 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
18674 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
18675 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
18676 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
18677 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
18678 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
18679 ++ 1);
18680 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
18681 ++ bfqd->bfq_raising_min_inter_arr_async,
18682 ++ 1);
18683 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
18684 ++ bfqd->bfq_raising_max_softrt_rate, 0);
18685 ++#undef SHOW_FUNCTION
18686 ++
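As an illustration of the sysfs glue above, this is roughly what the preprocessor generates for the SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1) invocation; an expansion sketch for readability only, not part of the patch itself:

static ssize_t bfq_slice_idle_show(struct elevator_queue *e, char *page)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned int __data = bfqd->bfq_slice_idle;	/* __VAR */
	if (1)					/* __CONV: jiffies -> msecs */
		__data = jiffies_to_msecs(__data);
	return bfq_var_show(__data, (page));
}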
18687 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
18688 ++static ssize_t \
18689 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
18690 ++{ \
18691 ++ struct bfq_data *bfqd = e->elevator_data; \
18692 ++ unsigned long uninitialized_var(__data); \
18693 ++ int ret = bfq_var_store(&__data, (page), count); \
18694 ++ if (__data < (MIN)) \
18695 ++ __data = (MIN); \
18696 ++ else if (__data > (MAX)) \
18697 ++ __data = (MAX); \
18698 ++ if (__CONV) \
18699 ++ *(__PTR) = msecs_to_jiffies(__data); \
18700 ++ else \
18701 ++ *(__PTR) = __data; \
18702 ++ return ret; \
18703 ++}
18704 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
18705 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
18706 ++ INT_MAX, 1);
18707 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
18708 ++ INT_MAX, 1);
18709 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
18710 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
18711 ++ INT_MAX, 0);
18712 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
18713 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
18714 ++ 1, INT_MAX, 0);
18715 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
18716 ++ INT_MAX, 1);
18717 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
18718 ++ INT_MAX, 0);
18719 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
18720 ++ INT_MAX, 1);
18721 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
18722 ++ INT_MAX, 1);
18723 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
18724 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
18725 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
18726 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
18727 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
18728 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
18729 ++#undef STORE_FUNCTION
18730 ++
18731 ++/* do nothing for the moment */
18732 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
18733 ++ const char *page, size_t count)
18734 ++{
18735 ++ return count;
18736 ++}
18737 ++
18738 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
18739 ++{
18740 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
18741 ++
18742 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
18743 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
18744 ++ else
18745 ++ return bfq_default_max_budget;
18746 ++}
18747 ++
18748 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
18749 ++ const char *page, size_t count)
18750 ++{
18751 ++ struct bfq_data *bfqd = e->elevator_data;
18752 ++ unsigned long uninitialized_var(__data);
18753 ++ int ret = bfq_var_store(&__data, (page), count);
18754 ++
18755 ++ if (__data == 0)
18756 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
18757 ++ else {
18758 ++ if (__data > INT_MAX)
18759 ++ __data = INT_MAX;
18760 ++ bfqd->bfq_max_budget = __data;
18761 ++ }
18762 ++
18763 ++ bfqd->bfq_user_max_budget = __data;
18764 ++
18765 ++ return ret;
18766 ++}
18767 ++
18768 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
18769 ++ const char *page, size_t count)
18770 ++{
18771 ++ struct bfq_data *bfqd = e->elevator_data;
18772 ++ unsigned long uninitialized_var(__data);
18773 ++ int ret = bfq_var_store(&__data, (page), count);
18774 ++
18775 ++ if (__data < 1)
18776 ++ __data = 1;
18777 ++ else if (__data > INT_MAX)
18778 ++ __data = INT_MAX;
18779 ++
18780 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
18781 ++ if (bfqd->bfq_user_max_budget == 0)
18782 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
18783 ++
18784 ++ return ret;
18785 ++}
18786 ++
18787 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
18788 ++ const char *page, size_t count)
18789 ++{
18790 ++ struct bfq_data *bfqd = e->elevator_data;
18791 ++ unsigned long uninitialized_var(__data);
18792 ++ int ret = bfq_var_store(&__data, (page), count);
18793 ++
18794 ++ if (__data > 1)
18795 ++ __data = 1;
18796 ++ if (__data == 0 && bfqd->low_latency != 0)
18797 ++ bfq_end_raising(bfqd);
18798 ++ bfqd->low_latency = __data;
18799 ++
18800 ++ return ret;
18801 ++}
18802 ++
18803 ++#define BFQ_ATTR(name) \
18804 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
18805 ++
18806 ++static struct elv_fs_entry bfq_attrs[] = {
18807 ++ BFQ_ATTR(quantum),
18808 ++ BFQ_ATTR(fifo_expire_sync),
18809 ++ BFQ_ATTR(fifo_expire_async),
18810 ++ BFQ_ATTR(back_seek_max),
18811 ++ BFQ_ATTR(back_seek_penalty),
18812 ++ BFQ_ATTR(slice_idle),
18813 ++ BFQ_ATTR(max_budget),
18814 ++ BFQ_ATTR(max_budget_async_rq),
18815 ++ BFQ_ATTR(timeout_sync),
18816 ++ BFQ_ATTR(timeout_async),
18817 ++ BFQ_ATTR(low_latency),
18818 ++ BFQ_ATTR(raising_coeff),
18819 ++ BFQ_ATTR(raising_max_time),
18820 ++ BFQ_ATTR(raising_rt_max_time),
18821 ++ BFQ_ATTR(raising_min_idle_time),
18822 ++ BFQ_ATTR(raising_min_inter_arr_async),
18823 ++ BFQ_ATTR(raising_max_softrt_rate),
18824 ++ BFQ_ATTR(weights),
18825 ++ __ATTR_NULL
18826 ++};
18827 ++
18828 ++static struct elevator_type iosched_bfq = {
18829 ++ .ops = {
18830 ++ .elevator_merge_fn = bfq_merge,
18831 ++ .elevator_merged_fn = bfq_merged_request,
18832 ++ .elevator_merge_req_fn = bfq_merged_requests,
18833 ++ .elevator_allow_merge_fn = bfq_allow_merge,
18834 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
18835 ++ .elevator_add_req_fn = bfq_insert_request,
18836 ++ .elevator_activate_req_fn = bfq_activate_request,
18837 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
18838 ++ .elevator_completed_req_fn = bfq_completed_request,
18839 ++ .elevator_former_req_fn = elv_rb_former_request,
18840 ++ .elevator_latter_req_fn = elv_rb_latter_request,
18841 ++ .elevator_init_icq_fn = bfq_init_icq,
18842 ++ .elevator_exit_icq_fn = bfq_exit_icq,
18843 ++ .elevator_set_req_fn = bfq_set_request,
18844 ++ .elevator_put_req_fn = bfq_put_request,
18845 ++ .elevator_may_queue_fn = bfq_may_queue,
18846 ++ .elevator_init_fn = bfq_init_queue,
18847 ++ .elevator_exit_fn = bfq_exit_queue,
18848 ++ },
18849 ++ .icq_size = sizeof(struct bfq_io_cq),
18850 ++ .icq_align = __alignof__(struct bfq_io_cq),
18851 ++ .elevator_attrs = bfq_attrs,
18852 ++ .elevator_name = "bfq",
18853 ++ .elevator_owner = THIS_MODULE,
18854 ++};
18855 ++
18856 ++static int __init bfq_init(void)
18857 ++{
18858 ++ /*
18859 ++ * Can be 0 on HZ < 1000 setups.
18860 ++ */
18861 ++ if (bfq_slice_idle == 0)
18862 ++ bfq_slice_idle = 1;
18863 ++
18864 ++ if (bfq_timeout_async == 0)
18865 ++ bfq_timeout_async = 1;
18866 ++
18867 ++ if (bfq_slab_setup())
18868 ++ return -ENOMEM;
18869 ++
18870 ++ elv_register(&iosched_bfq);
18871 ++ printk(KERN_INFO "BFQ I/O-scheduler version: v7r1\n");
18872 ++
18873 ++ return 0;
18874 ++}
18875 ++
18876 ++static void __exit bfq_exit(void)
18877 ++{
18878 ++ elv_unregister(&iosched_bfq);
18879 ++ bfq_slab_kill();
18880 ++}
18881 ++
18882 ++module_init(bfq_init);
18883 ++module_exit(bfq_exit);
18884 ++
18885 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
18886 ++MODULE_LICENSE("GPL");
18887 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
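To illustrate how the elevator registered by bfq_init() above is typically driven from userspace, here is a minimal sketch that selects BFQ for a device and toggles one of the tunables exported through bfq_attrs[]. The device name "sda" and the exact sysfs paths are assumptions for the example; adjust them to the system at hand.

#include <stdio.h>

/* Write a single value to a sysfs attribute; returns 0 on success. */
static int write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f == NULL)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* Make BFQ the active elevator for the (assumed) device sda. */
	write_sysfs("/sys/block/sda/queue/scheduler", "bfq");

	/*
	 * The attributes declared in bfq_attrs[] then appear under
	 * queue/iosched/; for instance, enable the low_latency heuristics.
	 */
	write_sysfs("/sys/block/sda/queue/iosched/low_latency", "1");
	return 0;
}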
18888 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
18889 +new file mode 100644
18890 +index 0000000..999b475
18891 +--- /dev/null
18892 ++++ b/block/bfq-sched.c
18893 +@@ -0,0 +1,1078 @@
18894 ++/*
18895 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
18896 ++ *
18897 ++ * Based on ideas and code from CFQ:
18898 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
18899 ++ *
18900 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
18901 ++ * Paolo Valente <paolo.valente@×××××××.it>
18902 ++ *
18903 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
18904 ++ */
18905 ++
18906 ++#ifdef CONFIG_CGROUP_BFQIO
18907 ++#define for_each_entity(entity) \
18908 ++ for (; entity != NULL; entity = entity->parent)
18909 ++
18910 ++#define for_each_entity_safe(entity, parent) \
18911 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
18912 ++
18913 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
18914 ++ int extract,
18915 ++ struct bfq_data *bfqd);
18916 ++
18917 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
18918 ++{
18919 ++ struct bfq_entity *bfqg_entity;
18920 ++ struct bfq_group *bfqg;
18921 ++ struct bfq_sched_data *group_sd;
18922 ++
18923 ++ BUG_ON(next_in_service == NULL);
18924 ++
18925 ++ group_sd = next_in_service->sched_data;
18926 ++
18927 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
18928 ++ /*
18929 ++ * bfq_group's my_entity field is not NULL only if the group
18930 ++ * is not the root group. We must not touch the root entity
18931 ++ * as it must never become an in-service entity.
18932 ++ */
18933 ++ bfqg_entity = bfqg->my_entity;
18934 ++ if (bfqg_entity != NULL)
18935 ++ bfqg_entity->budget = next_in_service->budget;
18936 ++}
18937 ++
18938 ++static int bfq_update_next_in_service(struct bfq_sched_data *sd)
18939 ++{
18940 ++ struct bfq_entity *next_in_service;
18941 ++
18942 ++ if (sd->in_service_entity != NULL)
18943 ++ /* will update/requeue at the end of service */
18944 ++ return 0;
18945 ++
18946 ++ /*
18947 ++ * NOTE: this can be improved in many ways, such as returning
18948 ++ * 1 (and thus propagating the update upwards) only when the
18949 ++ * budget changes, or caching the bfqq that will be scheduled
18950 ++ * next from this subtree. For now we worry more about
18951 ++ * correctness than about performance...
18952 ++ */
18953 ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
18954 ++ sd->next_in_service = next_in_service;
18955 ++
18956 ++ if (next_in_service != NULL)
18957 ++ bfq_update_budget(next_in_service);
18958 ++
18959 ++ return 1;
18960 ++}
18961 ++
18962 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
18963 ++ struct bfq_entity *entity)
18964 ++{
18965 ++ BUG_ON(sd->next_in_service != entity);
18966 ++}
18967 ++#else
18968 ++#define for_each_entity(entity) \
18969 ++ for (; entity != NULL; entity = NULL)
18970 ++
18971 ++#define for_each_entity_safe(entity, parent) \
18972 ++ for (parent = NULL; entity != NULL; entity = parent)
18973 ++
18974 ++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
18975 ++{
18976 ++ return 0;
18977 ++}
18978 ++
18979 ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
18980 ++ struct bfq_entity *entity)
18981 ++{
18982 ++}
18983 ++
18984 ++static inline void bfq_update_budget(struct bfq_entity *next_in_service)
18985 ++{
18986 ++}
18987 ++#endif
18988 ++
18989 ++/*
18990 ++ * Shift for timestamp calculations. This actually limits the maximum
18991 ++ * service allowed in one timestamp delta (small shift values increase it),
18992 ++ * the maximum total weight that can be used for the queues in the system
18993 ++ * (big shift values increase it), and the period of virtual time wraparounds.
18994 ++ */
18995 ++#define WFQ_SERVICE_SHIFT 22
18996 ++
18997 ++/**
18998 ++ * bfq_gt - compare two timestamps.
18999 ++ * @a: first ts.
19000 ++ * @b: second ts.
19001 ++ *
19002 ++ * Return @a > @b, dealing with wrapping correctly.
19003 ++ */
19004 ++static inline int bfq_gt(u64 a, u64 b)
19005 ++{
19006 ++ return (s64)(a - b) > 0;
19007 ++}
19008 ++
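The signed-difference trick in bfq_gt() keeps the ordering of timestamps correct even after the 64-bit virtual clock wraps. A small standalone check of the same expression (illustration only, not part of BFQ):

#include <assert.h>
#include <stdint.h>

static int toy_bfq_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;	/* same expression as bfq_gt() */
}

int main(void)
{
	assert(toy_bfq_gt(10, 3));		/* ordinary case */
	assert(!toy_bfq_gt(3, 10));
	/*
	 * 'a' has just wrapped past zero while 'b' sits near the top of
	 * the range: a naive 'a > b' would call 'b' larger, the signed
	 * difference still reports 'a' as the later timestamp.
	 */
	assert(toy_bfq_gt(5, UINT64_MAX - 5));
	return 0;
}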
19009 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
19010 ++{
19011 ++ struct bfq_queue *bfqq = NULL;
19012 ++
19013 ++ BUG_ON(entity == NULL);
19014 ++
19015 ++ if (entity->my_sched_data == NULL)
19016 ++ bfqq = container_of(entity, struct bfq_queue, entity);
19017 ++
19018 ++ return bfqq;
19019 ++}
19020 ++
19021 ++
19022 ++/**
19023 ++ * bfq_delta - map service into the virtual time domain.
19024 ++ * @service: amount of service.
19025 ++ * @weight: scale factor (weight of an entity or weight sum).
19026 ++ */
19027 ++static inline u64 bfq_delta(unsigned long service,
19028 ++ unsigned long weight)
19029 ++{
19030 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
19031 ++
19032 ++ do_div(d, weight);
19033 ++ return d;
19034 ++}
19035 ++
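bfq_delta() is the core of the weighting: the finish timestamp computed by bfq_calc_finish() below is start + (service << WFQ_SERVICE_SHIFT) / weight, so doubling the weight halves how far the finish time moves for the same amount of service. A toy user-space rendition with made-up numbers (illustration only):

#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22	/* same shift as in the patch */

static uint64_t toy_bfq_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	unsigned long service = 4096;	/* arbitrary amount of service */
	uint64_t start = 0;

	/*
	 * The weight-2 entity gets a finish timestamp twice as close to
	 * its start time, so it becomes eligible for service again sooner
	 * and ends up with twice the share of the weight-1 entity.
	 */
	printf("finish(w=1) = %llu\n",
	       (unsigned long long)(start + toy_bfq_delta(service, 1)));
	printf("finish(w=2) = %llu\n",
	       (unsigned long long)(start + toy_bfq_delta(service, 2)));
	return 0;
}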
19036 ++/**
19037 ++ * bfq_calc_finish - assign the finish time to an entity.
19038 ++ * @entity: the entity to act upon.
19039 ++ * @service: the service to be charged to the entity.
19040 ++ */
19041 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
19042 ++ unsigned long service)
19043 ++{
19044 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19045 ++
19046 ++ BUG_ON(entity->weight == 0);
19047 ++
19048 ++ entity->finish = entity->start +
19049 ++ bfq_delta(service, entity->weight);
19050 ++
19051 ++ if (bfqq != NULL) {
19052 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
19053 ++ "calc_finish: serv %lu, w %d",
19054 ++ service, entity->weight);
19055 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
19056 ++ "calc_finish: start %llu, finish %llu, delta %llu",
19057 ++ entity->start, entity->finish,
19058 ++ bfq_delta(service, entity->weight));
19059 ++ }
19060 ++}
19061 ++
19062 ++/**
19063 ++ * bfq_entity_of - get an entity from a node.
19064 ++ * @node: the node field of the entity.
19065 ++ *
19066 ++ * Convert a node pointer to the relative entity. This is used only
19067 ++ * to simplify the logic of some functions and not as the generic
19068 ++ * conversion mechanism because, e.g., in the tree walking functions,
19069 ++ * the check for a %NULL value would be redundant.
19070 ++ */
19071 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
19072 ++{
19073 ++ struct bfq_entity *entity = NULL;
19074 ++
19075 ++ if (node != NULL)
19076 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
19077 ++
19078 ++ return entity;
19079 ++}
19080 ++
19081 ++/**
19082 ++ * bfq_extract - remove an entity from a tree.
19083 ++ * @root: the tree root.
19084 ++ * @entity: the entity to remove.
19085 ++ */
19086 ++static inline void bfq_extract(struct rb_root *root,
19087 ++ struct bfq_entity *entity)
19088 ++{
19089 ++ BUG_ON(entity->tree != root);
19090 ++
19091 ++ entity->tree = NULL;
19092 ++ rb_erase(&entity->rb_node, root);
19093 ++}
19094 ++
19095 ++/**
19096 ++ * bfq_idle_extract - extract an entity from the idle tree.
19097 ++ * @st: the service tree of the owning @entity.
19098 ++ * @entity: the entity being removed.
19099 ++ */
19100 ++static void bfq_idle_extract(struct bfq_service_tree *st,
19101 ++ struct bfq_entity *entity)
19102 ++{
19103 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19104 ++ struct rb_node *next;
19105 ++
19106 ++ BUG_ON(entity->tree != &st->idle);
19107 ++
19108 ++ if (entity == st->first_idle) {
19109 ++ next = rb_next(&entity->rb_node);
19110 ++ st->first_idle = bfq_entity_of(next);
19111 ++ }
19112 ++
19113 ++ if (entity == st->last_idle) {
19114 ++ next = rb_prev(&entity->rb_node);
19115 ++ st->last_idle = bfq_entity_of(next);
19116 ++ }
19117 ++
19118 ++ bfq_extract(&st->idle, entity);
19119 ++
19120 ++ if (bfqq != NULL)
19121 ++ list_del(&bfqq->bfqq_list);
19122 ++}
19123 ++
19124 ++/**
19125 ++ * bfq_insert - generic tree insertion.
19126 ++ * @root: tree root.
19127 ++ * @entity: entity to insert.
19128 ++ *
19129 ++ * This is used for the idle and the active tree, since they are both
19130 ++ * ordered by finish time.
19131 ++ */
19132 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
19133 ++{
19134 ++ struct bfq_entity *entry;
19135 ++ struct rb_node **node = &root->rb_node;
19136 ++ struct rb_node *parent = NULL;
19137 ++
19138 ++ BUG_ON(entity->tree != NULL);
19139 ++
19140 ++ while (*node != NULL) {
19141 ++ parent = *node;
19142 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
19143 ++
19144 ++ if (bfq_gt(entry->finish, entity->finish))
19145 ++ node = &parent->rb_left;
19146 ++ else
19147 ++ node = &parent->rb_right;
19148 ++ }
19149 ++
19150 ++ rb_link_node(&entity->rb_node, parent, node);
19151 ++ rb_insert_color(&entity->rb_node, root);
19152 ++
19153 ++ entity->tree = root;
19154 ++}
19155 ++
19156 ++/**
19157 ++ * bfq_update_min - update the min_start field of an entity.
19158 ++ * @entity: the entity to update.
19159 ++ * @node: one of its children.
19160 ++ *
19161 ++ * This function is called when @entity may store an invalid value for
19162 ++ * min_start due to updates to the active tree. The function assumes
19163 ++ * that the subtree rooted at @node (which may be its left or its right
19164 ++ * child) has a valid min_start value.
19165 ++ */
19166 ++static inline void bfq_update_min(struct bfq_entity *entity,
19167 ++ struct rb_node *node)
19168 ++{
19169 ++ struct bfq_entity *child;
19170 ++
19171 ++ if (node != NULL) {
19172 ++ child = rb_entry(node, struct bfq_entity, rb_node);
19173 ++ if (bfq_gt(entity->min_start, child->min_start))
19174 ++ entity->min_start = child->min_start;
19175 ++ }
19176 ++}
19177 ++
19178 ++/**
19179 ++ * bfq_update_active_node - recalculate min_start.
19180 ++ * @node: the node to update.
19181 ++ *
19182 ++ * @node may have changed position or one of its children may have moved;
19183 ++ * this function updates its min_start value. The left and right subtrees
19184 ++ * are assumed to hold a correct min_start value.
19185 ++ */
19186 ++static inline void bfq_update_active_node(struct rb_node *node)
19187 ++{
19188 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
19189 ++
19190 ++ entity->min_start = entity->start;
19191 ++ bfq_update_min(entity, node->rb_right);
19192 ++ bfq_update_min(entity, node->rb_left);
19193 ++}
19194 ++
19195 ++/**
19196 ++ * bfq_update_active_tree - update min_start for the whole active tree.
19197 ++ * @node: the starting node.
19198 ++ *
19199 ++ * @node must be the deepest modified node after an update. This function
19200 ++ * updates its min_start using the values held by its children, assuming
19201 ++ * that they did not change, and then updates all the nodes that may have
19202 ++ * changed in the path to the root. The only nodes that may have changed
19203 ++ * are the ones in the path or their siblings.
19204 ++ */
19205 ++static void bfq_update_active_tree(struct rb_node *node)
19206 ++{
19207 ++ struct rb_node *parent;
19208 ++
19209 ++up:
19210 ++ bfq_update_active_node(node);
19211 ++
19212 ++ parent = rb_parent(node);
19213 ++ if (parent == NULL)
19214 ++ return;
19215 ++
19216 ++ if (node == parent->rb_left && parent->rb_right != NULL)
19217 ++ bfq_update_active_node(parent->rb_right);
19218 ++ else if (parent->rb_left != NULL)
19219 ++ bfq_update_active_node(parent->rb_left);
19220 ++
19221 ++ node = parent;
19222 ++ goto up;
19223 ++}
19224 ++
19225 ++/**
19226 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
19227 ++ * @st: the service tree of the entity.
19228 ++ * @entity: the entity being inserted.
19229 ++ *
19230 ++ * The active tree is ordered by finish time, but an extra key is kept
19231 ++ * for each node, containing the minimum value for the start times of
19232 ++ * its children (and the node itself), so it's possible to search for
19233 ++ * the eligible node with the lowest finish time in logarithmic time.
19234 ++ */
19235 ++static void bfq_active_insert(struct bfq_service_tree *st,
19236 ++ struct bfq_entity *entity)
19237 ++{
19238 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19239 ++ struct rb_node *node = &entity->rb_node;
19240 ++
19241 ++ bfq_insert(&st->active, entity);
19242 ++
19243 ++ if (node->rb_left != NULL)
19244 ++ node = node->rb_left;
19245 ++ else if (node->rb_right != NULL)
19246 ++ node = node->rb_right;
19247 ++
19248 ++ bfq_update_active_tree(node);
19249 ++
19250 ++ if (bfqq != NULL)
19251 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
19252 ++}
19253 ++
19254 ++/**
19255 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
19256 ++ * @ioprio: the ioprio value to convert.
19257 ++ */
19258 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
19259 ++{
19260 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
19261 ++ return IOPRIO_BE_NR - ioprio;
19262 ++}
19263 ++
19264 ++/**
19265 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
19266 ++ * @weight: the weight value to convert.
19267 ++ *
19268 ++ * To preserve as much as possible the old only-ioprio user interface,
19269 ++ * 0 is used as an escape ioprio value for weights (numerically) equal to
19270 ++ * or larger than IOPRIO_BE_NR.
19271 ++ */
19272 ++static unsigned short bfq_weight_to_ioprio(int weight)
19273 ++{
19274 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
19275 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
19276 ++}
19277 ++
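The two conversion helpers above implement a simple linear mapping; with IOPRIO_BE_NR equal to 8 in the kernel headers, ioprio 0..7 becomes weight 8..1, and any weight of 8 or more reports the escape ioprio 0. A quick user-space round-trip check (illustration only, not part of the patch):

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* value from <linux/ioprio.h> */

static unsigned short toy_ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

static unsigned short toy_weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	int ioprio;

	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %hu -> ioprio %hu\n", ioprio,
		       toy_ioprio_to_weight(ioprio),
		       toy_weight_to_ioprio(toy_ioprio_to_weight(ioprio)));

	/* Large weights (reachable only through cgroups) all map to 0. */
	printf("weight 300 -> ioprio %hu\n", toy_weight_to_ioprio(300));
	return 0;
}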
19278 ++static inline void bfq_get_entity(struct bfq_entity *entity)
19279 ++{
19280 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19281 ++ struct bfq_sched_data *sd;
19282 ++
19283 ++ if (bfqq != NULL) {
19284 ++ sd = entity->sched_data;
19285 ++ atomic_inc(&bfqq->ref);
19286 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
19287 ++ bfqq, atomic_read(&bfqq->ref));
19288 ++ }
19289 ++}
19290 ++
19291 ++/**
19292 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
19293 ++ * @node: the node being removed.
19294 ++ *
19295 ++ * Do the first step of an extraction in an rb tree, looking for the
19296 ++ * node that will replace @node, and returning the deepest node that
19297 ++ * the following modifications to the tree can touch. If @node is the
19298 ++ * last node in the tree return %NULL.
19299 ++ */
19300 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
19301 ++{
19302 ++ struct rb_node *deepest;
19303 ++
19304 ++ if (node->rb_right == NULL && node->rb_left == NULL)
19305 ++ deepest = rb_parent(node);
19306 ++ else if (node->rb_right == NULL)
19307 ++ deepest = node->rb_left;
19308 ++ else if (node->rb_left == NULL)
19309 ++ deepest = node->rb_right;
19310 ++ else {
19311 ++ deepest = rb_next(node);
19312 ++ if (deepest->rb_right != NULL)
19313 ++ deepest = deepest->rb_right;
19314 ++ else if (rb_parent(deepest) != node)
19315 ++ deepest = rb_parent(deepest);
19316 ++ }
19317 ++
19318 ++ return deepest;
19319 ++}
19320 ++
19321 ++/**
19322 ++ * bfq_active_extract - remove an entity from the active tree.
19323 ++ * @st: the service_tree containing the tree.
19324 ++ * @entity: the entity being removed.
19325 ++ */
19326 ++static void bfq_active_extract(struct bfq_service_tree *st,
19327 ++ struct bfq_entity *entity)
19328 ++{
19329 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19330 ++ struct rb_node *node;
19331 ++
19332 ++ node = bfq_find_deepest(&entity->rb_node);
19333 ++ bfq_extract(&st->active, entity);
19334 ++
19335 ++ if (node != NULL)
19336 ++ bfq_update_active_tree(node);
19337 ++
19338 ++ if (bfqq != NULL)
19339 ++ list_del(&bfqq->bfqq_list);
19340 ++}
19341 ++
19342 ++/**
19343 ++ * bfq_idle_insert - insert an entity into the idle tree.
19344 ++ * @st: the service tree containing the tree.
19345 ++ * @entity: the entity to insert.
19346 ++ */
19347 ++static void bfq_idle_insert(struct bfq_service_tree *st,
19348 ++ struct bfq_entity *entity)
19349 ++{
19350 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19351 ++ struct bfq_entity *first_idle = st->first_idle;
19352 ++ struct bfq_entity *last_idle = st->last_idle;
19353 ++
19354 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
19355 ++ st->first_idle = entity;
19356 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
19357 ++ st->last_idle = entity;
19358 ++
19359 ++ bfq_insert(&st->idle, entity);
19360 ++
19361 ++ if (bfqq != NULL)
19362 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
19363 ++}
19364 ++
19365 ++/**
19366 ++ * bfq_forget_entity - remove an entity from the wfq trees.
19367 ++ * @st: the service tree.
19368 ++ * @entity: the entity being removed.
19369 ++ *
19370 ++ * Update the device status and forget everything about @entity, putting
19371 ++ * the device reference to it, if it is a queue. Entities belonging to
19372 ++ * groups are not refcounted.
19373 ++ */
19374 ++static void bfq_forget_entity(struct bfq_service_tree *st,
19375 ++ struct bfq_entity *entity)
19376 ++{
19377 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19378 ++ struct bfq_sched_data *sd;
19379 ++
19380 ++ BUG_ON(!entity->on_st);
19381 ++
19382 ++ entity->on_st = 0;
19383 ++ st->wsum -= entity->weight;
19384 ++ if (bfqq != NULL) {
19385 ++ sd = entity->sched_data;
19386 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
19387 ++ bfqq, atomic_read(&bfqq->ref));
19388 ++ bfq_put_queue(bfqq);
19389 ++ }
19390 ++}
19391 ++
19392 ++/**
19393 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
19394 ++ * @st: service tree for the entity.
19395 ++ * @entity: the entity being released.
19396 ++ */
19397 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
19398 ++ struct bfq_entity *entity)
19399 ++{
19400 ++ bfq_idle_extract(st, entity);
19401 ++ bfq_forget_entity(st, entity);
19402 ++}
19403 ++
19404 ++/**
19405 ++ * bfq_forget_idle - update the idle tree if necessary.
19406 ++ * @st: the service tree to act upon.
19407 ++ *
19408 ++ * To preserve the global O(log N) complexity we only remove one entry here;
19409 ++ * as the idle tree will not grow indefinitely this can be done safely.
19410 ++ */
19411 ++static void bfq_forget_idle(struct bfq_service_tree *st)
19412 ++{
19413 ++ struct bfq_entity *first_idle = st->first_idle;
19414 ++ struct bfq_entity *last_idle = st->last_idle;
19415 ++
19416 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
19417 ++ !bfq_gt(last_idle->finish, st->vtime)) {
19418 ++ /*
19419 ++ * Forget the whole idle tree, increasing the vtime past
19420 ++ * the last finish time of idle entities.
19421 ++ */
19422 ++ st->vtime = last_idle->finish;
19423 ++ }
19424 ++
19425 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
19426 ++ bfq_put_idle_entity(st, first_idle);
19427 ++}
19428 ++
19429 ++static struct bfq_service_tree *
19430 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
19431 ++ struct bfq_entity *entity)
19432 ++{
19433 ++ struct bfq_service_tree *new_st = old_st;
19434 ++
19435 ++ if (entity->ioprio_changed) {
19436 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
19437 ++
19438 ++ BUG_ON(old_st->wsum < entity->weight);
19439 ++ old_st->wsum -= entity->weight;
19440 ++
19441 ++ if (entity->new_weight != entity->orig_weight) {
19442 ++ entity->orig_weight = entity->new_weight;
19443 ++ entity->ioprio =
19444 ++ bfq_weight_to_ioprio(entity->orig_weight);
19445 ++ } else if (entity->new_ioprio != entity->ioprio) {
19446 ++ entity->ioprio = entity->new_ioprio;
19447 ++ entity->orig_weight =
19448 ++ bfq_ioprio_to_weight(entity->ioprio);
19449 ++ } else
19450 ++ entity->new_weight = entity->orig_weight =
19451 ++ bfq_ioprio_to_weight(entity->ioprio);
19452 ++
19453 ++ entity->ioprio_class = entity->new_ioprio_class;
19454 ++ entity->ioprio_changed = 0;
19455 ++
19456 ++ /*
19457 ++ * NOTE: here we may be changing the weight too early;
19458 ++ * this will cause unfairness. The correct approach
19459 ++ * would have required additional complexity to defer
19460 ++ * weight changes to the proper time instants (i.e.,
19461 ++ * when entity->finish <= old_st->vtime).
19462 ++ */
19463 ++ new_st = bfq_entity_service_tree(entity);
19464 ++ entity->weight = entity->orig_weight *
19465 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
19466 ++ new_st->wsum += entity->weight;
19467 ++
19468 ++ if (new_st != old_st)
19469 ++ entity->start = new_st->vtime;
19470 ++ }
19471 ++
19472 ++ return new_st;
19473 ++}
19474 ++
19475 ++/**
19476 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
19477 ++ * @bfqq: the queue being served.
19478 ++ * @served: bytes to transfer.
19479 ++ *
19480 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
19481 ++ * are synchronized every time a new bfqq is selected for service. By now,
19482 ++ * we keep it to better check consistency.
19483 ++ */
19484 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
19485 ++{
19486 ++ struct bfq_entity *entity = &bfqq->entity;
19487 ++ struct bfq_service_tree *st;
19488 ++
19489 ++ for_each_entity(entity) {
19490 ++ st = bfq_entity_service_tree(entity);
19491 ++
19492 ++ entity->service += served;
19493 ++ BUG_ON(entity->service > entity->budget);
19494 ++ BUG_ON(st->wsum == 0);
19495 ++
19496 ++ st->vtime += bfq_delta(served, st->wsum);
19497 ++ bfq_forget_idle(st);
19498 ++ }
19499 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
19500 ++}
19501 ++
19502 ++/**
19503 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
19504 ++ * @bfqq: the queue that needs a service update.
19505 ++ *
19506 ++ * When it's not possible to be fair in the service domain, because
19507 ++ * a queue is not consuming its budget fast enough (the meaning of
19508 ++ * fast depends on the timeout parameter), we charge it a full
19509 ++ * budget. In this way we should obtain a sort of time-domain
19510 ++ * fairness among all the seeky/slow queues.
19511 ++ */
19512 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
19513 ++{
19514 ++ struct bfq_entity *entity = &bfqq->entity;
19515 ++
19516 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
19517 ++
19518 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
19519 ++}
19520 ++
19521 ++/**
19522 ++ * __bfq_activate_entity - activate an entity.
19523 ++ * @entity: the entity being activated.
19524 ++ *
19525 ++ * Called whenever an entity is activated, i.e., it is not active and one
19526 ++ * of its children receives a new request, or has to be reactivated due to
19527 ++ * budget exhaustion. It uses the current budget of the entity (and the
19528 ++ * service received if @entity is active) of the queue to calculate its
19529 ++ * timestamps.
19530 ++ */
19531 ++static void __bfq_activate_entity(struct bfq_entity *entity)
19532 ++{
19533 ++ struct bfq_sched_data *sd = entity->sched_data;
19534 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
19535 ++
19536 ++ if (entity == sd->in_service_entity) {
19537 ++ BUG_ON(entity->tree != NULL);
19538 ++ /*
19539 ++ * If we are requeueing the current entity, we have
19540 ++ * to take care not to charge it for service it has
19541 ++ * not received.
19542 ++ */
19543 ++ bfq_calc_finish(entity, entity->service);
19544 ++ entity->start = entity->finish;
19545 ++ sd->in_service_entity = NULL;
19546 ++ } else if (entity->tree == &st->active) {
19547 ++ /*
19548 ++ * Requeueing an entity due to a change of some
19549 ++ * next_in_service entity below it. We reuse the
19550 ++ * old start time.
19551 ++ */
19552 ++ bfq_active_extract(st, entity);
19553 ++ } else if (entity->tree == &st->idle) {
19554 ++ /*
19555 ++ * Must be on the idle tree, bfq_idle_extract() will
19556 ++ * check for that.
19557 ++ */
19558 ++ bfq_idle_extract(st, entity);
19559 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
19560 ++ st->vtime : entity->finish;
19561 ++ } else {
19562 ++ /*
19563 ++ * The finish time of the entity may be invalid, and
19564 ++ * it is in the past for sure, otherwise the queue
19565 ++ * would have been on the idle tree.
19566 ++ */
19567 ++ entity->start = st->vtime;
19568 ++ st->wsum += entity->weight;
19569 ++ bfq_get_entity(entity);
19570 ++
19571 ++ BUG_ON(entity->on_st);
19572 ++ entity->on_st = 1;
19573 ++ }
19574 ++
19575 ++ st = __bfq_entity_update_weight_prio(st, entity);
19576 ++ bfq_calc_finish(entity, entity->budget);
19577 ++ bfq_active_insert(st, entity);
19578 ++}
19579 ++
19580 ++/**
19581 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
19582 ++ * @entity: the entity to activate.
19583 ++ *
19584 ++ * Activate @entity and all the entities on the path from it to the root.
19585 ++ */
19586 ++static void bfq_activate_entity(struct bfq_entity *entity)
19587 ++{
19588 ++ struct bfq_sched_data *sd;
19589 ++
19590 ++ for_each_entity(entity) {
19591 ++ __bfq_activate_entity(entity);
19592 ++
19593 ++ sd = entity->sched_data;
19594 ++ if (!bfq_update_next_in_service(sd))
19595 ++ /*
19596 ++ * No need to propagate the activation to the
19597 ++ * upper entities, as they will be updated when
19598 ++ * the in-service entity is rescheduled.
19599 ++ */
19600 ++ break;
19601 ++ }
19602 ++}
19603 ++
19604 ++/**
19605 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
19606 ++ * @entity: the entity to deactivate.
19607 ++ * @requeue: if false, the entity will not be put into the idle tree.
19608 ++ *
19609 ++ * Deactivate an entity, independently from its previous state. If the
19610 ++ * entity was not on a service tree just return, otherwise if it is on
19611 ++ * any scheduler tree, extract it from that tree, and if necessary
19612 ++ * and if the caller specified @requeue, put it on the idle tree.
19613 ++ *
19614 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
19615 ++ * if the entity was under service or if it was the next_in_service for
19616 ++ * its sched_data; return %0 otherwise.
19617 ++ */
19618 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
19619 ++{
19620 ++ struct bfq_sched_data *sd = entity->sched_data;
19621 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
19622 ++ int was_in_service = entity == sd->in_service_entity;
19623 ++ int ret = 0;
19624 ++
19625 ++ if (!entity->on_st)
19626 ++ return 0;
19627 ++
19628 ++ BUG_ON(was_in_service && entity->tree != NULL);
19629 ++
19630 ++ if (was_in_service) {
19631 ++ bfq_calc_finish(entity, entity->service);
19632 ++ sd->in_service_entity = NULL;
19633 ++ } else if (entity->tree == &st->active)
19634 ++ bfq_active_extract(st, entity);
19635 ++ else if (entity->tree == &st->idle)
19636 ++ bfq_idle_extract(st, entity);
19637 ++ else if (entity->tree != NULL)
19638 ++ BUG();
19639 ++
19640 ++ if (was_in_service || sd->next_in_service == entity)
19641 ++ ret = bfq_update_next_in_service(sd);
19642 ++
19643 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
19644 ++ bfq_forget_entity(st, entity);
19645 ++ else
19646 ++ bfq_idle_insert(st, entity);
19647 ++
19648 ++ BUG_ON(sd->in_service_entity == entity);
19649 ++ BUG_ON(sd->next_in_service == entity);
19650 ++
19651 ++ return ret;
19652 ++}
19653 ++
19654 ++/**
19655 ++ * bfq_deactivate_entity - deactivate an entity.
19656 ++ * @entity: the entity to deactivate.
19657 ++ * @requeue: true if the entity can be put on the idle tree
19658 ++ */
19659 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
19660 ++{
19661 ++ struct bfq_sched_data *sd;
19662 ++ struct bfq_entity *parent;
19663 ++
19664 ++ for_each_entity_safe(entity, parent) {
19665 ++ sd = entity->sched_data;
19666 ++
19667 ++ if (!__bfq_deactivate_entity(entity, requeue))
19668 ++ /*
19669 ++ * The parent entity is still backlogged, and
19670 ++ * we don't need to update it as it is still
19671 ++ * under service.
19672 ++ */
19673 ++ break;
19674 ++
19675 ++ if (sd->next_in_service != NULL)
19676 ++ /*
19677 ++ * The parent entity is still backlogged and
19678 ++ * the budgets on the path towards the root
19679 ++ * need to be updated.
19680 ++ */
19681 ++ goto update;
19682 ++
19683 ++ /*
19684 ++ * If we get here, the parent is no longer backlogged and
19685 ++ * we want to propagate the dequeue upwards.
19686 ++ */
19687 ++ requeue = 1;
19688 ++ }
19689 ++
19690 ++ return;
19691 ++
19692 ++update:
19693 ++ entity = parent;
19694 ++ for_each_entity(entity) {
19695 ++ __bfq_activate_entity(entity);
19696 ++
19697 ++ sd = entity->sched_data;
19698 ++ if (!bfq_update_next_in_service(sd))
19699 ++ break;
19700 ++ }
19701 ++}
19702 ++
19703 ++/**
19704 ++ * bfq_update_vtime - update vtime if necessary.
19705 ++ * @st: the service tree to act upon.
19706 ++ *
19707 ++ * If necessary update the service tree vtime to have at least one
19708 ++ * eligible entity, skipping to its start time. Assumes that the
19709 ++ * active tree of the device is not empty.
19710 ++ *
19711 ++ * NOTE: this hierarchical implementation updates vtimes quite often;
19712 ++ * we may end up with reactivated tasks getting timestamps after a
19713 ++ * vtime skip done because we needed a ->first_active entity on some
19714 ++ * intermediate node.
19715 ++ */
19716 ++static void bfq_update_vtime(struct bfq_service_tree *st)
19717 ++{
19718 ++ struct bfq_entity *entry;
19719 ++ struct rb_node *node = st->active.rb_node;
19720 ++
19721 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
19722 ++ if (bfq_gt(entry->min_start, st->vtime)) {
19723 ++ st->vtime = entry->min_start;
19724 ++ bfq_forget_idle(st);
19725 ++ }
19726 ++}
19727 ++
19728 ++/**
19729 ++ * bfq_first_active_entity - find the eligible entity with
19730 ++ * the smallest finish time
19731 ++ * @st: the service tree to select from.
19732 ++ *
19733 ++ * This function searches for the first schedulable entity, starting from
19734 ++ * the root of the tree and going on the left every time on this side there
19735 ++ * is a subtree with at least one eligible (start <= vtime) entity. The path
19736 ++ * on the right is followed only if a) the left subtree contains no eligible
19737 ++ * entities and b) no eligible entity has been found yet.
19738 ++ */
19739 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
19740 ++{
19741 ++ struct bfq_entity *entry, *first = NULL;
19742 ++ struct rb_node *node = st->active.rb_node;
19743 ++
19744 ++ while (node != NULL) {
19745 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
19746 ++left:
19747 ++ if (!bfq_gt(entry->start, st->vtime))
19748 ++ first = entry;
19749 ++
19750 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
19751 ++
19752 ++ if (node->rb_left != NULL) {
19753 ++ entry = rb_entry(node->rb_left,
19754 ++ struct bfq_entity, rb_node);
19755 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
19756 ++ node = node->rb_left;
19757 ++ goto left;
19758 ++ }
19759 ++ }
19760 ++ if (first != NULL)
19761 ++ break;
19762 ++ node = node->rb_right;
19763 ++ }
19764 ++
19765 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
19766 ++ return first;
19767 ++}
19768 ++
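bfq_first_active_entity() descends the augmented tree in O(log N); the underlying B-WF2Q+ rule it implements is simply: among the entities with start <= vtime, pick the one with the smallest finish time. A flat, O(N) sketch of that rule (wraparound handling omitted, illustration only):

#include <stdint.h>
#include <stdio.h>

struct toy_entity {
	uint64_t start;
	uint64_t finish;
};

/* Return the eligible entity (start <= vtime) with the smallest finish. */
static const struct toy_entity *
toy_first_active(const struct toy_entity *e, int n, uint64_t vtime)
{
	const struct toy_entity *first = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (e[i].start > vtime)	/* not eligible yet */
			continue;
		if (first == NULL || e[i].finish < first->finish)
			first = &e[i];
	}
	return first;
}

int main(void)
{
	const struct toy_entity ents[] = {
		{ .start = 0,  .finish = 40 },
		{ .start = 5,  .finish = 25 },
		{ .start = 50, .finish = 60 },	/* ineligible at vtime 10 */
	};
	const struct toy_entity *next = toy_first_active(ents, 3, 10);

	printf("picked entity with finish %llu\n",
	       (unsigned long long)next->finish);	/* prints 25 */
	return 0;
}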
19769 ++/**
19770 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
19771 ++ * @st: the service tree.
19772 ++ *
19773 ++ * Update the virtual time in @st and return the first eligible entity
19774 ++ * it contains.
19775 ++ */
19776 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
19777 ++ bool force)
19778 ++{
19779 ++ struct bfq_entity *entity, *new_next_in_service = NULL;
19780 ++
19781 ++ if (RB_EMPTY_ROOT(&st->active))
19782 ++ return NULL;
19783 ++
19784 ++ bfq_update_vtime(st);
19785 ++ entity = bfq_first_active_entity(st);
19786 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
19787 ++
19788 ++ /*
19789 ++ * If the chosen entity does not match with the sched_data's
19790 ++ * next_in_service and we are forcedly serving the IDLE priority
19791 ++ * class tree, bubble up budget update.
19792 ++ */
19793 ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
19794 ++ new_next_in_service = entity;
19795 ++ for_each_entity(new_next_in_service)
19796 ++ bfq_update_budget(new_next_in_service);
19797 ++ }
19798 ++
19799 ++ return entity;
19800 ++}
19801 ++
19802 ++/**
19803 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
19804 ++ * @sd: the sched_data.
19805 ++ * @extract: if true the returned entity will be also extracted from @sd.
19806 ++ *
19807 ++ * NOTE: since we cache the next_in_service entity at each level of the
19808 ++ * hierarchy, the complexity of the lookup can be decreased with
19809 ++ * absolutely no effort by just returning the cached next_in_service value;
19810 ++ * we prefer to do full lookups to test the consistency of the data
19811 ++ * structures.
19812 ++ */
19813 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
19814 ++ int extract,
19815 ++ struct bfq_data *bfqd)
19816 ++{
19817 ++ struct bfq_service_tree *st = sd->service_tree;
19818 ++ struct bfq_entity *entity;
19819 ++ int i = 0;
19820 ++
19821 ++ BUG_ON(sd->in_service_entity != NULL);
19822 ++
19823 ++ if (bfqd != NULL &&
19824 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
19825 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
19826 ++ true);
19827 ++ if (entity != NULL) {
19828 ++ i = BFQ_IOPRIO_CLASSES - 1;
19829 ++ bfqd->bfq_class_idle_last_service = jiffies;
19830 ++ sd->next_in_service = entity;
19831 ++ }
19832 ++ }
19833 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
19834 ++ entity = __bfq_lookup_next_entity(st + i, false);
19835 ++ if (entity != NULL) {
19836 ++ if (extract) {
19837 ++ bfq_check_next_in_service(sd, entity);
19838 ++ bfq_active_extract(st + i, entity);
19839 ++ sd->in_service_entity = entity;
19840 ++ sd->next_in_service = NULL;
19841 ++ }
19842 ++ break;
19843 ++ }
19844 ++ }
19845 ++
19846 ++ return entity;
19847 ++}
19848 ++
19849 ++/*
19850 ++ * Get next queue for service.
19851 ++ */
19852 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
19853 ++{
19854 ++ struct bfq_entity *entity = NULL;
19855 ++ struct bfq_sched_data *sd;
19856 ++ struct bfq_queue *bfqq;
19857 ++
19858 ++ BUG_ON(bfqd->in_service_queue != NULL);
19859 ++
19860 ++ if (bfqd->busy_queues == 0)
19861 ++ return NULL;
19862 ++
19863 ++ sd = &bfqd->root_group->sched_data;
19864 ++ for (; sd != NULL; sd = entity->my_sched_data) {
19865 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
19866 ++ BUG_ON(entity == NULL);
19867 ++ entity->service = 0;
19868 ++ }
19869 ++
19870 ++ bfqq = bfq_entity_to_bfqq(entity);
19871 ++ BUG_ON(bfqq == NULL);
19872 ++
19873 ++ return bfqq;
19874 ++}
19875 ++
19876 ++/*
19877 ++ * Forced extraction of the given queue.
19878 ++ */
19879 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
19880 ++ struct bfq_queue *bfqq)
19881 ++{
19882 ++ struct bfq_entity *entity;
19883 ++ struct bfq_sched_data *sd;
19884 ++
19885 ++ BUG_ON(bfqd->in_service_queue != NULL);
19886 ++
19887 ++ entity = &bfqq->entity;
19888 ++ /*
19889 ++ * Bubble up extraction/update from the leaf to the root.
19890 ++ */
19891 ++ for_each_entity(entity) {
19892 ++ sd = entity->sched_data;
19893 ++ bfq_update_budget(entity);
19894 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
19895 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
19896 ++ sd->in_service_entity = entity;
19897 ++ sd->next_in_service = NULL;
19898 ++ entity->service = 0;
19899 ++ }
19900 ++
19901 ++ return;
19902 ++}
19903 ++
19904 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
19905 ++{
19906 ++ if (bfqd->in_service_bic != NULL) {
19907 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
19908 ++ bfqd->in_service_bic = NULL;
19909 ++ }
19910 ++
19911 ++ bfqd->in_service_queue = NULL;
19912 ++ del_timer(&bfqd->idle_slice_timer);
19913 ++}
19914 ++
19915 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
19916 ++ int requeue)
19917 ++{
19918 ++ struct bfq_entity *entity = &bfqq->entity;
19919 ++
19920 ++ if (bfqq == bfqd->in_service_queue)
19921 ++ __bfq_bfqd_reset_in_service(bfqd);
19922 ++
19923 ++ bfq_deactivate_entity(entity, requeue);
19924 ++}
19925 ++
19926 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
19927 ++{
19928 ++ struct bfq_entity *entity = &bfqq->entity;
19929 ++
19930 ++ bfq_activate_entity(entity);
19931 ++}
19932 ++
19933 ++/*
19934 ++ * Called when the bfqq no longer has requests pending, remove it from
19935 ++ * the service tree.
19936 ++ */
19937 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
19938 ++ int requeue)
19939 ++{
19940 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
19941 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
19942 ++
19943 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
19944 ++
19945 ++ bfq_clear_bfqq_busy(bfqq);
19946 ++
19947 ++ BUG_ON(bfqd->busy_queues == 0);
19948 ++ bfqd->busy_queues--;
19949 ++ if (bfqq->raising_coeff > 1)
19950 ++ bfqd->raised_busy_queues--;
19951 ++
19952 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
19953 ++}
19954 ++
19955 ++/*
19956 ++ * Called when an inactive queue receives a new request.
19957 ++ */
19958 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
19959 ++{
19960 ++ BUG_ON(bfq_bfqq_busy(bfqq));
19961 ++ BUG_ON(bfqq == bfqd->in_service_queue);
19962 ++
19963 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
19964 ++
19965 ++ bfq_activate_bfqq(bfqd, bfqq);
19966 ++
19967 ++ bfq_mark_bfqq_busy(bfqq);
19968 ++ bfqd->busy_queues++;
19969 ++ if (bfqq->raising_coeff > 1)
19970 ++ bfqd->raised_busy_queues++;
19971 ++}
19972 +diff --git a/block/bfq.h b/block/bfq.h
19973 +new file mode 100644
19974 +index 0000000..f9b5881
19975 +--- /dev/null
19976 ++++ b/block/bfq.h
19977 +@@ -0,0 +1,614 @@
19978 ++/*
19979 ++ * BFQ-v7r1 for 3.13.0: data structures and common functions prototypes.
19980 ++ *
19981 ++ * Based on ideas and code from CFQ:
19982 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
19983 ++ *
19984 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
19985 ++ * Paolo Valente <paolo.valente@×××××××.it>
19986 ++ *
19987 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
19988 ++ */
19989 ++
19990 ++#ifndef _BFQ_H
19991 ++#define _BFQ_H
19992 ++
19993 ++#include <linux/blktrace_api.h>
19994 ++#include <linux/hrtimer.h>
19995 ++#include <linux/ioprio.h>
19996 ++#include <linux/rbtree.h>
19997 ++
19998 ++#define BFQ_IOPRIO_CLASSES 3
19999 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
20000 ++
20001 ++#define BFQ_MIN_WEIGHT 1
20002 ++#define BFQ_MAX_WEIGHT 1000
20003 ++
20004 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
20005 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
20006 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
20007 ++
20008 ++struct bfq_entity;
20009 ++
20010 ++/**
20011 ++ * struct bfq_service_tree - per ioprio_class service tree.
20012 ++ * @active: tree for active entities (i.e., those backlogged).
20013 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
20014 ++ * @first_idle: idle entity with minimum F_i.
20015 ++ * @last_idle: idle entity with maximum F_i.
20016 ++ * @vtime: scheduler virtual time.
20017 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
20018 ++ *
20019 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
20020 ++ * ioprio_class has its own independent scheduler, and so its own
20021 ++ * bfq_service_tree. All the fields are protected by the queue lock
20022 ++ * of the containing bfqd.
20023 ++ */
20024 ++struct bfq_service_tree {
20025 ++ struct rb_root active;
20026 ++ struct rb_root idle;
20027 ++
20028 ++ struct bfq_entity *first_idle;
20029 ++ struct bfq_entity *last_idle;
20030 ++
20031 ++ u64 vtime;
20032 ++ unsigned long wsum;
20033 ++};
20034 ++
20035 ++/**
20036 ++ * struct bfq_sched_data - multi-class scheduler.
20037 ++ * @in_service_entity: entity under service.
20038 ++ * @next_in_service: head-of-the-line entity in the scheduler.
20039 ++ * @service_tree: array of service trees, one per ioprio_class.
20040 ++ *
20041 ++ * bfq_sched_data is the basic scheduler queue. It supports three
20042 ++ * ioprio_classes, and can be used either as a toplevel queue or as
20043 ++ * an intermediate queue on a hierarchical setup.
20044 ++ * @next_in_service points to the active entity of the sched_data
20045 ++ * service trees that will be scheduled next.
20046 ++ *
20047 ++ * The supported ioprio_classes are the same as in CFQ, in descending
20048 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
20049 ++ * Requests from higher priority queues are served before all the
20050 ++ * requests from lower priority queues; among requests of the same
20051 ++ * queue requests are served according to B-WF2Q+.
20052 ++ * All the fields are protected by the queue lock of the containing bfqd.
20053 ++ */
20054 ++struct bfq_sched_data {
20055 ++ struct bfq_entity *in_service_entity;
20056 ++ struct bfq_entity *next_in_service;
20057 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
20058 ++};
20059 ++
20060 ++/**
20061 ++ * struct bfq_entity - schedulable entity.
20062 ++ * @rb_node: service_tree member.
20063 ++ * @on_st: flag, true if the entity is on a tree (either the active or
20064 ++ * the idle one of its service_tree).
20065 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
20066 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
20067 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
20068 ++ * @min_start: minimum start time of the (active) subtree rooted at
20069 ++ * this entity; used for O(log N) lookups into active trees.
20070 ++ * @service: service received during the last round of service.
20071 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
20072 ++ * @weight: weight of the queue
20073 ++ * @parent: parent entity, for hierarchical scheduling.
20074 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
20075 ++ * associated scheduler queue, %NULL on leaf nodes.
20076 ++ * @sched_data: the scheduler queue this entity belongs to.
20077 ++ * @ioprio: the ioprio in use.
20078 ++ * @new_weight: when a weight change is requested, the new weight value.
20079 ++ * @orig_weight: original weight, used to implement weight boosting
20080 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
20081 ++ * @ioprio_class: the ioprio_class in use.
20082 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
20083 ++ * ioprio_class value.
20084 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
20085 ++ * ioprio_class change.
20086 ++ *
20087 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
20088 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
20089 ++ * entity belongs to the sched_data of the parent group in the cgroup
20090 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
20091 ++ * in @my_sched_data.
20092 ++ *
20093 ++ * Each entity stores independently its priority values; this would
20094 ++ * allow different weights on different devices, but this
20095 ++ * functionality is not exported to userspace for now. Priorities and
20096 ++ * weights are updated lazily, first storing the new values into the
20097 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
20098 ++ * there is a transition in the entity state that allows the priority
20099 ++ * update to take place the effective and the requested priority
20100 ++ * values are synchronized.
20101 ++ *
20102 ++ * Unless cgroups are used, the weight value is calculated from the
20103 ++ * ioprio to export the same interface as CFQ. When dealing with
20104 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
20105 ++ * time consuming their budget and have true sequential behavior, and
20106 ++ * when there are no external factors breaking anticipation) the
20107 ++ * relative weights at each level of the cgroups hierarchy should be
20108 ++ * guaranteed. All the fields are protected by the queue lock of the
20109 ++ * containing bfqd.
20110 ++ */
20111 ++struct bfq_entity {
20112 ++ struct rb_node rb_node;
20113 ++
20114 ++ int on_st;
20115 ++
20116 ++ u64 finish;
20117 ++ u64 start;
20118 ++
20119 ++ struct rb_root *tree;
20120 ++
20121 ++ u64 min_start;
20122 ++
20123 ++ unsigned long service, budget;
20124 ++ unsigned short weight, new_weight;
20125 ++ unsigned short orig_weight;
20126 ++
20127 ++ struct bfq_entity *parent;
20128 ++
20129 ++ struct bfq_sched_data *my_sched_data;
20130 ++ struct bfq_sched_data *sched_data;
20131 ++
20132 ++ unsigned short ioprio, new_ioprio;
20133 ++ unsigned short ioprio_class, new_ioprio_class;
20134 ++
20135 ++ int ioprio_changed;
20136 ++};
20137 ++
20138 ++struct bfq_group;
20139 ++
20140 ++/**
20141 ++ * struct bfq_queue - leaf schedulable entity.
20142 ++ * @ref: reference counter.
20143 ++ * @bfqd: parent bfq_data.
20144 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
20145 ++ * one or more other queues.
20146 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
20147 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
20148 ++ * @sort_list: sorted list of pending requests.
20149 ++ * @next_rq: if fifo isn't expired, next request to serve.
20150 ++ * @queued: nr of requests queued in @sort_list.
20151 ++ * @allocated: currently allocated requests.
20152 ++ * @meta_pending: pending metadata requests.
20153 ++ * @fifo: fifo list of requests in sort_list.
20154 ++ * @entity: entity representing this queue in the scheduler.
20155 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
20156 ++ * @budget_timeout: budget expiration (in jiffies).
20157 ++ * @dispatched: number of requests on the dispatch list or inside driver.
20158 ++ * @org_ioprio: saved ioprio during boosted periods.
20159 ++ * @flags: status flags.
20160 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
20161 ++ * @seek_samples: number of seeks sampled
20162 ++ * @seek_total: sum of the distances of the seeks sampled
20163 ++ * @seek_mean: mean seek distance
20164 ++ * @last_request_pos: position of the last request enqueued
20165 ++ * @pid: pid of the process owning the queue, used for logging purposes.
20166 ++ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
20167 ++ * @raising_cur_max_time: current max raising time for this queue
20168 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
20169 ++ * idle to backlogged
20170 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
20171 ++ * since the last transition from idle to backlogged
20172 ++ *
20173 ++ * A bfq_queue is a leaf request queue; it can be associated with one
20174 ++ * io_context or more (if it is an async one). @cgroup holds a reference to the
20175 ++ * cgroup, to be sure that it does not disappear while a bfqq still
20176 ++ * references it (mostly to avoid races between request issuing and task
20177 ++ * migration followed by cgroup destruction).
20178 ++ * All the fields are protected by the queue lock of the containing bfqd.
20179 ++ */
20180 ++struct bfq_queue {
20181 ++ atomic_t ref;
20182 ++ struct bfq_data *bfqd;
20183 ++
20184 ++ /* fields for cooperating queues handling */
20185 ++ struct bfq_queue *new_bfqq;
20186 ++ struct rb_node pos_node;
20187 ++ struct rb_root *pos_root;
20188 ++
20189 ++ struct rb_root sort_list;
20190 ++ struct request *next_rq;
20191 ++ int queued[2];
20192 ++ int allocated[2];
20193 ++ int meta_pending;
20194 ++ struct list_head fifo;
20195 ++
20196 ++ struct bfq_entity entity;
20197 ++
20198 ++ unsigned long max_budget;
20199 ++ unsigned long budget_timeout;
20200 ++
20201 ++ int dispatched;
20202 ++
20203 ++ unsigned short org_ioprio;
20204 ++
20205 ++ unsigned int flags;
20206 ++
20207 ++ struct list_head bfqq_list;
20208 ++
20209 ++ unsigned int seek_samples;
20210 ++ u64 seek_total;
20211 ++ sector_t seek_mean;
20212 ++ sector_t last_request_pos;
20213 ++
20214 ++ pid_t pid;
20215 ++
20216 ++ /* weight-raising fields */
20217 ++ unsigned long raising_cur_max_time;
20218 ++ unsigned long soft_rt_next_start;
20219 ++ unsigned long last_rais_start_finish;
20220 ++ unsigned int raising_coeff;
20221 ++ unsigned long last_idle_bklogged;
20222 ++ unsigned long service_from_backlogged;
20223 ++};
20224 ++
20225 ++/**
20226 ++ * struct bfq_ttime - per process thinktime stats.
20227 ++ * @ttime_total: total process thinktime
20228 ++ * @ttime_samples: number of thinktime samples
20229 ++ * @ttime_mean: average process thinktime
20230 ++ */
20231 ++struct bfq_ttime {
20232 ++ unsigned long last_end_request;
20233 ++
20234 ++ unsigned long ttime_total;
20235 ++ unsigned long ttime_samples;
20236 ++ unsigned long ttime_mean;
20237 ++};
20238 ++
20239 ++/**
20240 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
20241 ++ * @icq: associated io_cq structure
20242 ++ * @bfqq: array of two process queues, the sync and the async
20243 ++ * @ttime: associated @bfq_ttime struct
20244 ++ */
20245 ++struct bfq_io_cq {
20246 ++ struct io_cq icq; /* must be the first member */
20247 ++ struct bfq_queue *bfqq[2];
20248 ++ struct bfq_ttime ttime;
20249 ++ int ioprio;
20250 ++};
20251 ++
20252 ++/**
20253 ++ * struct bfq_data - per device data structure.
20254 ++ * @queue: request queue for the managed device.
20255 ++ * @root_group: root bfq_group for the device.
20256 ++ * @rq_pos_tree: rbtree sorted by next_request position,
20257 ++ * used when determining if two or more queues
20258 ++ * have interleaving requests (see bfq_close_cooperator).
20259 ++ * @busy_queues: number of bfq_queues containing requests (including the
20260 ++ * queue under service, even if it is idling).
20261 ++ * @raised_busy_queues: number of weight-raised busy bfq_queues.
20262 ++ * @queued: number of queued requests.
20263 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
20264 ++ * @sync_flight: number of sync requests in the driver.
20265 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
20266 ++ * completed requests.
20267 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
20268 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
20269 ++ * @budgets_assigned: number of budgets assigned.
20270 ++ * @idle_slice_timer: timer set when idling for the next sequential request
20271 ++ * from the queue under service.
20272 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
20273 ++ * @in_service_queue: bfq_queue under service.
20274 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
20275 ++ * @last_position: on-disk position of the last served request.
20276 ++ * @last_budget_start: beginning of the last budget.
20277 ++ * @last_idling_start: beginning of the last idle slice.
20278 ++ * @peak_rate: peak transfer rate observed for a budget.
20279 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
20280 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
20281 ++ * @group_list: list of all the bfq_groups active on the device.
20282 ++ * @active_list: list of all the bfq_queues active on the device.
20283 ++ * @idle_list: list of all the bfq_queues idle on the device.
20284 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
20285 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
20286 ++ * requests are served in fifo order.
20287 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
20288 ++ * @bfq_back_max: maximum allowed backward seek.
20289 ++ * @bfq_slice_idle: maximum idling time.
20290 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
20291 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
20292 ++ * async queues.
20293 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
20294 ++ * prevent seeky queues from imposing long latencies on well-
20295 ++ * behaved ones (this also implies that seeky queues cannot
20296 ++ * receive guarantees in the service domain; after a timeout
20297 ++ * they are charged for the whole allocated budget, to try
20298 ++ * to preserve a behavior reasonably fair among them, but
20299 ++ * without service-domain guarantees).
20300 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
20301 ++ * queue is multiplied
20302 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
20303 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
20304 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
20305 ++ * may be reactivated for a queue (in jiffies)
20306 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
20307 ++ * after which weight-raising may be
20308 ++ * reactivated for an already busy queue
20309 ++ * (in jiffies)
20310 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
20311 ++ * sectors per second
20312 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
20313 ++ * duration of the weight raising automatically
20314 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
20315 ++ *
20316 ++ * All the fields are protected by the @queue lock.
20317 ++ */
20318 ++struct bfq_data {
20319 ++ struct request_queue *queue;
20320 ++
20321 ++ struct bfq_group *root_group;
20322 ++
20323 ++ struct rb_root rq_pos_tree;
20324 ++
20325 ++ int busy_queues;
20326 ++ int raised_busy_queues;
20327 ++ int queued;
20328 ++ int rq_in_driver;
20329 ++ int sync_flight;
20330 ++
20331 ++ int max_rq_in_driver;
20332 ++ int hw_tag_samples;
20333 ++ int hw_tag;
20334 ++
20335 ++ int budgets_assigned;
20336 ++
20337 ++ struct timer_list idle_slice_timer;
20338 ++ struct work_struct unplug_work;
20339 ++
20340 ++ struct bfq_queue *in_service_queue;
20341 ++ struct bfq_io_cq *in_service_bic;
20342 ++
20343 ++ sector_t last_position;
20344 ++
20345 ++ ktime_t last_budget_start;
20346 ++ ktime_t last_idling_start;
20347 ++ int peak_rate_samples;
20348 ++ u64 peak_rate;
20349 ++ unsigned long bfq_max_budget;
20350 ++
20351 ++ struct hlist_head group_list;
20352 ++ struct list_head active_list;
20353 ++ struct list_head idle_list;
20354 ++
20355 ++ unsigned int bfq_quantum;
20356 ++ unsigned int bfq_fifo_expire[2];
20357 ++ unsigned int bfq_back_penalty;
20358 ++ unsigned int bfq_back_max;
20359 ++ unsigned int bfq_slice_idle;
20360 ++ u64 bfq_class_idle_last_service;
20361 ++
20362 ++ unsigned int bfq_user_max_budget;
20363 ++ unsigned int bfq_max_budget_async_rq;
20364 ++ unsigned int bfq_timeout[2];
20365 ++
20366 ++ bool low_latency;
20367 ++
20368 ++ /* parameters of the low_latency heuristics */
20369 ++ unsigned int bfq_raising_coeff;
20370 ++ unsigned int bfq_raising_max_time;
20371 ++ unsigned int bfq_raising_rt_max_time;
20372 ++ unsigned int bfq_raising_min_idle_time;
20373 ++ unsigned long bfq_raising_min_inter_arr_async;
20374 ++ unsigned int bfq_raising_max_softrt_rate;
20375 ++ u64 RT_prod;
20376 ++
20377 ++ struct bfq_queue oom_bfqq;
20378 ++};
20379 ++
20380 ++enum bfqq_state_flags {
20381 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
20382 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
20383 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
20384 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
20385 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
20386 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
20387 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
20388 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
20389 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
20390 ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
20391 ++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
20392 ++};
20393 ++
20394 ++#define BFQ_BFQQ_FNS(name) \
20395 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
20396 ++{ \
20397 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
20398 ++} \
20399 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
20400 ++{ \
20401 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
20402 ++} \
20403 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
20404 ++{ \
20405 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
20406 ++}
20407 ++
20408 ++BFQ_BFQQ_FNS(busy);
20409 ++BFQ_BFQQ_FNS(wait_request);
20410 ++BFQ_BFQQ_FNS(must_alloc);
20411 ++BFQ_BFQQ_FNS(fifo_expire);
20412 ++BFQ_BFQQ_FNS(idle_window);
20413 ++BFQ_BFQQ_FNS(prio_changed);
20414 ++BFQ_BFQQ_FNS(sync);
20415 ++BFQ_BFQQ_FNS(budget_new);
20416 ++BFQ_BFQQ_FNS(coop);
20417 ++BFQ_BFQQ_FNS(split_coop);
20418 ++BFQ_BFQQ_FNS(softrt_update);
20419 ++#undef BFQ_BFQQ_FNS
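As a quick reference, BFQ_BFQQ_FNS(busy) above should expand to roughly the three inline accessors below; the remaining flags get the same trio under their own names:

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}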
20420 ++
20421 ++/* Logging facilities. */
20422 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
20423 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
20424 ++
20425 ++#define bfq_log(bfqd, fmt, args...) \
20426 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
20427 ++
20428 ++/* Expiration reasons. */
20429 ++enum bfqq_expiration {
20430 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
20431 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
20432 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
20433 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
20434 ++};
20435 ++
20436 ++#ifdef CONFIG_CGROUP_BFQIO
20437 ++/**
20438 ++ * struct bfq_group - per (device, cgroup) data structure.
20439 ++ * @entity: schedulable entity to insert into the parent group sched_data.
20440 ++ * @sched_data: own sched_data, to contain child entities (they may be
20441 ++ * both bfq_queues and bfq_groups).
20442 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
20443 ++ * list of the containing cgroup's bfqio_cgroup.
20444 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
20445 ++ * of the groups active on the same device; used for cleanup.
20446 ++ * @bfqd: the bfq_data for the device this group acts upon.
20447 ++ * @async_bfqq: array of async queues for all the tasks belonging to
20448 ++ * the group, one queue per ioprio value per ioprio_class,
20449 ++ * except for the idle class that has only one queue.
20450 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
20451 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
20452 ++ * to avoid too many special cases during group creation/migration.
20453 ++ *
20454 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
20455 ++ * there is a set of bfq_groups, each one collecting the lower-level
20456 ++ * entities belonging to the group that are acting on the same device.
20457 ++ *
20458 ++ * Locking works as follows:
20459 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
20460 ++ * via RCU from its readers.
20461 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
20462 ++ * from the readers.
20463 ++ * o All the other fields are protected by the @bfqd queue lock.
20464 ++ */
20465 ++struct bfq_group {
20466 ++ struct bfq_entity entity;
20467 ++ struct bfq_sched_data sched_data;
20468 ++
20469 ++ struct hlist_node group_node;
20470 ++ struct hlist_node bfqd_node;
20471 ++
20472 ++ void *bfqd;
20473 ++
20474 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
20475 ++ struct bfq_queue *async_idle_bfqq;
20476 ++
20477 ++ struct bfq_entity *my_entity;
20478 ++};
20479 ++
20480 ++/**
20481 ++ * struct bfqio_cgroup - bfq cgroup data structure.
20482 ++ * @css: subsystem state for bfq in the containing cgroup.
20483 ++ * @online: flag marked when the subsystem is inserted.
20484 ++ * @weight: cgroup weight.
20485 ++ * @ioprio: cgroup ioprio.
20486 ++ * @ioprio_class: cgroup ioprio_class.
20487 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
20488 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
20489 ++ *
20490 ++ * @group_data is accessed using RCU, with @lock protecting the updates;
20491 ++ * @ioprio and @ioprio_class are protected by @lock.
20492 ++ */
20493 ++struct bfqio_cgroup {
20494 ++ struct cgroup_subsys_state css;
20495 ++ bool online;
20496 ++
20497 ++ unsigned short weight, ioprio, ioprio_class;
20498 ++
20499 ++ spinlock_t lock;
20500 ++ struct hlist_head group_data;
20501 ++};
20502 ++#else
20503 ++struct bfq_group {
20504 ++ struct bfq_sched_data sched_data;
20505 ++
20506 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
20507 ++ struct bfq_queue *async_idle_bfqq;
20508 ++};
20509 ++#endif
20510 ++
20511 ++static inline struct bfq_service_tree *
20512 ++bfq_entity_service_tree(struct bfq_entity *entity)
20513 ++{
20514 ++ struct bfq_sched_data *sched_data = entity->sched_data;
20515 ++ unsigned int idx = entity->ioprio_class - 1;
20516 ++
20517 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
20518 ++ BUG_ON(sched_data == NULL);
20519 ++
20520 ++ return sched_data->service_tree + idx;
20521 ++}
20522 ++
20523 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
20524 ++ int is_sync)
20525 ++{
20526 ++ return bic->bfqq[!!is_sync];
20527 ++}
20528 ++
20529 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
20530 ++ struct bfq_queue *bfqq, int is_sync)
20531 ++{
20532 ++ bic->bfqq[!!is_sync] = bfqq;
20533 ++}
20534 ++
20535 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
20536 ++{
20537 ++ return bic->icq.q->elevator->elevator_data;
20538 ++}
20539 ++
20540 ++/**
20541 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
20542 ++ * @ptr: a pointer to a bfqd.
20543 ++ * @flags: storage for the flags to be saved.
20544 ++ *
20545 ++ * This function allows bfqg->bfqd to be protected by the
20546 ++ * queue lock of the bfqd it references; the pointer is dereferenced
20547 ++ * under RCU, so the storage for bfqd is assured to be safe as long
20548 ++ * as the RCU read side critical section does not end. After the
20549 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
20550 ++ * sure that no other writer accessed it. If we raced with a writer,
20551 ++ * the function returns NULL, with the queue unlocked, otherwise it
20552 ++ * returns the dereferenced pointer, with the queue locked.
20553 ++ */
20554 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
20555 ++ unsigned long *flags)
20556 ++{
20557 ++ struct bfq_data *bfqd;
20558 ++
20559 ++ rcu_read_lock();
20560 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
20561 ++
20562 ++ if (bfqd != NULL) {
20563 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
20564 ++ if (*ptr == bfqd)
20565 ++ goto out;
20566 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
20567 ++ }
20568 ++
20569 ++ bfqd = NULL;
20570 ++out:
20571 ++ rcu_read_unlock();
20572 ++ return bfqd;
20573 ++}
20574 ++
20575 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
20576 ++ unsigned long *flags)
20577 ++{
20578 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
20579 ++}
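To make the locking contract above concrete, a hypothetical caller (not part of this patch; the function name and the low_latency tweak are purely illustrative) might use the pair like this:

static void example_clear_low_latency(struct bfq_group *bfqg)
{
	unsigned long flags;
	struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);

	if (bfqd == NULL)
		return;	/* raced with a writer: nothing is locked */

	/* queue_lock is held here, so bfqd cannot go away */
	bfqd->low_latency = false;
	bfq_put_bfqd_unlock(bfqd, &flags);
}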
20580 ++
20581 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
20582 ++static void bfq_put_queue(struct bfq_queue *bfqq);
20583 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
20584 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
20585 ++ struct bfq_group *bfqg, int is_sync,
20586 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
20587 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
20588 ++ struct bfq_group *bfqg);
20589 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
20590 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
20591 ++#endif
20592 +--
20593 +1.8.5.2
20594 +
20595
20596 Added: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
20597 ===================================================================
20598 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch (rev 0)
20599 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666)
20600 @@ -0,0 +1,1034 @@
20601 +From 39b1dba58b2562ba0d93a33a4f9af662d3c790c5 Mon Sep 17 00:00:00 2001
20602 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
20603 +Date: Thu, 23 Jan 2014 16:54:44 +0100
20604 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r1 for
20605 + 3.13.0
20606 +
20607 +A set of processes may happen to perform interleaved reads, i.e., requests
20608 +whose union would give rise to a sequential read pattern. There are two
20609 +typical cases: in the first case, processes read fixed-size chunks of
20610 +data at a fixed distance from each other, while in the second case processes
20611 +may read variable-size chunks at variable distances. The latter case occurs
20612 +for example with KVM, which splits the I/O generated by the guest into
20613 +multiple chunks, and lets these chunks be served by a pool of cooperating
20614 +processes, iteratively assigning the next chunk of I/O to the first
20615 +available process. CFQ uses actual queue merging for the first type of
20616 +processes, whereas it uses preemption to get a sequential read pattern out
20617 +of the read requests performed by the second type of processes. In the end
20618 +it uses two different mechanisms to achieve the same goal: boosting the
20619 +throughput with interleaved I/O.
20620 +
20621 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
20622 +sequential read pattern with both types of processes. The main idea is
20623 +checking newly arrived requests against the next request of the active queue
20624 +both in case of actual request insert and in case of request merge. By doing
20625 +so, both the types of processes can be handled by just merging their queues.
20626 +EQM is then simpler and more compact than the pair of mechanisms used in
20627 +CFQ.
20628 +
20629 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
20630 +properly restoring the weight-raising state of a queue when it gets back to
20631 +a non-merged state.
20632 +
20633 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
20634 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
20635 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
20636 +---
20637 + block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
20638 + block/bfq-sched.c | 28 ---
20639 + block/bfq.h | 16 ++
20640 + 3 files changed, 474 insertions(+), 227 deletions(-)
20641 +
20642 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
20643 +index eb760de..06ee844 100644
20644 +--- a/block/bfq-iosched.c
20645 ++++ b/block/bfq-iosched.c
20646 +@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
20647 + return dur;
20648 + }
20649 +
20650 ++static inline void
20651 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
20652 ++{
20653 ++ if (bic->saved_idle_window)
20654 ++ bfq_mark_bfqq_idle_window(bfqq);
20655 ++ else
20656 ++ bfq_clear_bfqq_idle_window(bfqq);
20657 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
20658 ++ /*
20659 ++ * Start a weight raising period with the duration given by
20660 ++ * the raising_time_left snapshot.
20661 ++ */
20662 ++ if (bfq_bfqq_busy(bfqq))
20663 ++ bfqq->bfqd->raised_busy_queues++;
20664 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
20665 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
20666 ++ bfqq->last_rais_start_finish = jiffies;
20667 ++ bfqq->entity.ioprio_changed = 1;
20668 ++ }
20669 ++ /*
20670 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
20671 ++ * getting confused about the queue's need of a weight-raising
20672 ++ * period.
20673 ++ */
20674 ++ bic->raising_time_left = 0;
20675 ++}
20676 ++
20677 ++/*
20678 ++ * Must be called with the queue_lock held.
20679 ++ */
20680 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
20681 ++{
20682 ++ int process_refs, io_refs;
20683 ++
20684 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
20685 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
20686 ++ BUG_ON(process_refs < 0);
20687 ++ return process_refs;
20688 ++}
20689 ++
20690 + static void bfq_add_rq_rb(struct request *rq)
20691 + {
20692 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
20693 +@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
20694 + if (!bfqd->low_latency)
20695 + goto add_bfqq_busy;
20696 +
20697 ++ if (bfq_bfqq_just_split(bfqq))
20698 ++ goto set_ioprio_changed;
20699 ++
20700 + /*
20701 +- * If the queue is not being boosted and has been idle
20702 +- * for enough time, start a weight-raising period
20703 ++ * If the queue:
20704 ++ * - is not being boosted,
20705 ++ * - has been idle for enough time,
20706 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
20707 ++ * shared "by its nature" or it is not shared and its
20708 ++ * requests have not been redirected to a shared queue)
20709 ++ * start a weight-raising period.
20710 + */
20711 +- if (old_raising_coeff == 1 &&
20712 +- (idle_for_long_time || soft_rt)) {
20713 ++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
20714 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
20715 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
20716 + if (idle_for_long_time)
20717 + bfqq->raising_cur_max_time =
20718 +@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq)
20719 + bfqd->bfq_raising_rt_max_time;
20720 + }
20721 + }
20722 ++set_ioprio_changed:
20723 + if (old_raising_coeff != bfqq->raising_coeff)
20724 + entity->ioprio_changed = 1;
20725 + add_bfqq_busy:
20726 +@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
20727 + spin_unlock_irq(bfqd->queue->queue_lock);
20728 + }
20729 +
20730 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
20731 +- struct bio *bio)
20732 +-{
20733 +- struct bfq_data *bfqd = q->elevator->elevator_data;
20734 +- struct bfq_io_cq *bic;
20735 +- struct bfq_queue *bfqq;
20736 +-
20737 +- /*
20738 +- * Disallow merge of a sync bio into an async request.
20739 +- */
20740 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
20741 +- return 0;
20742 +-
20743 +- /*
20744 +- * Lookup the bfqq that this bio will be queued with. Allow
20745 +- * merge only if rq is queued there.
20746 +- * Queue lock is held here.
20747 +- */
20748 +- bic = bfq_bic_lookup(bfqd, current->io_context);
20749 +- if (bic == NULL)
20750 +- return 0;
20751 +-
20752 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
20753 +- return bfqq == RQ_BFQQ(rq);
20754 +-}
20755 +-
20756 +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
20757 +- struct bfq_queue *bfqq)
20758 +-{
20759 +- if (bfqq != NULL) {
20760 +- bfq_mark_bfqq_must_alloc(bfqq);
20761 +- bfq_mark_bfqq_budget_new(bfqq);
20762 +- bfq_clear_bfqq_fifo_expire(bfqq);
20763 +-
20764 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
20765 +-
20766 +- bfq_log_bfqq(bfqd, bfqq,
20767 +- "set_in_service_queue, cur-budget = %lu",
20768 +- bfqq->entity.budget);
20769 +- }
20770 +-
20771 +- bfqd->in_service_queue = bfqq;
20772 +-}
20773 +-
20774 +-/*
20775 +- * Get and set a new queue for service.
20776 +- */
20777 +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
20778 +- struct bfq_queue *bfqq)
20779 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
20780 + {
20781 +- if (!bfqq)
20782 +- bfqq = bfq_get_next_queue(bfqd);
20783 ++ if (request)
20784 ++ return blk_rq_pos(io_struct);
20785 + else
20786 +- bfq_get_next_queue_forced(bfqd, bfqq);
20787 +-
20788 +- __bfq_set_in_service_queue(bfqd, bfqq);
20789 +- return bfqq;
20790 ++ return ((struct bio *)io_struct)->bi_sector;
20791 + }
20792 +
20793 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
20794 +- struct request *rq)
20795 ++static inline sector_t bfq_dist_from(sector_t pos1,
20796 ++ sector_t pos2)
20797 + {
20798 +- if (blk_rq_pos(rq) >= bfqd->last_position)
20799 +- return blk_rq_pos(rq) - bfqd->last_position;
20800 ++ if (pos1 >= pos2)
20801 ++ return pos1 - pos2;
20802 + else
20803 +- return bfqd->last_position - blk_rq_pos(rq);
20804 ++ return pos2 - pos1;
20805 + }
20806 +
20807 +-/*
20808 +- * Return true if bfqq has no request pending and rq is close enough to
20809 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
20810 +- * bfqq->next_rq
20811 +- */
20812 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
20813 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
20814 ++ sector_t sector)
20815 + {
20816 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
20817 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
20818 ++ BFQQ_SEEK_THR;
20819 + }
20820 +
20821 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
20822 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
20823 + {
20824 + struct rb_root *root = &bfqd->rq_pos_tree;
20825 + struct rb_node *parent, *node;
20826 + struct bfq_queue *__bfqq;
20827 +- sector_t sector = bfqd->last_position;
20828 +
20829 + if (RB_EMPTY_ROOT(root))
20830 + return NULL;
20831 +@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
20832 + * position).
20833 + */
20834 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
20835 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
20836 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
20837 + return __bfqq;
20838 +
20839 + if (blk_rq_pos(__bfqq->next_rq) < sector)
20840 +@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
20841 + return NULL;
20842 +
20843 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
20844 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
20845 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
20846 + return __bfqq;
20847 +
20848 + return NULL;
20849 +@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
20850 + /*
20851 + * bfqd - obvious
20852 + * cur_bfqq - passed in so that we don't decide that the current queue
20853 +- * is closely cooperating with itself.
20854 +- *
20855 +- * We are assuming that cur_bfqq has dispatched at least one request,
20856 +- * and that bfqd->last_position reflects a position on the disk associated
20857 +- * with the I/O issued by cur_bfqq.
20858 ++ * is closely cooperating with itself
20859 ++ * sector - used as a reference point to search for a close queue
20860 + */
20861 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
20862 +- struct bfq_queue *cur_bfqq)
20863 ++ struct bfq_queue *cur_bfqq,
20864 ++ sector_t sector)
20865 + {
20866 + struct bfq_queue *bfqq;
20867 +
20868 +@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
20869 + * working closely on the same area of the disk. In that case,
20870 + * we can group them together and don't waste time idling.
20871 + */
20872 +- bfqq = bfqq_close(bfqd);
20873 ++ bfqq = bfqq_close(bfqd, sector);
20874 + if (bfqq == NULL || bfqq == cur_bfqq)
20875 + return NULL;
20876 +
20877 +@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
20878 + return bfqq;
20879 + }
20880 +
20881 ++static struct bfq_queue *
20882 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
20883 ++{
20884 ++ int process_refs, new_process_refs;
20885 ++ struct bfq_queue *__bfqq;
20886 ++
20887 ++ /*
20888 ++ * If there are no process references on the new_bfqq, then it is
20889 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
20890 ++ * may have dropped their last reference (not just their last process
20891 ++ * reference).
20892 ++ */
20893 ++ if (!bfqq_process_refs(new_bfqq))
20894 ++ return NULL;
20895 ++
20896 ++ /* Avoid a circular list and skip interim queue merges. */
20897 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
20898 ++ if (__bfqq == bfqq)
20899 ++ return NULL;
20900 ++ new_bfqq = __bfqq;
20901 ++ }
20902 ++
20903 ++ process_refs = bfqq_process_refs(bfqq);
20904 ++ new_process_refs = bfqq_process_refs(new_bfqq);
20905 ++ /*
20906 ++ * If the process for the bfqq has gone away, there is no
20907 ++ * sense in merging the queues.
20908 ++ */
20909 ++ if (process_refs == 0 || new_process_refs == 0)
20910 ++ return NULL;
20911 ++
20912 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
20913 ++ new_bfqq->pid);
20914 ++
20915 ++ /*
20916 ++ * Merging is just a redirection: the requests of the process owning
20917 ++ * one of the two queues are redirected to the other queue. The latter
20918 ++ * queue, in its turn, is set as shared if this is the first time that
20919 ++ * the requests of some process are redirected to it.
20920 ++ *
20921 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
20922 ++ * are in the context of the process owning bfqq, hence we have the
20923 ++ * io_cq of this process. So we can immediately configure this io_cq
20924 ++ * to redirect the requests of the process to new_bfqq.
20925 ++ *
20926 ++ * NOTE, even if new_bfqq coincides with the in-service queue, the
20927 ++ * io_cq of new_bfqq is not available, because, if the in-service queue
20928 ++ * is shared, bfqd->in_service_bic may not point to the io_cq of the
20929 ++ * in-service queue.
20930 ++ * Redirecting the requests of the process owning bfqq to the currently
20931 ++ * in-service queue is in any case the best option, as we feed the
20932 ++ * in-service queue with new requests close to the last request served
20933 ++ * and, by doing so, hopefully increase the throughput.
20934 ++ */
20935 ++ bfqq->new_bfqq = new_bfqq;
20936 ++ atomic_add(process_refs, &new_bfqq->ref);
20937 ++ return new_bfqq;
20938 ++}
20939 ++
20940 ++/*
20941 ++ * Attempt to schedule a merge of bfqq with the currently in-service queue or
20942 ++ * with a close queue among the scheduled queues.
20943 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
20944 ++ * structure otherwise.
20945 ++ */
20946 ++static struct bfq_queue *
20947 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
20948 ++ void *io_struct, bool request)
20949 ++{
20950 ++ struct bfq_queue *in_service_bfqq, *new_bfqq;
20951 ++
20952 ++ if (bfqq->new_bfqq)
20953 ++ return bfqq->new_bfqq;
20954 ++
20955 ++ if (!io_struct)
20956 ++ return NULL;
20957 ++
20958 ++ in_service_bfqq = bfqd->in_service_queue;
20959 ++
20960 ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
20961 ++ !bfqd->in_service_bic)
20962 ++ goto check_scheduled;
20963 ++
20964 ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
20965 ++ goto check_scheduled;
20966 ++
20967 ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
20968 ++ goto check_scheduled;
20969 ++
20970 ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
20971 ++ goto check_scheduled;
20972 ++
20973 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
20974 ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
20975 ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
20976 ++ if (new_bfqq != NULL)
20977 ++ return new_bfqq; /* Merge with the in-service queue */
20978 ++ }
20979 ++
20980 ++ /*
20981 ++ * Check whether there is a cooperator among currently scheduled
20982 ++ * queues. The only thing we need is that the bio/request is not
20983 ++ * NULL, as we need it to establish whether a cooperator exists.
20984 ++ */
20985 ++check_scheduled:
20986 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
20987 ++ bfq_io_struct_pos(io_struct, request));
20988 ++ if (new_bfqq)
20989 ++ return bfq_setup_merge(bfqq, new_bfqq);
20990 ++
20991 ++ return NULL;
20992 ++}
20993 ++
20994 ++static inline void
20995 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
20996 ++{
20997 ++ /*
20998 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
20999 ++ * have already been redirected to a shared queue; both idle window
21000 ++ * and weight raising state have already been saved. Do nothing.
21001 ++ */
21002 ++ if (bfqq->bic == NULL)
21003 ++ return;
21004 ++ if (bfqq->bic->raising_time_left)
21005 ++ /*
21006 ++ * This is the queue of a just-started process, and would
21007 ++ * deserve weight raising: we set raising_time_left to the full
21008 ++ * weight-raising duration to trigger weight-raising when and
21009 ++ * if the queue is split and the first request of the queue
21010 ++ * is enqueued.
21011 ++ */
21012 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
21013 ++ else if (bfqq->raising_coeff > 1) {
21014 ++ unsigned long wrais_duration =
21015 ++ jiffies - bfqq->last_rais_start_finish;
21016 ++ /*
21017 ++ * It may happen that a queue's weight raising period lasts
21018 ++ * longer than its raising_cur_max_time, as weight raising is
21019 ++ * handled only when a request is enqueued or dispatched (it
21020 ++ * does not use any timer). If the weight raising period is
21021 ++ * about to end, don't save it.
21022 ++ */
21023 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
21024 ++ bfqq->bic->raising_time_left = 0;
21025 ++ else
21026 ++ bfqq->bic->raising_time_left =
21027 ++ bfqq->raising_cur_max_time - wrais_duration;
21028 ++ /*
21029 ++ * The bfq_queue is becoming shared or the requests of the
21030 ++ * process owning the queue are being redirected to a shared
21031 ++ * queue. Stop the weight raising period of the queue, as in
21032 ++ * both cases it should not be owned by an interactive or soft
21033 ++ * real-time application.
21034 ++ */
21035 ++ bfq_bfqq_end_raising(bfqq);
21036 ++ } else
21037 ++ bfqq->bic->raising_time_left = 0;
21038 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
21039 ++}
21040 ++
21041 ++static inline void
21042 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
21043 ++{
21044 ++ /*
21045 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
21046 ++ * is about to begin using a shared bfq_queue.
21047 ++ */
21048 ++ if (bfqq->bic)
21049 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
21050 ++}
21051 ++
21052 ++static void
21053 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
21054 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
21055 ++{
21056 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
21057 ++ (long unsigned)new_bfqq->pid);
21058 ++ /* Save weight raising and idle window of the merged queues */
21059 ++ bfq_bfqq_save_state(bfqq);
21060 ++ bfq_bfqq_save_state(new_bfqq);
21061 ++ /*
21062 ++ * Grab a reference to the bic, to prevent it from being destroyed
21063 ++ * before being possibly touched by a bfq_split_bfqq().
21064 ++ */
21065 ++ bfq_get_bic_reference(bfqq);
21066 ++ bfq_get_bic_reference(new_bfqq);
21067 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
21068 ++ bic_set_bfqq(bic, new_bfqq, 1);
21069 ++ bfq_mark_bfqq_coop(new_bfqq);
21070 ++ /*
21071 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
21072 ++ * new_bfqq->bic to NULL. bfqq either:
21073 ++ * - does not belong to any bic any more, and hence bfqq->bic must
21074 ++ * be set to NULL, or
21075 ++ * - is a queue whose owning bics have already been redirected to a
21076 ++ * different queue, hence the queue is destined to not belong to any
21077 ++ * bic soon and bfqq->bic is already NULL (therefore the next
21078 ++ * assignment causes no harm).
21079 ++ */
21080 ++ new_bfqq->bic = NULL;
21081 ++ bfqq->bic = NULL;
21082 ++ bfq_put_queue(bfqq);
21083 ++}
21084 ++
21085 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
21086 ++ struct bio *bio)
21087 ++{
21088 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
21089 ++ struct bfq_io_cq *bic;
21090 ++ struct bfq_queue *bfqq, *new_bfqq;
21091 ++
21092 ++ /*
21093 ++ * Disallow merge of a sync bio into an async request.
21094 ++ */
21095 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
21096 ++ return 0;
21097 ++
21098 ++ /*
21099 ++ * Lookup the bfqq that this bio will be queued with. Allow
21100 ++ * merge only if rq is queued there.
21101 ++ * Queue lock is held here.
21102 ++ */
21103 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
21104 ++ if (bic == NULL)
21105 ++ return 0;
21106 ++
21107 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
21108 ++ /*
21109 ++ * We take advantage of this function to perform an early merge
21110 ++ * of the queues of possible cooperating processes.
21111 ++ */
21112 ++ if (bfqq != NULL) {
21113 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
21114 ++ if (new_bfqq != NULL) {
21115 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
21116 ++ /*
21117 ++ * If we get here, the bio will be queued in the shared queue,
21118 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
21119 ++ * rq can be merged.
21120 ++ */
21121 ++ bfqq = new_bfqq;
21122 ++ }
21123 ++ }
21124 ++
21125 ++ return bfqq == RQ_BFQQ(rq);
21126 ++}
21127 ++
21128 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
21129 ++ struct bfq_queue *bfqq)
21130 ++{
21131 ++ if (bfqq != NULL) {
21132 ++ bfq_mark_bfqq_must_alloc(bfqq);
21133 ++ bfq_mark_bfqq_budget_new(bfqq);
21134 ++ bfq_clear_bfqq_fifo_expire(bfqq);
21135 ++
21136 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
21137 ++
21138 ++ bfq_log_bfqq(bfqd, bfqq,
21139 ++ "set_in_service_queue, cur-budget = %lu",
21140 ++ bfqq->entity.budget);
21141 ++ }
21142 ++
21143 ++ bfqd->in_service_queue = bfqq;
21144 ++}
21145 ++
21146 ++/*
21147 ++ * Get and set a new queue for service.
21148 ++ */
21149 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
21150 ++{
21151 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
21152 ++
21153 ++ __bfq_set_in_service_queue(bfqd, bfqq);
21154 ++ return bfqq;
21155 ++}
21156 ++
21157 + /*
21158 + * If enough samples have been computed, return the current max budget
21159 + * stored in bfqd, which is dynamically updated according to the
21160 +@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
21161 + return rq;
21162 + }
21163 +
21164 +-/*
21165 +- * Must be called with the queue_lock held.
21166 +- */
21167 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
21168 +-{
21169 +- int process_refs, io_refs;
21170 +-
21171 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
21172 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
21173 +- BUG_ON(process_refs < 0);
21174 +- return process_refs;
21175 +-}
21176 +-
21177 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
21178 +-{
21179 +- int process_refs, new_process_refs;
21180 +- struct bfq_queue *__bfqq;
21181 +-
21182 +- /*
21183 +- * If there are no process references on the new_bfqq, then it is
21184 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
21185 +- * may have dropped their last reference (not just their last process
21186 +- * reference).
21187 +- */
21188 +- if (!bfqq_process_refs(new_bfqq))
21189 +- return;
21190 +-
21191 +- /* Avoid a circular list and skip interim queue merges. */
21192 +- while ((__bfqq = new_bfqq->new_bfqq)) {
21193 +- if (__bfqq == bfqq)
21194 +- return;
21195 +- new_bfqq = __bfqq;
21196 +- }
21197 +-
21198 +- process_refs = bfqq_process_refs(bfqq);
21199 +- new_process_refs = bfqq_process_refs(new_bfqq);
21200 +- /*
21201 +- * If the process for the bfqq has gone away, there is no
21202 +- * sense in merging the queues.
21203 +- */
21204 +- if (process_refs == 0 || new_process_refs == 0)
21205 +- return;
21206 +-
21207 +- /*
21208 +- * Merge in the direction of the lesser amount of work.
21209 +- */
21210 +- if (new_process_refs >= process_refs) {
21211 +- bfqq->new_bfqq = new_bfqq;
21212 +- atomic_add(process_refs, &new_bfqq->ref);
21213 +- } else {
21214 +- new_bfqq->new_bfqq = bfqq;
21215 +- atomic_add(new_process_refs, &bfqq->ref);
21216 +- }
21217 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
21218 +- new_bfqq->pid);
21219 +-}
21220 +-
21221 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
21222 + {
21223 + struct bfq_entity *entity = &bfqq->entity;
21224 +@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
21225 + */
21226 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
21227 + {
21228 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
21229 ++ struct bfq_queue *bfqq;
21230 + struct request *next_rq;
21231 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
21232 +
21233 +@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
21234 +
21235 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
21236 +
21237 +- /*
21238 +- * If another queue has a request waiting within our mean seek
21239 +- * distance, let it run. The expire code will check for close
21240 +- * cooperators and put the close queue at the front of the
21241 +- * service tree. If possible, merge the expiring queue with the
21242 +- * new bfqq.
21243 +- */
21244 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
21245 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
21246 +- bfq_setup_merge(bfqq, new_bfqq);
21247 +-
21248 + if (bfq_may_expire_for_budg_timeout(bfqq) &&
21249 + !timer_pending(&bfqd->idle_slice_timer) &&
21250 + !bfq_bfqq_must_idle(bfqq))
21251 +@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
21252 + bfq_clear_bfqq_wait_request(bfqq);
21253 + del_timer(&bfqd->idle_slice_timer);
21254 + }
21255 +- if (new_bfqq == NULL)
21256 +- goto keep_queue;
21257 +- else
21258 +- goto expire;
21259 ++ goto keep_queue;
21260 + }
21261 + }
21262 +
21263 + /*
21264 +- * No requests pending. If the in-service queue has no cooperator and
21265 +- * still has requests in flight (possibly waiting for a completion)
21266 +- * or is idling for a new request, then keep it.
21267 ++ * No requests pending. If the in-service queue still has requests in
21268 ++ * flight (possibly waiting for a completion) or is idling for a new
21269 ++ * request, then keep it.
21270 + */
21271 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
21272 +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
21273 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
21274 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
21275 + bfqq = NULL;
21276 + goto keep_queue;
21277 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
21278 +- /*
21279 +- * Expiring the queue because there is a close cooperator,
21280 +- * cancel timer.
21281 +- */
21282 +- bfq_clear_bfqq_wait_request(bfqq);
21283 +- del_timer(&bfqd->idle_slice_timer);
21284 + }
21285 +
21286 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
21287 + expire:
21288 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
21289 + new_queue:
21290 +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
21291 ++ bfqq = bfq_set_in_service_queue(bfqd);
21292 + bfq_log(bfqd, "select_queue: new queue %d returned",
21293 + bfqq != NULL ? bfqq->pid : 0);
21294 + keep_queue:
21295 +@@ -1825,9 +2015,8 @@ keep_queue:
21296 + static void bfq_update_raising_data(struct bfq_data *bfqd,
21297 + struct bfq_queue *bfqq)
21298 + {
21299 ++ struct bfq_entity *entity = &bfqq->entity;
21300 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
21301 +- struct bfq_entity *entity = &bfqq->entity;
21302 +-
21303 + bfq_log_bfqq(bfqd, bfqq,
21304 + "raising period dur %u/%u msec, "
21305 + "old raising coeff %u, w %d(%d)",
21306 +@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
21307 + "WARN: pending prio change");
21308 + /*
21309 + * If too much time has elapsed from the beginning
21310 +- * of this weight-raising, stop it.
21311 ++ * of this weight-raising period, stop it.
21312 + */
21313 + if (time_is_before_jiffies(bfqq->last_rais_start_finish +
21314 + bfqq->raising_cur_max_time)) {
21315 +@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
21316 + jiffies_to_msecs(bfqq->
21317 + raising_cur_max_time));
21318 + bfq_bfqq_end_raising(bfqq);
21319 +- __bfq_entity_update_weight_prio(
21320 +- bfq_entity_service_tree(entity),
21321 +- entity);
21322 + }
21323 + }
21324 ++ /* Update weight both if it must be raised and if it must be lowered */
21325 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
21326 ++ __bfq_entity_update_weight_prio(
21327 ++ bfq_entity_service_tree(entity),
21328 ++ entity);
21329 + }
21330 +
21331 + /*
21332 +@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq)
21333 + struct bfq_io_cq *bic = icq_to_bic(icq);
21334 +
21335 + bic->ttime.last_end_request = jiffies;
21336 ++ /*
21337 ++ * A newly created bic indicates that the process has just
21338 ++ * started doing I/O, and is probably mapping into memory its
21339 ++ * executable and libraries: it definitely needs weight raising.
21340 ++ * There is however the possibility that the process performs,
21341 ++ * for a while, I/O close to some other process. EQM intercepts
21342 ++ * this behavior and may merge the queue corresponding to the
21343 ++ * process with some other queue, BEFORE the weight of the queue
21344 ++ * is raised. Merged queues are not weight-raised (they are assumed
21345 ++ * to belong to processes that benefit only from high throughput).
21346 ++ * If the merge is basically the consequence of an accident, then
21347 ++ * the queue will be split soon and will get back its old weight.
21348 ++ * It is then important to write down somewhere that this queue
21349 ++ * does need weight raising, even if it did not make it to get its
21350 ++ * weight raised before being merged. To this purpose, we overload
21351 ++ * the field raising_time_left and assign 1 to it, to mark the queue
21352 ++ * as needing weight raising.
21353 ++ */
21354 ++ bic->raising_time_left = 1;
21355 + }
21356 +
21357 + static void bfq_exit_icq(struct io_cq *icq)
21358 +@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq)
21359 + }
21360 +
21361 + if (bic->bfqq[BLK_RW_SYNC]) {
21362 ++ /*
21363 ++ * If the bic is using a shared queue, put the reference
21364 ++ * taken on the io_context when the bic started using a
21365 ++ * shared bfq_queue.
21366 ++ */
21367 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
21368 ++ put_io_context(icq->ioc);
21369 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
21370 + bic->bfqq[BLK_RW_SYNC] = NULL;
21371 + }
21372 +@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
21373 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
21374 + return;
21375 +
21376 ++ /* Idle window just restored, statistics are meaningless. */
21377 ++ if (bfq_bfqq_just_split(bfqq))
21378 ++ return;
21379 ++
21380 + enable_idle = bfq_bfqq_idle_window(bfqq);
21381 +
21382 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
21383 +@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
21384 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
21385 + !BFQQ_SEEKY(bfqq))
21386 + bfq_update_idle_window(bfqd, bfqq, bic);
21387 ++ bfq_clear_bfqq_just_split(bfqq);
21388 +
21389 + bfq_log_bfqq(bfqd, bfqq,
21390 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
21391 +@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
21392 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
21393 + {
21394 + struct bfq_data *bfqd = q->elevator->elevator_data;
21395 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
21396 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
21397 +
21398 + assert_spin_locked(bfqd->queue->queue_lock);
21399 ++
21400 ++ /*
21401 ++ * An unplug may trigger a requeue of a request from the device
21402 ++ * driver: make sure we are in process context while trying to
21403 ++ * merge two bfq_queues.
21404 ++ */
21405 ++ if (!in_interrupt()) {
21406 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
21407 ++ if (new_bfqq != NULL) {
21408 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
21409 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
21410 ++ /*
21411 ++ * Release the request's reference to the old bfqq
21412 ++ * and make sure one is taken to the shared queue.
21413 ++ */
21414 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
21415 ++ bfqq->allocated[rq_data_dir(rq)]--;
21416 ++ atomic_inc(&new_bfqq->ref);
21417 ++ bfq_put_queue(bfqq);
21418 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
21419 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
21420 ++ bfqq, new_bfqq);
21421 ++ rq->elv.priv[1] = new_bfqq;
21422 ++ bfqq = new_bfqq;
21423 ++ }
21424 ++ }
21425 ++
21426 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
21427 +
21428 + bfq_add_rq_rb(rq);
21429 +
21430 ++ /*
21431 ++ * Here a newly-created bfq_queue has already started a weight-raising
21432 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
21433 ++ * from assigning it a full weight-raising period. See the detailed
21434 ++ * comments about this field in bfq_init_icq().
21435 ++ */
21436 ++ if (bfqq->bic != NULL)
21437 ++ bfqq->bic->raising_time_left = 0;
21438 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
21439 + list_add_tail(&rq->queuelist, &bfqq->fifo);
21440 +
21441 +@@ -2659,18 +2916,6 @@ static void bfq_put_request(struct request *rq)
21442 + }
21443 + }
21444 +
21445 +-static struct bfq_queue *
21446 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
21447 +- struct bfq_queue *bfqq)
21448 +-{
21449 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
21450 +- (long unsigned)bfqq->new_bfqq->pid);
21451 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
21452 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
21453 +- bfq_put_queue(bfqq);
21454 +- return bic_to_bfqq(bic, 1);
21455 +-}
21456 +-
21457 + /*
21458 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
21459 + * was the last process referring to said bfqq.
21460 +@@ -2679,6 +2924,9 @@ static struct bfq_queue *
21461 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
21462 + {
21463 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
21464 ++
21465 ++ put_io_context(bic->icq.ioc);
21466 ++
21467 + if (bfqq_process_refs(bfqq) == 1) {
21468 + bfqq->pid = current->pid;
21469 + bfq_clear_bfqq_coop(bfqq);
21470 +@@ -2707,6 +2955,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
21471 + struct bfq_queue *bfqq;
21472 + struct bfq_group *bfqg;
21473 + unsigned long flags;
21474 ++ bool split = false;
21475 +
21476 + might_sleep_if(gfp_mask & __GFP_WAIT);
21477 +
21478 +@@ -2725,24 +2974,14 @@ new_queue:
21479 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
21480 + bic_set_bfqq(bic, bfqq, is_sync);
21481 + } else {
21482 +- /*
21483 +- * If the queue was seeky for too long, break it apart.
21484 +- */
21485 ++ /* If the queue was seeky for too long, break it apart. */
21486 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
21487 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
21488 + bfqq = bfq_split_bfqq(bic, bfqq);
21489 ++ split = true;
21490 + if (!bfqq)
21491 + goto new_queue;
21492 + }
21493 +-
21494 +- /*
21495 +- * Check to see if this queue is scheduled to merge with
21496 +- * another closely cooperating queue. The merging of queues
21497 +- * happens here as it must be done in process context.
21498 +- * The reference on new_bfqq was taken in merge_bfqqs.
21499 +- */
21500 +- if (bfqq->new_bfqq != NULL)
21501 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
21502 + }
21503 +
21504 + bfqq->allocated[rw]++;
21505 +@@ -2753,6 +2992,26 @@ new_queue:
21506 + rq->elv.priv[0] = bic;
21507 + rq->elv.priv[1] = bfqq;
21508 +
21509 ++ /*
21510 ++ * If a bfq_queue has only one process reference, it is owned
21511 ++ * by only one bfq_io_cq: we can set the bic field of the
21512 ++ * bfq_queue to the address of that structure. Also, if the
21513 ++ * queue has just been split, mark a flag so that the
21514 ++ * information is available to the other scheduler hooks.
21515 ++ */
21516 ++ if (bfqq_process_refs(bfqq) == 1) {
21517 ++ bfqq->bic = bic;
21518 ++ if (split) {
21519 ++ bfq_mark_bfqq_just_split(bfqq);
21520 ++ /*
21521 ++ * If the queue has just been split from a shared queue,
21522 ++ * restore the idle window and the possible weight
21523 ++ * raising period.
21524 ++ */
21525 ++ bfq_bfqq_resume_state(bfqq, bic);
21526 ++ }
21527 ++ }
21528 ++
21529 + spin_unlock_irqrestore(q->queue_lock, flags);
21530 +
21531 + return 0;
21532 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
21533 +index 999b475..e54ea33 100644
21534 +--- a/block/bfq-sched.c
21535 ++++ b/block/bfq-sched.c
21536 +@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
21537 + return bfqq;
21538 + }
21539 +
21540 +-/*
21541 +- * Forced extraction of the given queue.
21542 +- */
21543 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
21544 +- struct bfq_queue *bfqq)
21545 +-{
21546 +- struct bfq_entity *entity;
21547 +- struct bfq_sched_data *sd;
21548 +-
21549 +- BUG_ON(bfqd->in_service_queue != NULL);
21550 +-
21551 +- entity = &bfqq->entity;
21552 +- /*
21553 +- * Bubble up extraction/update from the leaf to the root.
21554 +- */
21555 +- for_each_entity(entity) {
21556 +- sd = entity->sched_data;
21557 +- bfq_update_budget(entity);
21558 +- bfq_update_vtime(bfq_entity_service_tree(entity));
21559 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
21560 +- sd->active_entity = entity;
21561 +- sd->next_active = NULL;
21562 +- entity->service = 0;
21563 +- }
21564 +-
21565 +- return;
21566 +-}
21567 +-
21568 + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
21569 + {
21570 + if (bfqd->in_service_bic != NULL) {
21571 +diff --git a/block/bfq.h b/block/bfq.h
21572 +index f9b5881..0bfad40 100644
21573 +--- a/block/bfq.h
21574 ++++ b/block/bfq.h
21575 +@@ -192,6 +192,8 @@ struct bfq_group;
21576 + * idle to backlogged
21577 + * @service_from_backlogged: cumulative service received from the @bfq_queue
21578 + * since the last transition from idle to backlogged
21579 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
21580 ++ * queue is shared
21581 + *
21582 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
21583 + * or more (if it is an async one). @cgroup holds a reference to the
21584 +@@ -235,6 +237,7 @@ struct bfq_queue {
21585 + sector_t last_request_pos;
21586 +
21587 + pid_t pid;
21588 ++ struct bfq_io_cq *bic;
21589 +
21590 + /* weight-raising fields */
21591 + unsigned long raising_cur_max_time;
21592 +@@ -264,12 +267,23 @@ struct bfq_ttime {
21593 + * @icq: associated io_cq structure
21594 + * @bfqq: array of two process queues, the sync and the async
21595 + * @ttime: associated @bfq_ttime struct
21596 ++ * @raising_time_left: snapshot of the time left before weight raising ends
21597 ++ * for the sync queue associated to this process; this
21598 ++ * snapshot is taken to remember this value while the weight
21599 ++ * raising is suspended because the queue is merged with a
21600 ++ * shared queue, and is used to set @raising_cur_max_time
21601 ++ * when the queue is split from the shared queue and its
21602 ++ * weight is raised again
21603 ++ * @saved_idle_window: same purpose as the previous field for the idle window
21604 + */
21605 + struct bfq_io_cq {
21606 + struct io_cq icq; /* must be the first member */
21607 + struct bfq_queue *bfqq[2];
21608 + struct bfq_ttime ttime;
21609 + int ioprio;
21610 ++
21611 ++ unsigned int raising_time_left;
21612 ++ unsigned int saved_idle_window;
21613 + };
21614 +
21615 + /**
21616 +@@ -411,6 +425,7 @@ enum bfqq_state_flags {
21617 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
21618 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
21619 + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
21620 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
21621 + BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
21622 + };
21623 +
21624 +@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
21625 + BFQ_BFQQ_FNS(budget_new);
21626 + BFQ_BFQQ_FNS(coop);
21627 + BFQ_BFQQ_FNS(split_coop);
21628 ++BFQ_BFQQ_FNS(just_split);
21629 + BFQ_BFQQ_FNS(softrt_update);
21630 + #undef BFQ_BFQQ_FNS
21631 +
21632 +--
21633 +1.8.5.2
21634 +