Gentoo Archives: gentoo-commits

From: "Tom Wijsman (tomwij)" <tomwij@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2661 - genpatches-2.6/trunk/3.14
Date: Thu, 30 Jan 2014 16:49:54
Message-Id: 20140130164948.29F352004C@flycatcher.gentoo.org
1 Author: tomwij
2 Date: 2014-01-30 16:49:47 +0000 (Thu, 30 Jan 2014)
3 New Revision: 2661
4
5 Added:
6 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
7 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
8 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
9 Removed:
10 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
11 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
12 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
13 genpatches-2.6/trunk/3.14/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
14 Modified:
15 genpatches-2.6/trunk/3.14/0000_README
16 Log:
17 BFQ v7 3.13.
18
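Once a kernel carrying these three patches is built with CONFIG_IOSCHED_BFQ, the new "bfq" elevator is listed next to noop, deadline and cfq in the standard sysfs scheduler attribute. A minimal user-space sketch that checks for it; the device name sda and the presence of a patched kernel are assumptions for illustration, and the sysfs path is the stock block-layer interface rather than something these patches add:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

        if (f == NULL) {
                perror("open scheduler attribute");
                return 1;
        }
        if (fgets(line, sizeof(line), f) == NULL) {
                fclose(f);
                return 1;
        }
        fclose(f);

        /* The active elevator is shown in brackets, e.g. "noop deadline cfq [bfq]". */
        printf("available schedulers: %s", line);
        printf("bfq is %slisted\n", strstr(line, "bfq") ? "" : "not ");
        return 0;
}
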
19 Modified: genpatches-2.6/trunk/3.14/0000_README
20 ===================================================================
21 --- genpatches-2.6/trunk/3.14/0000_README 2014-01-29 14:41:45 UTC (rev 2660)
22 +++ genpatches-2.6/trunk/3.14/0000_README 2014-01-30 16:49:47 UTC (rev 2661)
23 @@ -83,18 +83,14 @@
24 From: Tom Wijsman <TomWij@g.o>
25 Desc: Add Gentoo Linux support config settings and defaults.
26
27 -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
28 +Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
29 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
30 -Desc: BFQ v6r2 patch 1 for 3.11: Build, cgroups and kconfig bits
31 +Desc: BFQ v7 patch 1 for 3.13: Build, cgroups and kconfig bits
32
33 -Patch: 5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
34 +Patch: 5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
35 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
36 -Desc: BFQ v6r2 patch 2 for 3.10: BFQ Scheduler
37 +Desc: BFQ v7 patch 2 for 3.13: BFQ Scheduler
38
39 -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
40 +Patch: 5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
41 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
42 -Desc: BFQ v6r2 patch 3 for 3.10: Early Queue Merge (EQM)
43 -
44 -Patch: 5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
45 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
46 -Desc: BFQ v6r2 for 3.11.0 to BFQ v6r2 for 3.12.0.
47 +Desc: BFQ v7 patch 3 for 3.13: Early Queue Merge (EQM)
48 \ No newline at end of file
49
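The first BFQ patch below registers a new "bfqio" controller in the cgroup subsystem list, and the scheduler patch that follows exposes per-group weight, ioprio and ioprio_class attributes through it. A rough user-space sketch of tuning a group's weight; the mount point /sys/fs/cgroup/bfqio and the group name "background" are made-up examples, and the bfqio hierarchy must already be mounted for the path to exist:

#include <stdio.h>
#include <stdlib.h>

/* Write a new value into a group's bfqio.weight file.  Valid weights are
 * bounded by BFQ_MIN_WEIGHT/BFQ_MAX_WEIGHT; out-of-range values are
 * rejected with EINVAL by the store functions in bfq-cgroup.c. */
static int set_group_weight(const char *group, unsigned int weight)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/fs/cgroup/bfqio/%s/bfqio.weight", group);
        f = fopen(path, "w");
        if (f == NULL)
                return -1;
        fprintf(f, "%u\n", weight);
        return fclose(f);
}

int main(void)
{
        if (set_group_weight("background", 100) != 0) {
                perror("bfqio.weight");
                return EXIT_FAILURE;
        }
        return EXIT_SUCCESS;
}

The bfqio.ioprio and bfqio.ioprio_class files declared in the same bfqio_files[] table are written the same way.
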
50 Added: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
51 ===================================================================
52 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch (rev 0)
53 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-01-30 16:49:47 UTC (rev 2661)
54 @@ -0,0 +1,104 @@
55 +From 7f029ed2a02bea57b791c032d6242129c3372a84 Mon Sep 17 00:00:00 2001
56 +From: Paolo Valente <paolo.valente@×××××××.it>
57 +Date: Tue, 3 Sep 2013 16:50:42 +0200
58 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7-3.13
59 +
60 +Update Kconfig.iosched and do the related Makefile changes to include
61 +kernel configuration options for BFQ. Also add the bfqio controller
62 +to the cgroups subsystem.
63 +
64 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
65 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
66 +---
67 + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
68 + block/Makefile | 1 +
69 + include/linux/cgroup_subsys.h | 4 ++++
70 + 3 files changed, 37 insertions(+)
71 +
72 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
73 +index 421bef9..8f552ba 100644
74 +--- a/block/Kconfig.iosched
75 ++++ b/block/Kconfig.iosched
76 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
77 + ---help---
78 + Enable group IO scheduling in CFQ.
79 +
80 ++config IOSCHED_BFQ
81 ++ tristate "BFQ I/O scheduler"
82 ++ default n
83 ++ ---help---
84 ++ The BFQ I/O scheduler tries to distribute bandwidth among
85 ++ all processes according to their weights.
86 ++ It aims at distributing the bandwidth as desired, independently of
87 ++ the disk parameters and with any workload. It also tries to
88 ++ guarantee low latency to interactive and soft real-time
89 ++ applications. If compiled built-in (saying Y here), BFQ can
90 ++ be configured to support hierarchical scheduling.
91 ++
92 ++config CGROUP_BFQIO
93 ++ bool "BFQ hierarchical scheduling support"
94 ++ depends on CGROUPS && IOSCHED_BFQ=y
95 ++ default n
96 ++ ---help---
97 ++ Enable hierarchical scheduling in BFQ, using the cgroups
98 ++ filesystem interface. The name of the subsystem will be
99 ++ bfqio.
100 ++
101 + choice
102 + prompt "Default I/O scheduler"
103 + default DEFAULT_CFQ
104 +@@ -52,6 +73,16 @@ choice
105 + config DEFAULT_CFQ
106 + bool "CFQ" if IOSCHED_CFQ=y
107 +
108 ++ config DEFAULT_BFQ
109 ++ bool "BFQ" if IOSCHED_BFQ=y
110 ++ help
111 ++ Selects BFQ as the default I/O scheduler which will be
112 ++ used by default for all block devices.
113 ++ The BFQ I/O scheduler aims at distributing the bandwidth
114 ++ as desired, independently of the disk parameters and with
115 ++ any workload. It also tries to guarantee low latency to
116 ++ interactive and soft real-time applications.
117 ++
118 + config DEFAULT_NOOP
119 + bool "No-op"
120 +
121 +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
122 + string
123 + default "deadline" if DEFAULT_DEADLINE
124 + default "cfq" if DEFAULT_CFQ
125 ++ default "bfq" if DEFAULT_BFQ
126 + default "noop" if DEFAULT_NOOP
127 +
128 + endmenu
129 +diff --git a/block/Makefile b/block/Makefile
130 +index 20645e8..cbd83fb 100644
131 +--- a/block/Makefile
132 ++++ b/block/Makefile
133 +@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
134 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
135 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
136 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
137 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
138 +
139 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
140 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
141 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
142 +index b613ffd..43c5dc9 100644
143 +--- a/include/linux/cgroup_subsys.h
144 ++++ b/include/linux/cgroup_subsys.h
145 +@@ -39,6 +39,10 @@ SUBSYS(net_cls)
146 + SUBSYS(blkio)
147 + #endif
148 +
149 ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
150 ++SUBSYS(bfqio)
151 ++#endif
152 ++
153 + #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
154 + SUBSYS(perf)
155 + #endif
156 +--
157 +1.8.5.2
158 +
159
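The DEFAULT_BFQ choice added to Kconfig.iosched above only selects the boot-time default (the string "bfq" in CONFIG_DEFAULT_IOSCHED); per device, the elevator can still be switched at runtime through the usual sysfs attribute. A small sketch, again assuming a kernel that carries these patches and a hypothetical device sdb:

#include <stdio.h>

/* Ask the block layer to use the "bfq" elevator for one device.  The kernel
 * looks the name up among the registered schedulers and returns an error if
 * it is unknown, so the write only succeeds on a patched kernel. */
int main(void)
{
        FILE *f = fopen("/sys/block/sdb/queue/scheduler", "w");

        if (f == NULL) {
                perror("open scheduler attribute");
                return 1;
        }
        if (fputs("bfq\n", f) == EOF || fclose(f) != 0) {
                perror("switch to bfq");
                return 1;
        }
        return 0;
}
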
160 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
161 ===================================================================
162 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2014-01-29 14:41:45 UTC (rev 2660)
163 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2014-01-30 16:49:47 UTC (rev 2661)
164 @@ -1,97 +0,0 @@
165 -From 3728677b4d3cd39d83be87f9939328201b871c48 Mon Sep 17 00:00:00 2001
166 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
167 -Date: Tue, 3 Sep 2013 16:50:42 +0200
168 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.11
169 -
170 -Update Kconfig.iosched and do the related Makefile changes to include
171 -kernel configuration options for BFQ. Also add the bfqio controller
172 -to the cgroups subsystem.
173 -
174 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
175 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
176 ----
177 - block/Kconfig.iosched | 25 +++++++++++++++++++++++++
178 - block/Makefile | 1 +
179 - include/linux/cgroup_subsys.h | 4 ++++
180 - 3 files changed, 30 insertions(+)
181 -
182 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
183 -index 421bef9..695e064 100644
184 ---- a/block/Kconfig.iosched
185 -+++ b/block/Kconfig.iosched
186 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
187 - ---help---
188 - Enable group IO scheduling in CFQ.
189 -
190 -+config IOSCHED_BFQ
191 -+ tristate "BFQ I/O scheduler"
192 -+ default n
193 -+ ---help---
194 -+ The BFQ I/O scheduler tries to distribute bandwidth among
195 -+ all processes according to their weights.
196 -+ It aims at distributing the bandwidth as desired, independently of
197 -+ the disk parameters and with any workload. It also tries to
198 -+ guarantee low latency to interactive and soft real-time
199 -+ applications. If compiled built-in (saying Y here), BFQ can
200 -+ be configured to support hierarchical scheduling.
201 -+
202 -+config CGROUP_BFQIO
203 -+ bool "BFQ hierarchical scheduling support"
204 -+ depends on CGROUPS && IOSCHED_BFQ=y
205 -+ default n
206 -+ ---help---
207 -+ Enable hierarchical scheduling in BFQ, using the cgroups
208 -+ filesystem interface. The name of the subsystem will be
209 -+ bfqio.
210 -+
211 - choice
212 - prompt "Default I/O scheduler"
213 - default DEFAULT_CFQ
214 -@@ -52,6 +73,9 @@ choice
215 - config DEFAULT_CFQ
216 - bool "CFQ" if IOSCHED_CFQ=y
217 -
218 -+ config DEFAULT_BFQ
219 -+ bool "BFQ" if IOSCHED_BFQ=y
220 -+
221 - config DEFAULT_NOOP
222 - bool "No-op"
223 -
224 -@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED
225 - string
226 - default "deadline" if DEFAULT_DEADLINE
227 - default "cfq" if DEFAULT_CFQ
228 -+ default "bfq" if DEFAULT_BFQ
229 - default "noop" if DEFAULT_NOOP
230 -
231 - endmenu
232 -diff --git a/block/Makefile b/block/Makefile
233 -index 39b76ba..c0d20fa 100644
234 ---- a/block/Makefile
235 -+++ b/block/Makefile
236 -@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
237 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
238 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
239 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
240 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
241 -
242 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
243 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
244 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
245 -index b613ffd..43c5dc9 100644
246 ---- a/include/linux/cgroup_subsys.h
247 -+++ b/include/linux/cgroup_subsys.h
248 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
249 - SUBSYS(blkio)
250 - #endif
251 -
252 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
253 -+SUBSYS(bfqio)
254 -+#endif
255 -+
256 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
257 - SUBSYS(perf)
258 - #endif
259 ---
260 -1.8.1.4
261 -
262
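The commit message of the scheduler patch below notes that, when the cgroups interface is not used, per-process ioprio values are mapped to weights with the relation weight = IOPRIO_BE_NR - ioprio. A worked example of that mapping; IOPRIO_BE_NR is 8 in the mainline headers, so the eight best-effort levels translate as follows:

#include <stdio.h>

#define IOPRIO_BE_NR 8  /* number of best-effort ioprio levels in mainline */

int main(void)
{
        int ioprio;

        /* ioprio 0 (highest priority) -> weight 8, ..., ioprio 7 -> weight 1,
         * i.e. bandwidth shares in the ratio 8:7:...:1. */
        for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                printf("ioprio %d -> weight %d\n",
                       ioprio, IOPRIO_BE_NR - ioprio);
        return 0;
}

When a group weight has been set through the bfqio controller, that weight takes priority over this ioprio-derived value, as the comment in bfq_group_init_entity() in bfq-cgroup.c explains.
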
263 Added: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
264 ===================================================================
265 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 (rev 0)
266 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-01-30 16:49:47 UTC (rev 2661)
267 @@ -0,0 +1,6008 @@
268 +From 3747f129106ce58fbad1b8f05cc836a6addd8588 Mon Sep 17 00:00:00 2001
269 +From: Paolo Valente <paolo.valente@×××××××.it>
270 +Date: Thu, 9 May 2013 19:10:02 +0200
271 +Subject: [PATCH 2/3] block: introduce the BFQ-v7 I/O sched for 3.13
272 +
273 +Add the BFQ-v7 I/O scheduler to 3.13.
274 +The general structure is borrowed from CFQ, as much of the code for
275 +handling I/O contexts. Over time, several useful features have been
276 +ported from CFQ as well (details in the changelog in README.BFQ). A
277 +(bfq_)queue is associated to each task doing I/O on a device, and each
278 +time a scheduling decision has to be made a queue is selected and served
279 +until it expires.
280 +
281 + - Slices are given in the service domain: tasks are assigned
282 + budgets, measured in number of sectors. Once granted the disk, a task
283 + must however consume its assigned budget within a configurable
284 + maximum time (by default, the maximum possible value of the
285 + budgets is automatically computed to comply with this timeout).
286 + This allows the desired latency vs "throughput boosting" tradeoff
287 + to be set.
288 +
289 + - Budgets are scheduled according to a variant of WF2Q+, implemented
290 + using an augmented rb-tree to take eligibility into account while
291 + preserving an O(log N) overall complexity.
292 +
293 + - A low-latency tunable is provided; if enabled, both interactive
294 + and soft real-time applications are guaranteed a very low latency.
295 +
296 + - Latency guarantees are preserved also in the presence of NCQ.
297 +
298 + - Also with flash-based devices, a high throughput is achieved
299 + while still preserving latency guarantees.
300 +
301 + - BFQ features Early Queue Merge (EQM), a sort of fusion of the
302 + cooperating-queue-merging and the preemption mechanisms present
303 + in CFQ. EQM is in fact a unified mechanism that tries to get a
304 + sequential read pattern, and hence a high throughput, with any
305 + set of processes performing interleaved I/O over a contiguous
306 + sequence of sectors.
307 +
308 + - BFQ supports full hierarchical scheduling, exporting a cgroups
309 + interface. Since each node has a full scheduler, each group can
310 + be assigned its own weight.
311 +
312 + - If the cgroups interface is not used, only I/O priorities can be
313 + assigned to processes, with ioprio values mapped to weights
314 + with the relation weight = IOPRIO_BE_NR - ioprio.
315 +
316 + - ioprio classes are served in strict priority order, i.e., lower
317 + priority queues are not served as long as there are higher
318 + priority queues. Among queues in the same class the bandwidth is
319 + distributed in proportion to the weight of each queue. A very
320 + thin extra bandwidth is however guaranteed to the Idle class, to
321 + prevent it from starving.
322 +
323 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
324 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
325 +---
326 + block/bfq-cgroup.c | 910 ++++++++++++++
327 + block/bfq-ioc.c | 36 +
328 + block/bfq-iosched.c | 3268 +++++++++++++++++++++++++++++++++++++++++++++++++++
329 + block/bfq-sched.c | 1077 +++++++++++++++++
330 + block/bfq.h | 614 ++++++++++
331 + 5 files changed, 5905 insertions(+)
332 + create mode 100644 block/bfq-cgroup.c
333 + create mode 100644 block/bfq-ioc.c
334 + create mode 100644 block/bfq-iosched.c
335 + create mode 100644 block/bfq-sched.c
336 + create mode 100644 block/bfq.h
337 +
338 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
339 +new file mode 100644
340 +index 0000000..b889acf
341 +--- /dev/null
342 ++++ b/block/bfq-cgroup.c
343 +@@ -0,0 +1,910 @@
344 ++/*
345 ++ * BFQ: CGROUPS support.
346 ++ *
347 ++ * Based on ideas and code from CFQ:
348 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
349 ++ *
350 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
351 ++ * Paolo Valente <paolo.valente@×××××××.it>
352 ++ *
353 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
354 ++ *
355 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
356 ++ */
357 ++
358 ++#ifdef CONFIG_CGROUP_BFQIO
359 ++
360 ++static DEFINE_MUTEX(bfqio_mutex);
361 ++
362 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
363 ++{
364 ++ return bgrp ? !bgrp->online : false;
365 ++}
366 ++
367 ++static struct bfqio_cgroup bfqio_root_cgroup = {
368 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
369 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
370 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
371 ++};
372 ++
373 ++static inline void bfq_init_entity(struct bfq_entity *entity,
374 ++ struct bfq_group *bfqg)
375 ++{
376 ++ entity->weight = entity->new_weight;
377 ++ entity->orig_weight = entity->new_weight;
378 ++ entity->ioprio = entity->new_ioprio;
379 ++ entity->ioprio_class = entity->new_ioprio_class;
380 ++ entity->parent = bfqg->my_entity;
381 ++ entity->sched_data = &bfqg->sched_data;
382 ++}
383 ++
384 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
385 ++{
386 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
387 ++}
388 ++
389 ++/*
390 ++ * Search the hash table (for now only a list) of bgrp for the bfq_group
391 ++ * associated with bfqd. Must be called under rcu_read_lock().
392 ++ */
393 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
394 ++ struct bfq_data *bfqd)
395 ++{
396 ++ struct bfq_group *bfqg;
397 ++ void *key;
398 ++
399 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
400 ++ key = rcu_dereference(bfqg->bfqd);
401 ++ if (key == bfqd)
402 ++ return bfqg;
403 ++ }
404 ++
405 ++ return NULL;
406 ++}
407 ++
408 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
409 ++ struct bfq_group *bfqg)
410 ++{
411 ++ struct bfq_entity *entity = &bfqg->entity;
412 ++
413 ++ /*
414 ++ * If the weight of the entity has never been set via the sysfs
415 ++ * interface, then bgrp->weight == 0. In this case we initialize
416 ++ * the weight from the current ioprio value. Otherwise, the group
417 ++ * weight, if set, has priority over the ioprio value.
418 ++ */
419 ++ if (bgrp->weight == 0) {
420 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
421 ++ entity->new_ioprio = bgrp->ioprio;
422 ++ } else {
423 ++ entity->new_weight = bgrp->weight;
424 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
425 ++ }
426 ++ entity->orig_weight = entity->weight = entity->new_weight;
427 ++ entity->ioprio = entity->new_ioprio;
428 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
429 ++ entity->my_sched_data = &bfqg->sched_data;
430 ++}
431 ++
432 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
433 ++ struct bfq_group *parent)
434 ++{
435 ++ struct bfq_entity *entity;
436 ++
437 ++ BUG_ON(parent == NULL);
438 ++ BUG_ON(bfqg == NULL);
439 ++
440 ++ entity = &bfqg->entity;
441 ++ entity->parent = parent->my_entity;
442 ++ entity->sched_data = &parent->sched_data;
443 ++}
444 ++
445 ++/**
446 ++ * bfq_group_chain_alloc - allocate a chain of groups.
447 ++ * @bfqd: queue descriptor.
448 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
449 ++ *
450 ++ * Allocate a chain of groups starting from the one belonging to
451 ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
452 ++ * to the root already has an allocated group on @bfqd.
453 ++ */
454 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
455 ++ struct cgroup_subsys_state *css)
456 ++{
457 ++ struct bfqio_cgroup *bgrp;
458 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
459 ++
460 ++ for (; css != NULL; css = css->parent) {
461 ++ bgrp = css_to_bfqio(css);
462 ++
463 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
464 ++ if (bfqg != NULL) {
465 ++ /*
466 ++ * All the cgroups in the path from there to the
467 ++ * root must have a bfq_group for bfqd, so we don't
468 ++ * need any more allocations.
469 ++ */
470 ++ break;
471 ++ }
472 ++
473 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
474 ++ if (bfqg == NULL)
475 ++ goto cleanup;
476 ++
477 ++ bfq_group_init_entity(bgrp, bfqg);
478 ++ bfqg->my_entity = &bfqg->entity;
479 ++
480 ++ if (leaf == NULL) {
481 ++ leaf = bfqg;
482 ++ prev = leaf;
483 ++ } else {
484 ++ bfq_group_set_parent(prev, bfqg);
485 ++ /*
486 ++ * Build a list of allocated nodes using the bfqd
487 ++ * field, which is still unused and will be initialized
488 ++ * only after the node is connected.
489 ++ */
490 ++ prev->bfqd = bfqg;
491 ++ prev = bfqg;
492 ++ }
493 ++ }
494 ++
495 ++ return leaf;
496 ++
497 ++cleanup:
498 ++ while (leaf != NULL) {
499 ++ prev = leaf;
500 ++ leaf = leaf->bfqd;
501 ++ kfree(prev);
502 ++ }
503 ++
504 ++ return NULL;
505 ++}
506 ++
507 ++/**
508 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
509 ++ * @bfqd: the queue descriptor.
510 ++ * @css: the leaf cgroup_subsys_state to start from.
511 ++ * @leaf: the leaf group (to be associated to @cgroup).
512 ++ *
513 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
514 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
515 ++ * hierarchy that already as a group associated to @bfqd all the nodes
516 ++ * in the path to the root cgroup have one too.
517 ++ *
518 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
519 ++ * per device) while the bfqio_cgroup lock protects the list of groups
520 ++ * belonging to the same cgroup.
521 ++ */
522 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
523 ++ struct cgroup_subsys_state *css,
524 ++ struct bfq_group *leaf)
525 ++{
526 ++ struct bfqio_cgroup *bgrp;
527 ++ struct bfq_group *bfqg, *next, *prev = NULL;
528 ++ unsigned long flags;
529 ++
530 ++ assert_spin_locked(bfqd->queue->queue_lock);
531 ++
532 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
533 ++ bgrp = css_to_bfqio(css);
534 ++ next = leaf->bfqd;
535 ++
536 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
537 ++ BUG_ON(bfqg != NULL);
538 ++
539 ++ spin_lock_irqsave(&bgrp->lock, flags);
540 ++
541 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
542 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
543 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
544 ++
545 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
546 ++
547 ++ prev = leaf;
548 ++ leaf = next;
549 ++ }
550 ++
551 ++ BUG_ON(css == NULL && leaf != NULL);
552 ++ if (css != NULL && prev != NULL) {
553 ++ bgrp = css_to_bfqio(css);
554 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
555 ++ bfq_group_set_parent(prev, bfqg);
556 ++ }
557 ++}
558 ++
559 ++/**
560 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
561 ++ * @bfqd: queue descriptor.
562 ++ * @cgroup: cgroup being searched for.
563 ++ *
564 ++ * Return a group associated to @bfqd in @cgroup, allocating one if
565 ++ * necessary. When a group is returned all the cgroups in the path
566 ++ * to the root have a group associated to @bfqd.
567 ++ *
568 ++ * If the allocation fails, return the root group: this breaks guarantees
568 ++ * but is a safe fallback. If this loss becomes a problem it can be
570 ++ * mitigated using the equivalent weight (given by the product of the
571 ++ * weights of the groups in the path from @group to the root) in the
572 ++ * root scheduler.
573 ++ *
574 ++ * We allocate all the missing nodes in the path from the leaf cgroup
575 ++ * to the root and we connect the nodes only after all the allocations
576 ++ * have been successful.
577 ++ */
578 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
579 ++ struct cgroup_subsys_state *css)
580 ++{
581 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
582 ++ struct bfq_group *bfqg;
583 ++
584 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
585 ++ if (bfqg != NULL)
586 ++ return bfqg;
587 ++
588 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
589 ++ if (bfqg != NULL)
590 ++ bfq_group_chain_link(bfqd, css, bfqg);
591 ++ else
592 ++ bfqg = bfqd->root_group;
593 ++
594 ++ return bfqg;
595 ++}
596 ++
597 ++/**
598 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
599 ++ * @bfqd: queue descriptor.
600 ++ * @bfqq: the queue to move.
601 ++ * @entity: @bfqq's entity.
602 ++ * @bfqg: the group to move to.
603 ++ *
604 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
605 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
606 ++ *
607 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
608 ++ * not disappear (by now this just means that we are called under
609 ++ * rcu_read_lock()).
610 ++ */
611 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
612 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
613 ++{
614 ++ int busy, resume;
615 ++
616 ++ busy = bfq_bfqq_busy(bfqq);
617 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
618 ++
619 ++ BUG_ON(resume && !entity->on_st);
620 ++ BUG_ON(busy && !resume && entity->on_st &&
621 ++ bfqq != bfqd->in_service_queue);
622 ++
623 ++ if (busy) {
624 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
625 ++
626 ++ if (!resume)
627 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
628 ++ else
629 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
630 ++ } else if (entity->on_st)
631 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
632 ++
633 ++ /*
634 ++ * Here we use a reference to bfqg. We don't need a refcounter
635 ++ * as the cgroup reference will not be dropped, so that its
636 ++ * destroy() callback will not be invoked.
637 ++ */
638 ++ entity->parent = bfqg->my_entity;
639 ++ entity->sched_data = &bfqg->sched_data;
640 ++
641 ++ if (busy && resume)
642 ++ bfq_activate_bfqq(bfqd, bfqq);
643 ++
644 ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
645 ++ bfq_schedule_dispatch(bfqd);
646 ++}
647 ++
648 ++/**
649 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
650 ++ * @bfqd: the queue descriptor.
651 ++ * @bic: the bic to move.
652 ++ * @cgroup: the cgroup to move to.
653 ++ *
654 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
655 ++ * has to make sure that the reference to cgroup is valid across the call.
656 ++ *
657 ++ * NOTE: an alternative approach might have been to store the current
658 ++ * cgroup in bfqq and getting a reference to it, reducing the lookup
659 ++ * time here, at the price of slightly more complex code.
660 ++ */
661 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
662 ++ struct bfq_io_cq *bic,
663 ++ struct cgroup_subsys_state *css)
664 ++{
665 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
666 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
667 ++ struct bfq_entity *entity;
668 ++ struct bfq_group *bfqg;
669 ++ struct bfqio_cgroup *bgrp;
670 ++
671 ++ bgrp = css_to_bfqio(css);
672 ++
673 ++ bfqg = bfq_find_alloc_group(bfqd, css);
674 ++ if (async_bfqq != NULL) {
675 ++ entity = &async_bfqq->entity;
676 ++
677 ++ if (entity->sched_data != &bfqg->sched_data) {
678 ++ bic_set_bfqq(bic, NULL, 0);
679 ++ bfq_log_bfqq(bfqd, async_bfqq,
680 ++ "bic_change_group: %p %d",
681 ++ async_bfqq, atomic_read(&async_bfqq->ref));
682 ++ bfq_put_queue(async_bfqq);
683 ++ }
684 ++ }
685 ++
686 ++ if (sync_bfqq != NULL) {
687 ++ entity = &sync_bfqq->entity;
688 ++ if (entity->sched_data != &bfqg->sched_data)
689 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
690 ++ }
691 ++
692 ++ return bfqg;
693 ++}
694 ++
695 ++/**
696 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
697 ++ * @bic: the bic being migrated.
698 ++ * @cgroup: the destination cgroup.
699 ++ *
700 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
701 ++ * moved into its new parent group.
702 ++ */
703 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
704 ++ struct cgroup_subsys_state *css)
705 ++{
706 ++ struct bfq_data *bfqd;
707 ++ unsigned long uninitialized_var(flags);
708 ++
709 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
710 ++ &flags);
711 ++ if (bfqd != NULL) {
712 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
713 ++ bfq_put_bfqd_unlock(bfqd, &flags);
714 ++ }
715 ++}
716 ++
717 ++/**
718 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
719 ++ * @bic: the @bic to update.
720 ++ *
721 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
722 ++ * We need this in addition to moving bics during the cgroup attach
723 ++ * phase because the task owning @bic could be at its first disk
724 ++ * access or we may end up in the root cgroup as the result of a
725 ++ * memory allocation failure and here we try to move to the right
726 ++ * group.
727 ++ *
728 ++ * Must be called under the queue lock. It is safe to use the returned
729 ++ * value even after the rcu_read_unlock() as the migration/destruction
730 ++ * paths act under the queue lock too. IOW it is impossible to race with
731 ++ * group migration/destruction and end up with an invalid group as:
732 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
733 ++ * has started execution, as current holds a reference to it,
734 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
735 ++ * migrated to a different cgroup] its attach() callback will have
736 ++ * taken care of removing all the references to the old cgroup data.
737 ++ */
738 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
739 ++{
740 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
741 ++ struct bfq_group *bfqg;
742 ++ struct cgroup_subsys_state *css;
743 ++
744 ++ BUG_ON(bfqd == NULL);
745 ++
746 ++ rcu_read_lock();
747 ++ css = task_css(current, bfqio_subsys_id);
748 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
749 ++ rcu_read_unlock();
750 ++
751 ++ return bfqg;
752 ++}
753 ++
754 ++/**
755 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
756 ++ * @st: the service tree being flushed.
757 ++ */
758 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
759 ++{
760 ++ struct bfq_entity *entity = st->first_idle;
761 ++
762 ++ for (; entity != NULL; entity = st->first_idle)
763 ++ __bfq_deactivate_entity(entity, 0);
764 ++}
765 ++
766 ++/**
767 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
768 ++ * @bfqd: the device data structure with the root group.
769 ++ * @entity: the entity to move.
770 ++ */
771 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
772 ++ struct bfq_entity *entity)
773 ++{
774 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
775 ++
776 ++ BUG_ON(bfqq == NULL);
777 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
778 ++ return;
779 ++}
780 ++
781 ++/**
782 ++ * bfq_reparent_active_entities - move to the root group all active entities.
783 ++ * @bfqd: the device data structure with the root group.
784 ++ * @bfqg: the group to move from.
785 ++ * @st: the service tree with the entities.
786 ++ *
787 ++ * Needs queue_lock to be taken and reference to be valid over the call.
788 ++ */
789 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
790 ++ struct bfq_group *bfqg,
791 ++ struct bfq_service_tree *st)
792 ++{
793 ++ struct rb_root *active = &st->active;
794 ++ struct bfq_entity *entity = NULL;
795 ++
796 ++ if (!RB_EMPTY_ROOT(&st->active))
797 ++ entity = bfq_entity_of(rb_first(active));
798 ++
799 ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
800 ++ bfq_reparent_leaf_entity(bfqd, entity);
801 ++
802 ++ if (bfqg->sched_data.active_entity != NULL)
803 ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
804 ++
805 ++ return;
806 ++}
807 ++
808 ++/**
809 ++ * bfq_destroy_group - destroy @bfqg.
810 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
811 ++ * @bfqg: the group being destroyed.
812 ++ *
813 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
814 ++ */
815 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
816 ++{
817 ++ struct bfq_data *bfqd;
818 ++ struct bfq_service_tree *st;
819 ++ struct bfq_entity *entity = bfqg->my_entity;
820 ++ unsigned long uninitialized_var(flags);
821 ++ int i;
822 ++
823 ++ hlist_del(&bfqg->group_node);
824 ++
825 ++ /*
826 ++ * Empty all service_trees belonging to this group before deactivating
827 ++ * the group itself.
828 ++ */
829 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
830 ++ st = bfqg->sched_data.service_tree + i;
831 ++
832 ++ /*
833 ++ * The idle tree may still contain bfq_queues belonging
834 ++ * to exited tasks because they never migrated to a different
835 ++ * cgroup from the one being destroyed now. No one else
836 ++ * can access them so it's safe to act without any lock.
837 ++ */
838 ++ bfq_flush_idle_tree(st);
839 ++
840 ++ /*
841 ++ * It may happen that some queues are still active
842 ++ * (busy) upon group destruction (if the corresponding
843 ++ * processes have been forced to terminate). We move
844 ++ * all the leaf entities corresponding to these queues
845 ++ * to the root_group.
846 ++ * Also, it may happen that the group has an entity
847 ++ * under service, which is disconnected from the active
848 ++ * tree: it must be moved, too.
849 ++ * There is no need to put the sync queues, as the
850 ++ * scheduler has taken no reference.
851 ++ */
852 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
853 ++ if (bfqd != NULL) {
854 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
855 ++ bfq_put_bfqd_unlock(bfqd, &flags);
856 ++ }
857 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
858 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
859 ++ }
860 ++ BUG_ON(bfqg->sched_data.next_active != NULL);
861 ++ BUG_ON(bfqg->sched_data.active_entity != NULL);
862 ++
863 ++ /*
864 ++ * We may race with device destruction, take extra care when
865 ++ * dereferencing bfqg->bfqd.
866 ++ */
867 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
868 ++ if (bfqd != NULL) {
869 ++ hlist_del(&bfqg->bfqd_node);
870 ++ __bfq_deactivate_entity(entity, 0);
871 ++ bfq_put_async_queues(bfqd, bfqg);
872 ++ bfq_put_bfqd_unlock(bfqd, &flags);
873 ++ }
874 ++ BUG_ON(entity->tree != NULL);
875 ++
876 ++ /*
877 ++ * No need to defer the kfree() to the end of the RCU grace
878 ++ * period: we are called from the destroy() callback of our
879 ++ * cgroup, so we can be sure that no one is a) still using
880 ++ * this cgroup or b) doing lookups in it.
881 ++ */
882 ++ kfree(bfqg);
883 ++}
884 ++
885 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
886 ++{
887 ++ struct hlist_node *tmp;
888 ++ struct bfq_group *bfqg;
889 ++
890 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
891 ++ bfq_end_raising_async_queues(bfqd, bfqg);
892 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
893 ++}
894 ++
895 ++/**
896 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
897 ++ * @bfqd: the device descriptor being exited.
898 ++ *
899 ++ * When the device exits we just make sure that no lookup can return
900 ++ * the now unused group structures. They will be deallocated on cgroup
901 ++ * destruction.
902 ++ */
903 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
904 ++{
905 ++ struct hlist_node *tmp;
906 ++ struct bfq_group *bfqg;
907 ++
908 ++ bfq_log(bfqd, "disconnect_groups beginning");
909 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
910 ++ hlist_del(&bfqg->bfqd_node);
911 ++
912 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
913 ++
914 ++ /*
915 ++ * Don't remove from the group hash, just set an
916 ++ * invalid key. No lookups can race with the
917 ++ * assignment as bfqd is being destroyed; this
918 ++ * implies also that new elements cannot be added
919 ++ * to the list.
920 ++ */
921 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
922 ++
923 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
924 ++ bfqg);
925 ++ bfq_put_async_queues(bfqd, bfqg);
926 ++ }
927 ++}
928 ++
929 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
930 ++{
931 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
932 ++ struct bfq_group *bfqg = bfqd->root_group;
933 ++
934 ++ bfq_put_async_queues(bfqd, bfqg);
935 ++
936 ++ spin_lock_irq(&bgrp->lock);
937 ++ hlist_del_rcu(&bfqg->group_node);
938 ++ spin_unlock_irq(&bgrp->lock);
939 ++
940 ++ /*
941 ++ * No need to synchronize_rcu() here: since the device is gone
942 ++ * there cannot be any read-side access to its root_group.
943 ++ */
944 ++ kfree(bfqg);
945 ++}
946 ++
947 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
948 ++{
949 ++ struct bfq_group *bfqg;
950 ++ struct bfqio_cgroup *bgrp;
951 ++ int i;
952 ++
953 ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
954 ++ if (bfqg == NULL)
955 ++ return NULL;
956 ++
957 ++ bfqg->entity.parent = NULL;
958 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
959 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
960 ++
961 ++ bgrp = &bfqio_root_cgroup;
962 ++ spin_lock_irq(&bgrp->lock);
963 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
964 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
965 ++ spin_unlock_irq(&bgrp->lock);
966 ++
967 ++ return bfqg;
968 ++}
969 ++
970 ++#define SHOW_FUNCTION(__VAR) \
971 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
972 ++ struct cftype *cftype) \
973 ++{ \
974 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
975 ++ u64 ret = -ENODEV; \
976 ++ \
977 ++ mutex_lock(&bfqio_mutex); \
978 ++ if (bfqio_is_removed(bgrp)) \
979 ++ goto out_unlock; \
980 ++ \
981 ++ spin_lock_irq(&bgrp->lock); \
982 ++ ret = bgrp->__VAR; \
983 ++ spin_unlock_irq(&bgrp->lock); \
984 ++ \
985 ++out_unlock: \
986 ++ mutex_unlock(&bfqio_mutex); \
987 ++ return ret; \
988 ++}
989 ++
990 ++SHOW_FUNCTION(weight);
991 ++SHOW_FUNCTION(ioprio);
992 ++SHOW_FUNCTION(ioprio_class);
993 ++#undef SHOW_FUNCTION
994 ++
995 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
996 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
997 ++ struct cftype *cftype, \
998 ++ u64 val) \
999 ++{ \
1000 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
1001 ++ struct bfq_group *bfqg; \
1002 ++ int ret = -EINVAL; \
1003 ++ \
1004 ++ if (val < (__MIN) || val > (__MAX)) \
1005 ++ return ret; \
1006 ++ \
1007 ++ ret = -ENODEV; \
1008 ++ mutex_lock(&bfqio_mutex); \
1009 ++ if (bfqio_is_removed(bgrp)) \
1010 ++ goto out_unlock; \
1011 ++ ret = 0; \
1012 ++ \
1013 ++ spin_lock_irq(&bgrp->lock); \
1014 ++ bgrp->__VAR = (unsigned short)val; \
1015 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
1016 ++ /* \
1017 ++ * Setting the ioprio_changed flag of the entity \
1018 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
1019 ++ * the value of the weight to its ioprio mapping. \
1020 ++ * Set the flag only if necessary. \
1021 ++ */ \
1022 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
1023 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
1024 ++ smp_wmb(); \
1025 ++ bfqg->entity.ioprio_changed = 1; \
1026 ++ } \
1027 ++ } \
1028 ++ spin_unlock_irq(&bgrp->lock); \
1029 ++ \
1030 ++out_unlock: \
1031 ++ mutex_unlock(&bfqio_mutex); \
1032 ++ return ret; \
1033 ++}
1034 ++
1035 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
1036 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
1037 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
1038 ++#undef STORE_FUNCTION
1039 ++
1040 ++static struct cftype bfqio_files[] = {
1041 ++ {
1042 ++ .name = "weight",
1043 ++ .read_u64 = bfqio_cgroup_weight_read,
1044 ++ .write_u64 = bfqio_cgroup_weight_write,
1045 ++ },
1046 ++ {
1047 ++ .name = "ioprio",
1048 ++ .read_u64 = bfqio_cgroup_ioprio_read,
1049 ++ .write_u64 = bfqio_cgroup_ioprio_write,
1050 ++ },
1051 ++ {
1052 ++ .name = "ioprio_class",
1053 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
1054 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
1055 ++ },
1056 ++ { }, /* terminate */
1057 ++};
1058 ++
1059 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
1060 ++ *parent_css)
1061 ++{
1062 ++ struct bfqio_cgroup *bgrp;
1063 ++
1064 ++ if (parent_css != NULL) {
1065 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
1066 ++ if (bgrp == NULL)
1067 ++ return ERR_PTR(-ENOMEM);
1068 ++ } else
1069 ++ bgrp = &bfqio_root_cgroup;
1070 ++
1071 ++ spin_lock_init(&bgrp->lock);
1072 ++ INIT_HLIST_HEAD(&bgrp->group_data);
1073 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
1074 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
1075 ++
1076 ++ return &bgrp->css;
1077 ++}
1078 ++
1079 ++/*
1080 ++ * We cannot support shared io contexts, as we have no means to support
1081 ++ * two tasks with the same ioc in two different groups without major rework
1082 ++ * of the main bic/bfqq data structures. For now we allow a task to change
1083 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
1084 ++ * behavior is that a group containing a task that forked using CLONE_IO
1085 ++ * will not be destroyed until the tasks sharing the ioc die.
1086 ++ */
1087 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
1088 ++ struct cgroup_taskset *tset)
1089 ++{
1090 ++ struct task_struct *task;
1091 ++ struct io_context *ioc;
1092 ++ int ret = 0;
1093 ++
1094 ++ cgroup_taskset_for_each(task, css, tset) {
1095 ++ /*
1096 ++ * task_lock() is needed to avoid races with
1097 ++ * exit_io_context()
1098 ++ */
1099 ++ task_lock(task);
1100 ++ ioc = task->io_context;
1101 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1102 ++ /*
1103 ++ * ioc == NULL means that the task is either too young
1104 ++ * or exiting: if it still has no ioc, the ioc can't be
1105 ++ * shared, if the task is exiting the attach will fail
1106 ++ * anyway, no matter what we return here.
1107 ++ */
1108 ++ ret = -EINVAL;
1109 ++ task_unlock(task);
1110 ++ if (ret)
1111 ++ break;
1112 ++ }
1113 ++
1114 ++ return ret;
1115 ++}
1116 ++
1117 ++static void bfqio_attach(struct cgroup_subsys_state *css,
1118 ++ struct cgroup_taskset *tset)
1119 ++{
1120 ++ struct task_struct *task;
1121 ++ struct io_context *ioc;
1122 ++ struct io_cq *icq;
1123 ++
1124 ++ /*
1125 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
1126 ++ * new group has not yet been tested.
1127 ++ */
1128 ++ cgroup_taskset_for_each(task, css, tset) {
1129 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1130 ++ if (ioc) {
1131 ++ /*
1132 ++ * Handle cgroup change here.
1133 ++ */
1134 ++ rcu_read_lock();
1135 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1136 ++ if (!strncmp(
1137 ++ icq->q->elevator->type->elevator_name,
1138 ++ "bfq", ELV_NAME_MAX))
1139 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
1140 ++ css);
1141 ++ rcu_read_unlock();
1142 ++ put_io_context(ioc);
1143 ++ }
1144 ++ }
1145 ++}
1146 ++
1147 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
1148 ++{
1149 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1150 ++ struct hlist_node *tmp;
1151 ++ struct bfq_group *bfqg;
1152 ++
1153 ++ /*
1154 ++ * Since we are destroying the cgroup, there are no more tasks
1155 ++ * referencing it, and all the RCU grace periods that may have
1156 ++ * referenced it are ended (as the destruction of the parent
1157 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1158 ++ * anything else and we don't need any synchronization.
1159 ++ */
1160 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1161 ++ bfq_destroy_group(bgrp, bfqg);
1162 ++
1163 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
1164 ++
1165 ++ kfree(bgrp);
1166 ++}
1167 ++
1168 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
1169 ++{
1170 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1171 ++
1172 ++ mutex_lock(&bfqio_mutex);
1173 ++ bgrp->online = true;
1174 ++ mutex_unlock(&bfqio_mutex);
1175 ++
1176 ++ return 0;
1177 ++}
1178 ++
1179 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
1180 ++{
1181 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1182 ++
1183 ++ mutex_lock(&bfqio_mutex);
1184 ++ bgrp->online = false;
1185 ++ mutex_unlock(&bfqio_mutex);
1186 ++}
1187 ++
1188 ++struct cgroup_subsys bfqio_subsys = {
1189 ++ .name = "bfqio",
1190 ++ .css_alloc = bfqio_create,
1191 ++ .css_online = bfqio_css_online,
1192 ++ .css_offline = bfqio_css_offline,
1193 ++ .can_attach = bfqio_can_attach,
1194 ++ .attach = bfqio_attach,
1195 ++ .css_free = bfqio_destroy,
1196 ++ .subsys_id = bfqio_subsys_id,
1197 ++ .base_cftypes = bfqio_files,
1198 ++};
1199 ++#else
1200 ++static inline void bfq_init_entity(struct bfq_entity *entity,
1201 ++ struct bfq_group *bfqg)
1202 ++{
1203 ++ entity->weight = entity->new_weight;
1204 ++ entity->orig_weight = entity->new_weight;
1205 ++ entity->ioprio = entity->new_ioprio;
1206 ++ entity->ioprio_class = entity->new_ioprio_class;
1207 ++ entity->sched_data = &bfqg->sched_data;
1208 ++}
1209 ++
1210 ++static inline struct bfq_group *
1211 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1212 ++{
1213 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
1214 ++ return bfqd->root_group;
1215 ++}
1216 ++
1217 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1218 ++ struct bfq_queue *bfqq,
1219 ++ struct bfq_entity *entity,
1220 ++ struct bfq_group *bfqg)
1221 ++{
1222 ++}
1223 ++
1224 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
1225 ++{
1226 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1227 ++}
1228 ++
1229 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1230 ++{
1231 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
1232 ++}
1233 ++
1234 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
1235 ++{
1236 ++ kfree(bfqd->root_group);
1237 ++}
1238 ++
1239 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1240 ++{
1241 ++ struct bfq_group *bfqg;
1242 ++ int i;
1243 ++
1244 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1245 ++ if (bfqg == NULL)
1246 ++ return NULL;
1247 ++
1248 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1249 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1250 ++
1251 ++ return bfqg;
1252 ++}
1253 ++#endif
1254 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1255 +new file mode 100644
1256 +index 0000000..7f6b000
1257 +--- /dev/null
1258 ++++ b/block/bfq-ioc.c
1259 +@@ -0,0 +1,36 @@
1260 ++/*
1261 ++ * BFQ: I/O context handling.
1262 ++ *
1263 ++ * Based on ideas and code from CFQ:
1264 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1265 ++ *
1266 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1267 ++ * Paolo Valente <paolo.valente@×××××××.it>
1268 ++ *
1269 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1270 ++ */
1271 ++
1272 ++/**
1273 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1274 ++ * @icq: the iocontext queue.
1275 ++ */
1276 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1277 ++{
1278 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
1279 ++ return container_of(icq, struct bfq_io_cq, icq);
1280 ++}
1281 ++
1282 ++/**
1283 ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1284 ++ * @bfqd: the lookup key.
1285 ++ * @ioc: the io_context of the process doing I/O.
1286 ++ *
1287 ++ * Queue lock must be held.
1288 ++ */
1289 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1290 ++ struct io_context *ioc)
1291 ++{
1292 ++ if (ioc)
1293 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1294 ++ return NULL;
1295 ++}
1296 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1297 +new file mode 100644
1298 +index 0000000..7670400
1299 +--- /dev/null
1300 ++++ b/block/bfq-iosched.c
1301 +@@ -0,0 +1,3268 @@
1302 ++/*
1303 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
1304 ++ *
1305 ++ * Based on ideas and code from CFQ:
1306 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1307 ++ *
1308 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1309 ++ * Paolo Valente <paolo.valente@×××××××.it>
1310 ++ *
1311 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1312 ++ *
1313 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1314 ++ *
1315 ++ * BFQ is a proportional share disk scheduling algorithm based on the
1316 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
1317 ++ * number of sectors, to tasks instead of time slices. The disk is not granted
1318 ++ * to the in-service task for a given time slice, but until it has exhausted
1319 ++ * its assigned budget. This change from the time to the service domain allows
1320 ++ * BFQ to distribute the disk bandwidth among tasks as desired, without any
1321 ++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
1322 ++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
1323 ++ * their budgets (more precisely BFQ schedules queues associated to tasks).
1324 ++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
1325 ++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
1326 ++ * latencies to interactive and soft real-time applications.
1327 ++ *
1328 ++ * BFQ is described in [1], where also a reference to the initial, more
1329 ++ * theoretical paper on BFQ can be found. The interested reader can find in
1330 ++ * the latter paper full details on the main algorithm as well as formulas of
1331 ++ * the guarantees, plus formal proofs of all the properties. With respect to
1332 ++ * the version of BFQ presented in these papers, this implementation adds a
1333 ++ * few more heuristics, such as the one that guarantees a low latency to soft
1334 ++ * real-time applications, and a hierarchical extension based on H-WF2Q+.
1335 ++ *
1336 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1337 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1338 ++ * complexity derives from the one introduced with EEVDF in [3].
1339 ++ *
1340 ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1341 ++ * with the BFQ Disk I/O Scheduler'',
1342 ++ * Proceedings of the 5th Annual International Systems and Storage
1343 ++ * Conference (SYSTOR '12), June 2012.
1344 ++ *
1345 ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1346 ++ *
1347 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1348 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1349 ++ * Oct 1997.
1350 ++ *
1351 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1352 ++ *
1353 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1354 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
1355 ++ * Resource Allocation,'' technical report.
1356 ++ *
1357 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1358 ++ */
1359 ++#include <linux/module.h>
1360 ++#include <linux/slab.h>
1361 ++#include <linux/blkdev.h>
1362 ++#include <linux/cgroup.h>
1363 ++#include <linux/elevator.h>
1364 ++#include <linux/jiffies.h>
1365 ++#include <linux/rbtree.h>
1366 ++#include <linux/ioprio.h>
1367 ++#include "bfq.h"
1368 ++#include "blk.h"
1369 ++
1370 ++/* Max number of dispatches in one round of service. */
1371 ++static const int bfq_quantum = 4;
1372 ++
1373 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1374 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1375 ++
1376 ++/* Maximum backwards seek, in KiB. */
1377 ++static const int bfq_back_max = 16 * 1024;
1378 ++
1379 ++/* Penalty of a backwards seek, in number of sectors. */
1380 ++static const int bfq_back_penalty = 2;
1381 ++
1382 ++/* Idling period duration, in jiffies. */
1383 ++static int bfq_slice_idle = HZ / 125;
1384 ++
1385 ++/* Default maximum budget values, in sectors and number of requests. */
1386 ++static const int bfq_default_max_budget = 16 * 1024;
1387 ++static const int bfq_max_budget_async_rq = 4;
1388 ++
1389 ++/*
1390 ++ * Async to sync throughput distribution is controlled as follows:
1391 ++ * when an async request is served, the entity is charged the number
1392 ++ * of sectors of the request, multiplied by the factor below
1393 ++ */
1394 ++static const int bfq_async_charge_factor = 10;
1395 ++
1396 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
1397 ++static const int bfq_timeout_sync = HZ / 8;
1398 ++static int bfq_timeout_async = HZ / 25;
1399 ++
1400 ++struct kmem_cache *bfq_pool;
1401 ++
1402 ++/* Below this threshold (in ms), we consider thinktime immediate. */
1403 ++#define BFQ_MIN_TT 2
1404 ++
1405 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
1406 ++#define BFQ_HW_QUEUE_THRESHOLD 4
1407 ++#define BFQ_HW_QUEUE_SAMPLES 32
1408 ++
1409 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1410 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1411 ++
1412 ++/* Min samples used for peak rate estimation (for autotuning). */
1413 ++#define BFQ_PEAK_RATE_SAMPLES 32
1414 ++
1415 ++/* Shift used for peak rate fixed precision calculations. */
1416 ++#define BFQ_RATE_SHIFT 16
1417 ++
1418 ++/*
1419 ++ * The duration of the weight raising for interactive applications is
1420 ++ * computed automatically (as default behaviour), using the following
1421 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
1422 ++ * disk, and R and T are two reference parameters. In particular, R is
1423 ++ * the peak rate of a reference disk, and T is about the maximum time
1424 ++ * for starting popular large applications on that disk, under BFQ and
1425 ++ * while reading two files in parallel. Finally, BFQ uses two
1426 ++ * different pairs (R, T) depending on whether the disk is rotational
1427 ++ * or non-rotational.
1428 ++ */
1429 ++#define T_rot (msecs_to_jiffies(5500))
1430 ++#define T_nonrot (msecs_to_jiffies(2000))
1431 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1432 ++#define R_rot 17415
1433 ++#define R_nonrot 34791
1434 ++
1435 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1436 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1437 ++
1438 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1439 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1440 ++
1441 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1442 ++
1443 ++#include "bfq-ioc.c"
1444 ++#include "bfq-sched.c"
1445 ++#include "bfq-cgroup.c"
1446 ++
1447 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1448 ++ IOPRIO_CLASS_IDLE)
1449 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1450 ++ IOPRIO_CLASS_RT)
1451 ++
1452 ++#define bfq_sample_valid(samples) ((samples) > 80)
1453 ++
1454 ++/*
1455 ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1456 ++ * set (in which case it could also be a direct WRITE).
1457 ++ */
1458 ++static inline int bfq_bio_sync(struct bio *bio)
1459 ++{
1460 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1461 ++ return 1;
1462 ++
1463 ++ return 0;
1464 ++}
1465 ++
1466 ++/*
1467 ++ * Scheduler run of queue, if there are requests pending and no one in the
1468 ++ * driver that will restart queueing.
1469 ++ */
1470 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1471 ++{
1472 ++ if (bfqd->queued != 0) {
1473 ++ bfq_log(bfqd, "schedule dispatch");
1474 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1475 ++ }
1476 ++}
1477 ++
1478 ++/*
1479 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1480 ++ * We choose the request that is closest to the head right now. Distance
1481 ++ * behind the head is penalized and only allowed to a certain extent.
1482 ++ */
1483 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
1484 ++ struct request *rq1,
1485 ++ struct request *rq2,
1486 ++ sector_t last)
1487 ++{
1488 ++ sector_t s1, s2, d1 = 0, d2 = 0;
1489 ++ unsigned long back_max;
1490 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1491 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1492 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1493 ++
1494 ++ if (rq1 == NULL || rq1 == rq2)
1495 ++ return rq2;
1496 ++ if (rq2 == NULL)
1497 ++ return rq1;
1498 ++
1499 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1500 ++ return rq1;
1501 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1502 ++ return rq2;
1503 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1504 ++ return rq1;
1505 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1506 ++ return rq2;
1507 ++
1508 ++ s1 = blk_rq_pos(rq1);
1509 ++ s2 = blk_rq_pos(rq2);
1510 ++
1511 ++ /*
1512 ++ * By definition, 1KiB is 2 sectors.
1513 ++ */
1514 ++ back_max = bfqd->bfq_back_max * 2;
1515 ++
1516 ++ /*
1517 ++ * Strict one way elevator _except_ in the case where we allow
1518 ++ * short backward seeks which are biased as twice the cost of a
1519 ++ * similar forward seek.
1520 ++ */
1521 ++ if (s1 >= last)
1522 ++ d1 = s1 - last;
1523 ++ else if (s1 + back_max >= last)
1524 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
1525 ++ else
1526 ++ wrap |= BFQ_RQ1_WRAP;
1527 ++
1528 ++ if (s2 >= last)
1529 ++ d2 = s2 - last;
1530 ++ else if (s2 + back_max >= last)
1531 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
1532 ++ else
1533 ++ wrap |= BFQ_RQ2_WRAP;
1534 ++
1535 ++ /* Found required data */
1536 ++
1537 ++ /*
1538 ++ * By doing switch() on the bit mask "wrap" we avoid having to
1539 ++ * check two variables for all permutations: --> faster!
1540 ++ */
1541 ++ switch (wrap) {
1542 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1543 ++ if (d1 < d2)
1544 ++ return rq1;
1545 ++ else if (d2 < d1)
1546 ++ return rq2;
1547 ++ else {
1548 ++ if (s1 >= s2)
1549 ++ return rq1;
1550 ++ else
1551 ++ return rq2;
1552 ++ }
1553 ++
1554 ++ case BFQ_RQ2_WRAP:
1555 ++ return rq1;
1556 ++ case BFQ_RQ1_WRAP:
1557 ++ return rq2;
1558 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1559 ++ default:
1560 ++ /*
1561 ++ * Since both rqs are wrapped,
1562 ++ * start with the one that's further behind head
1563 ++ * (--> only *one* back seek required),
1564 ++ * since back seek takes more time than forward.
1565 ++ */
1566 ++ if (s1 <= s2)
1567 ++ return rq1;
1568 ++ else
1569 ++ return rq2;
1570 ++ }
1571 ++}
1572 ++
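The seek-distance rule in bfq_choose_req() above can be tried out in isolation. The standalone userspace sketch below mirrors it with made-up values for BACK_MAX and BACK_PENALTY (rough stand-ins for bfqd->bfq_back_max * 2 and bfqd->bfq_back_penalty); it is only an illustration, not part of the patch.

#include <stdio.h>

/* Hypothetical stand-ins for bfqd->bfq_back_max * 2 and bfqd->bfq_back_penalty. */
#define BACK_MAX     (16 * 1024)   /* made-up back_max, in sectors */
#define BACK_PENALTY 2

/* Distance of sector s from the head position 'last': forward seeks count
 * plainly, short backward seeks are penalized, far-behind requests wrap. */
static unsigned long long seek_distance(unsigned long long s,
                                        unsigned long long last, int *wrap)
{
        *wrap = 0;
        if (s >= last)
                return s - last;                  /* forward seek: plain distance */
        if (s + BACK_MAX >= last)
                return (last - s) * BACK_PENALTY; /* short backward seek: penalized */
        *wrap = 1;                                /* too far behind the head */
        return 0;
}

int main(void)
{
        int w1, w2;
        unsigned long long d1 = seek_distance(1200, 1000, &w1); /* ahead by 200 */
        unsigned long long d2 = seek_distance(900, 1000, &w2);  /* behind by 100 -> 200 */
        printf("d1=%llu wrap=%d, d2=%llu wrap=%d\n", d1, w1, d2, w2);
        return 0;
}

With these numbers a 200-sector forward seek and a 100-sector backward seek end up with the same effective distance, which is exactly the bias the comment above describes.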
1573 ++static struct bfq_queue *
1574 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1575 ++ sector_t sector, struct rb_node **ret_parent,
1576 ++ struct rb_node ***rb_link)
1577 ++{
1578 ++ struct rb_node **p, *parent;
1579 ++ struct bfq_queue *bfqq = NULL;
1580 ++
1581 ++ parent = NULL;
1582 ++ p = &root->rb_node;
1583 ++ while (*p) {
1584 ++ struct rb_node **n;
1585 ++
1586 ++ parent = *p;
1587 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1588 ++
1589 ++ /*
1590 ++ * Sort strictly based on sector. Smallest to the left,
1591 ++ * largest to the right.
1592 ++ */
1593 ++ if (sector > blk_rq_pos(bfqq->next_rq))
1594 ++ n = &(*p)->rb_right;
1595 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
1596 ++ n = &(*p)->rb_left;
1597 ++ else
1598 ++ break;
1599 ++ p = n;
1600 ++ bfqq = NULL;
1601 ++ }
1602 ++
1603 ++ *ret_parent = parent;
1604 ++ if (rb_link)
1605 ++ *rb_link = p;
1606 ++
1607 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1608 ++ (long long unsigned)sector,
1609 ++ bfqq != NULL ? bfqq->pid : 0);
1610 ++
1611 ++ return bfqq;
1612 ++}
1613 ++
1614 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1615 ++{
1616 ++ struct rb_node **p, *parent;
1617 ++ struct bfq_queue *__bfqq;
1618 ++
1619 ++ if (bfqq->pos_root != NULL) {
1620 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1621 ++ bfqq->pos_root = NULL;
1622 ++ }
1623 ++
1624 ++ if (bfq_class_idle(bfqq))
1625 ++ return;
1626 ++ if (!bfqq->next_rq)
1627 ++ return;
1628 ++
1629 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
1630 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1631 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
1632 ++ if (__bfqq == NULL) {
1633 ++ rb_link_node(&bfqq->pos_node, parent, p);
1634 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1635 ++ } else
1636 ++ bfqq->pos_root = NULL;
1637 ++}
1638 ++
1639 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1640 ++ struct bfq_queue *bfqq,
1641 ++ struct request *last)
1642 ++{
1643 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
1644 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
1645 ++ struct request *next = NULL, *prev = NULL;
1646 ++
1647 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1648 ++
1649 ++ if (rbprev != NULL)
1650 ++ prev = rb_entry_rq(rbprev);
1651 ++
1652 ++ if (rbnext != NULL)
1653 ++ next = rb_entry_rq(rbnext);
1654 ++ else {
1655 ++ rbnext = rb_first(&bfqq->sort_list);
1656 ++ if (rbnext && rbnext != &last->rb_node)
1657 ++ next = rb_entry_rq(rbnext);
1658 ++ }
1659 ++
1660 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1661 ++}
1662 ++
1663 ++static void bfq_del_rq_rb(struct request *rq)
1664 ++{
1665 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1666 ++ struct bfq_data *bfqd = bfqq->bfqd;
1667 ++ const int sync = rq_is_sync(rq);
1668 ++
1669 ++ BUG_ON(bfqq->queued[sync] == 0);
1670 ++ bfqq->queued[sync]--;
1671 ++ bfqd->queued--;
1672 ++
1673 ++ elv_rb_del(&bfqq->sort_list, rq);
1674 ++
1675 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1676 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1677 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1678 ++ /*
1679 ++ * Remove queue from request-position tree as it is empty.
1680 ++ */
1681 ++ if (bfqq->pos_root != NULL) {
1682 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1683 ++ bfqq->pos_root = NULL;
1684 ++ }
1685 ++ }
1686 ++}
1687 ++
1688 ++/* see the definition of bfq_async_charge_factor for details */
1689 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
1690 ++ struct bfq_queue *bfqq)
1691 ++{
1692 ++ return blk_rq_sectors(rq) *
1693 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1694 ++ bfq_async_charge_factor));
1695 ++}
1696 ++
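The charging rule in bfq_serv_to_charge() is compact enough to be easy to misread: only non-weight-raised async queues pay the extra factor. The sketch below works through it with a hypothetical value for bfq_async_charge_factor (the real constant is defined elsewhere in the patch).

#include <stdio.h>

/* Hypothetical stand-in for bfq_async_charge_factor. */
#define ASYNC_CHARGE_FACTOR 10

/* Mirror of the rule above: sync queues and weight-raised queues
 * (raising_coeff > 1) are charged the plain size in sectors; non-raised
 * async queues are charged (1 + ASYNC_CHARGE_FACTOR) times the size. */
static unsigned long serv_to_charge(unsigned long sectors, int sync,
                                    unsigned int raising_coeff)
{
        return sectors * (1 + (!sync) * (raising_coeff == 1) * ASYNC_CHARGE_FACTOR);
}

int main(void)
{
        printf("sync 8-sector rq:         %lu\n", serv_to_charge(8, 1, 1)); /* 8 */
        printf("async 8-sector rq:        %lu\n", serv_to_charge(8, 0, 1)); /* 88 */
        printf("raised async 8-sector rq: %lu\n", serv_to_charge(8, 0, 2)); /* 8 */
        return 0;
}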
1697 ++/**
1698 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
1699 ++ * @bfqd: the device data the queue belongs to.
1700 ++ * @bfqq: the queue to update.
1701 ++ *
1702 ++ * If the first request of a queue changes we make sure that the queue
1703 ++ * has enough budget to serve at least its first request (if the
1704 ++ * request has grown). We do this because if the queue does not have enough
1705 ++ * budget for its first request, it has to go through two dispatch
1706 ++ * rounds to actually get it dispatched.
1707 ++ */
1708 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
1709 ++ struct bfq_queue *bfqq)
1710 ++{
1711 ++ struct bfq_entity *entity = &bfqq->entity;
1712 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1713 ++ struct request *next_rq = bfqq->next_rq;
1714 ++ unsigned long new_budget;
1715 ++
1716 ++ if (next_rq == NULL)
1717 ++ return;
1718 ++
1719 ++ if (bfqq == bfqd->in_service_queue)
1720 ++ /*
1721 ++ * In order not to break guarantees, budgets cannot be
1722 ++ * changed after an entity has been selected.
1723 ++ */
1724 ++ return;
1725 ++
1726 ++ BUG_ON(entity->tree != &st->active);
1727 ++ BUG_ON(entity == entity->sched_data->active_entity);
1728 ++
1729 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
1730 ++ bfq_serv_to_charge(next_rq, bfqq));
1731 ++ entity->budget = new_budget;
1732 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1733 ++ bfq_activate_bfqq(bfqd, bfqq);
1734 ++}
1735 ++
1736 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1737 ++{
1738 ++ u64 dur;
1739 ++
1740 ++ if (bfqd->bfq_raising_max_time > 0)
1741 ++ return bfqd->bfq_raising_max_time;
1742 ++
1743 ++ dur = bfqd->RT_prod;
1744 ++ do_div(dur, bfqd->peak_rate);
1745 ++
1746 ++ return dur;
1747 ++}
1748 ++
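The duration computed here is simply RT_prod (a reference rate times a reference duration, kept in bfqd->RT_prod) divided by the measured peak rate, unless the user forced bfq_raising_max_time, so slower devices get longer weight-raising periods. A rough standalone sketch with made-up numbers:

#include <stdio.h>
#include <stdint.h>

/* Sketch of the rule above: rt_prod stands in for bfqd->RT_prod and the
 * plain division stands in for do_div(dur, bfqd->peak_rate). */
static uint64_t wrais_duration(uint64_t rt_prod, uint64_t peak_rate,
                               uint64_t max_time_override)
{
        if (max_time_override > 0)
                return max_time_override;  /* user-set bfq_raising_max_time wins */
        return rt_prod / peak_rate;
}

int main(void)
{
        uint64_t rt_prod = 1000000;        /* hypothetical reference product */
        printf("slow device: %llu\n",
               (unsigned long long)wrais_duration(rt_prod, 100, 0));  /* 10000 */
        printf("fast device: %llu\n",
               (unsigned long long)wrais_duration(rt_prod, 1000, 0)); /* 1000 */
        return 0;
}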
1749 ++static void bfq_add_rq_rb(struct request *rq)
1750 ++{
1751 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1752 ++ struct bfq_entity *entity = &bfqq->entity;
1753 ++ struct bfq_data *bfqd = bfqq->bfqd;
1754 ++ struct request *next_rq, *prev;
1755 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
1756 ++ int idle_for_long_time = 0;
1757 ++
1758 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1759 ++ bfqq->queued[rq_is_sync(rq)]++;
1760 ++ bfqd->queued++;
1761 ++
1762 ++ elv_rb_add(&bfqq->sort_list, rq);
1763 ++
1764 ++ /*
1765 ++ * Check if this request is a better next-serve candidate.
1766 ++ */
1767 ++ prev = bfqq->next_rq;
1768 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1769 ++ BUG_ON(next_rq == NULL);
1770 ++ bfqq->next_rq = next_rq;
1771 ++
1772 ++ /*
1773 ++ * Adjust priority tree position, if next_rq changes.
1774 ++ */
1775 ++ if (prev != bfqq->next_rq)
1776 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
1777 ++
1778 ++ if (!bfq_bfqq_busy(bfqq)) {
1779 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1780 ++ time_is_before_jiffies(bfqq->soft_rt_next_start);
1781 ++ idle_for_long_time = time_is_before_jiffies(
1782 ++ bfqq->budget_timeout +
1783 ++ bfqd->bfq_raising_min_idle_time);
1784 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
1785 ++ bfq_serv_to_charge(next_rq, bfqq));
1786 ++
1787 ++ if (!bfqd->low_latency)
1788 ++ goto add_bfqq_busy;
1789 ++
1790 ++ /*
1791 ++ * If the queue is not being boosted and has been idle
1792 ++ * for enough time, start a weight-raising period
1793 ++ */
1794 ++ if (old_raising_coeff == 1 &&
1795 ++ (idle_for_long_time || soft_rt)) {
1796 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1797 ++ if (idle_for_long_time)
1798 ++ bfqq->raising_cur_max_time =
1799 ++ bfq_wrais_duration(bfqd);
1800 ++ else
1801 ++ bfqq->raising_cur_max_time =
1802 ++ bfqd->bfq_raising_rt_max_time;
1803 ++ bfq_log_bfqq(bfqd, bfqq,
1804 ++ "wrais starting at %llu msec,"
1805 ++ "rais_max_time %u",
1806 ++ bfqq->last_rais_start_finish,
1807 ++ jiffies_to_msecs(bfqq->
1808 ++ raising_cur_max_time));
1809 ++ } else if (old_raising_coeff > 1) {
1810 ++ if (idle_for_long_time)
1811 ++ bfqq->raising_cur_max_time =
1812 ++ bfq_wrais_duration(bfqd);
1813 ++ else if (bfqq->raising_cur_max_time ==
1814 ++ bfqd->bfq_raising_rt_max_time &&
1815 ++ !soft_rt) {
1816 ++ bfqq->raising_coeff = 1;
1817 ++ bfq_log_bfqq(bfqd, bfqq,
1818 ++ "wrais ending at %llu msec,"
1819 ++ "rais_max_time %u",
1820 ++ bfqq->last_rais_start_finish,
1821 ++ jiffies_to_msecs(bfqq->
1822 ++ raising_cur_max_time));
1823 ++ } else if ((bfqq->last_rais_start_finish +
1824 ++ bfqq->raising_cur_max_time <
1825 ++ jiffies + bfqd->bfq_raising_rt_max_time) &&
1826 ++ soft_rt) {
1827 ++ /*
1828 ++ *
1829 ++ * The remaining weight-raising time is lower
1830 ++ * than bfqd->bfq_raising_rt_max_time, which
1831 ++ * means that the application is enjoying
1832 ++ * weight raising either because deemed soft rt
1833 ++ * in the near past, or because deemed
1834 ++ * interactive long ago. In both cases,
1835 ++ * resetting now the current remaining weight-
1836 ++ * raising time for the application to the
1837 ++ * weight-raising duration for soft rt
1838 ++ * applications would not cause any latency
1839 ++ * increase for the application (as the new
1840 ++ * duration would be higher than the remaining
1841 ++ * time).
1842 ++ *
1843 ++ * In addition, the application is now meeting
1844 ++ * the requirements for being deemed soft rt.
1845 ++ * In the end we can correctly and safely
1846 ++ * (re)charge the weight-raising duration for
1847 ++ * the application with the weight-raising
1848 ++ * duration for soft rt applications.
1849 ++ *
1850 ++ * In particular, doing this recharge now, i.e.,
1851 ++ * before the weight-raising period for the
1852 ++ * application finishes, reduces the probability
1853 ++ * of the following negative scenario:
1854 ++ * 1) the weight of a soft rt application is
1855 ++ * raised at startup (as for any newly
1856 ++ * created application),
1857 ++ * 2) since the application is not interactive,
1858 ++ * at a certain time weight-raising is
1859 ++ * stopped for the application,
1860 ++ * 3) at that time the application happens to
1861 ++ * still have pending requests, and hence
1862 ++ * is destined to not have a chance to be
1863 ++ * deemed soft rt before these requests are
1864 ++ * completed (see the comments to the
1865 ++ * function bfq_bfqq_softrt_next_start()
1866 ++ * for details on soft rt detection),
1867 ++ * 4) these pending requests experience a high
1868 ++ * latency because the application is not
1869 ++ * weight-raised while they are pending.
1870 ++ */
1871 ++ bfqq->last_rais_start_finish = jiffies;
1872 ++ bfqq->raising_cur_max_time =
1873 ++ bfqd->bfq_raising_rt_max_time;
1874 ++ }
1875 ++ }
1876 ++ if (old_raising_coeff != bfqq->raising_coeff)
1877 ++ entity->ioprio_changed = 1;
1878 ++add_bfqq_busy:
1879 ++ bfqq->last_idle_bklogged = jiffies;
1880 ++ bfqq->service_from_backlogged = 0;
1881 ++ bfq_clear_bfqq_softrt_update(bfqq);
1882 ++ bfq_add_bfqq_busy(bfqd, bfqq);
1883 ++ } else {
1884 ++ if (bfqd->low_latency && old_raising_coeff == 1 &&
1885 ++ !rq_is_sync(rq) &&
1886 ++ bfqq->last_rais_start_finish +
1887 ++ time_is_before_jiffies(
1888 ++ bfqd->bfq_raising_min_inter_arr_async)) {
1889 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1890 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1891 ++
1892 ++ bfqd->raised_busy_queues++;
1893 ++ entity->ioprio_changed = 1;
1894 ++ bfq_log_bfqq(bfqd, bfqq,
1895 ++ "non-idle wrais starting at %llu msec,"
1896 ++ "rais_max_time %u",
1897 ++ bfqq->last_rais_start_finish,
1898 ++ jiffies_to_msecs(bfqq->
1899 ++ raising_cur_max_time));
1900 ++ }
1901 ++ bfq_updated_next_req(bfqd, bfqq);
1902 ++ }
1903 ++
1904 ++ if (bfqd->low_latency &&
1905 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1906 ++ idle_for_long_time))
1907 ++ bfqq->last_rais_start_finish = jiffies;
1908 ++}
1909 ++
1910 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1911 ++{
1912 ++ elv_rb_del(&bfqq->sort_list, rq);
1913 ++ bfqq->queued[rq_is_sync(rq)]--;
1914 ++ bfqq->bfqd->queued--;
1915 ++ bfq_add_rq_rb(rq);
1916 ++}
1917 ++
1918 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1919 ++ struct bio *bio)
1920 ++{
1921 ++ struct task_struct *tsk = current;
1922 ++ struct bfq_io_cq *bic;
1923 ++ struct bfq_queue *bfqq;
1924 ++
1925 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1926 ++ if (bic == NULL)
1927 ++ return NULL;
1928 ++
1929 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1930 ++ if (bfqq != NULL)
1931 ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
1932 ++
1933 ++ return NULL;
1934 ++}
1935 ++
1936 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
1937 ++{
1938 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1939 ++
1940 ++ bfqd->rq_in_driver++;
1941 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1942 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1943 ++ (long long unsigned)bfqd->last_position);
1944 ++}
1945 ++
1946 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1947 ++{
1948 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1949 ++
1950 ++ WARN_ON(bfqd->rq_in_driver == 0);
1951 ++ bfqd->rq_in_driver--;
1952 ++}
1953 ++
1954 ++static void bfq_remove_request(struct request *rq)
1955 ++{
1956 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1957 ++ struct bfq_data *bfqd = bfqq->bfqd;
1958 ++
1959 ++ if (bfqq->next_rq == rq) {
1960 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1961 ++ bfq_updated_next_req(bfqd, bfqq);
1962 ++ }
1963 ++
1964 ++ list_del_init(&rq->queuelist);
1965 ++ bfq_del_rq_rb(rq);
1966 ++
1967 ++ if (rq->cmd_flags & REQ_META) {
1968 ++ WARN_ON(bfqq->meta_pending == 0);
1969 ++ bfqq->meta_pending--;
1970 ++ }
1971 ++}
1972 ++
1973 ++static int bfq_merge(struct request_queue *q, struct request **req,
1974 ++ struct bio *bio)
1975 ++{
1976 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
1977 ++ struct request *__rq;
1978 ++
1979 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
1980 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1981 ++ *req = __rq;
1982 ++ return ELEVATOR_FRONT_MERGE;
1983 ++ }
1984 ++
1985 ++ return ELEVATOR_NO_MERGE;
1986 ++}
1987 ++
1988 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
1989 ++ int type)
1990 ++{
1991 ++ if (type == ELEVATOR_FRONT_MERGE) {
1992 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
1993 ++
1994 ++ bfq_reposition_rq_rb(bfqq, req);
1995 ++ }
1996 ++}
1997 ++
1998 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1999 ++ struct request *next)
2000 ++{
2001 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2002 ++
2003 ++ /*
2004 ++ * Reposition in fifo if next is older than rq.
2005 ++ */
2006 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2007 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
2008 ++ list_move(&rq->queuelist, &next->queuelist);
2009 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
2010 ++ }
2011 ++
2012 ++ if (bfqq->next_rq == next)
2013 ++ bfqq->next_rq = rq;
2014 ++
2015 ++ bfq_remove_request(next);
2016 ++}
2017 ++
2018 ++/* Must be called with bfqq != NULL */
2019 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
2020 ++{
2021 ++ BUG_ON(bfqq == NULL);
2022 ++ if (bfq_bfqq_busy(bfqq))
2023 ++ bfqq->bfqd->raised_busy_queues--;
2024 ++ bfqq->raising_coeff = 1;
2025 ++ bfqq->raising_cur_max_time = 0;
2026 ++ /* Trigger a weight change on the next activation of the queue */
2027 ++ bfqq->entity.ioprio_changed = 1;
2028 ++}
2029 ++
2030 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
2031 ++ struct bfq_group *bfqg)
2032 ++{
2033 ++ int i, j;
2034 ++
2035 ++ for (i = 0; i < 2; i++)
2036 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
2037 ++ if (bfqg->async_bfqq[i][j] != NULL)
2038 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
2039 ++ if (bfqg->async_idle_bfqq != NULL)
2040 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
2041 ++}
2042 ++
2043 ++static void bfq_end_raising(struct bfq_data *bfqd)
2044 ++{
2045 ++ struct bfq_queue *bfqq;
2046 ++
2047 ++ spin_lock_irq(bfqd->queue->queue_lock);
2048 ++
2049 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
2050 ++ bfq_bfqq_end_raising(bfqq);
2051 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2052 ++ bfq_bfqq_end_raising(bfqq);
2053 ++ bfq_end_raising_async(bfqd);
2054 ++
2055 ++ spin_unlock_irq(bfqd->queue->queue_lock);
2056 ++}
2057 ++
2058 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
2059 ++ struct bio *bio)
2060 ++{
2061 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2062 ++ struct bfq_io_cq *bic;
2063 ++ struct bfq_queue *bfqq;
2064 ++
2065 ++ /*
2066 ++ * Disallow merge of a sync bio into an async request.
2067 ++ */
2068 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
2069 ++ return 0;
2070 ++
2071 ++ /*
2072 ++ * Lookup the bfqq that this bio will be queued with. Allow
2073 ++ * merge only if rq is queued there.
2074 ++ * Queue lock is held here.
2075 ++ */
2076 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
2077 ++ if (bic == NULL)
2078 ++ return 0;
2079 ++
2080 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2081 ++ return bfqq == RQ_BFQQ(rq);
2082 ++}
2083 ++
2084 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2085 ++ struct bfq_queue *bfqq)
2086 ++{
2087 ++ if (bfqq != NULL) {
2088 ++ bfq_mark_bfqq_must_alloc(bfqq);
2089 ++ bfq_mark_bfqq_budget_new(bfqq);
2090 ++ bfq_clear_bfqq_fifo_expire(bfqq);
2091 ++
2092 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
2093 ++
2094 ++ bfq_log_bfqq(bfqd, bfqq,
2095 ++ "set_in_service_queue, cur-budget = %lu",
2096 ++ bfqq->entity.budget);
2097 ++ }
2098 ++
2099 ++ bfqd->in_service_queue = bfqq;
2100 ++}
2101 ++
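The update of bfqd->budgets_assigned above is an integer low-pass counter that creeps towards 256 by one eighth of the remaining gap per assignment; bfq_max_budget() and bfq_min_budget() further down treat values of at least 194 (roughly three quarters of 256) as "enough budgets assigned to trust the autotuned maximum". A tiny sketch of how quickly that threshold is reached:

#include <stdio.h>

int main(void)
{
        int budgets_assigned = 0;

        /* Same recurrence as in __bfq_set_in_service_queue() above. */
        for (int i = 1; i <= 20; i++) {
                budgets_assigned = (budgets_assigned * 7 + 256) / 8;
                printf("after %2d assignments: %d\n", i, budgets_assigned);
                if (budgets_assigned >= 194) {
                        printf("autotuning threshold (194) reached\n");
                        break;
                }
        }
        return 0;
}

Starting from zero, the counter reaches 194 after eleven in-service assignments and keeps converging towards 256 afterwards.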
2102 ++/*
2103 ++ * Get and set a new queue for service.
2104 ++ */
2105 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
2106 ++ struct bfq_queue *bfqq)
2107 ++{
2108 ++ if (!bfqq)
2109 ++ bfqq = bfq_get_next_queue(bfqd);
2110 ++ else
2111 ++ bfq_get_next_queue_forced(bfqd, bfqq);
2112 ++
2113 ++ __bfq_set_in_service_queue(bfqd, bfqq);
2114 ++ return bfqq;
2115 ++}
2116 ++
2117 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2118 ++ struct request *rq)
2119 ++{
2120 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
2121 ++ return blk_rq_pos(rq) - bfqd->last_position;
2122 ++ else
2123 ++ return bfqd->last_position - blk_rq_pos(rq);
2124 ++}
2125 ++
2126 ++/*
2127 ++ * Return true if bfqq has no request pending and rq is close enough to
2128 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
2129 ++ * bfqq->next_rq
2130 ++ */
2131 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2132 ++{
2133 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2134 ++}
2135 ++
2136 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2137 ++{
2138 ++ struct rb_root *root = &bfqd->rq_pos_tree;
2139 ++ struct rb_node *parent, *node;
2140 ++ struct bfq_queue *__bfqq;
2141 ++ sector_t sector = bfqd->last_position;
2142 ++
2143 ++ if (RB_EMPTY_ROOT(root))
2144 ++ return NULL;
2145 ++
2146 ++ /*
2147 ++ * First, if we find a request starting at the end of the last
2148 ++ * request, choose it.
2149 ++ */
2150 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2151 ++ if (__bfqq != NULL)
2152 ++ return __bfqq;
2153 ++
2154 ++ /*
2155 ++ * If the exact sector wasn't found, the parent of the NULL leaf
2156 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
2157 ++ * position).
2158 ++ */
2159 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2160 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2161 ++ return __bfqq;
2162 ++
2163 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
2164 ++ node = rb_next(&__bfqq->pos_node);
2165 ++ else
2166 ++ node = rb_prev(&__bfqq->pos_node);
2167 ++ if (node == NULL)
2168 ++ return NULL;
2169 ++
2170 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2171 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2172 ++ return __bfqq;
2173 ++
2174 ++ return NULL;
2175 ++}
2176 ++
2177 ++/*
2178 ++ * bfqd - device data.
2179 ++ * cur_bfqq - passed in so that we don't decide that the current queue
2180 ++ * is closely cooperating with itself.
2181 ++ *
2182 ++ * We are assuming that cur_bfqq has dispatched at least one request,
2183 ++ * and that bfqd->last_position reflects a position on the disk associated
2184 ++ * with the I/O issued by cur_bfqq.
2185 ++ */
2186 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2187 ++ struct bfq_queue *cur_bfqq)
2188 ++{
2189 ++ struct bfq_queue *bfqq;
2190 ++
2191 ++ if (bfq_class_idle(cur_bfqq))
2192 ++ return NULL;
2193 ++ if (!bfq_bfqq_sync(cur_bfqq))
2194 ++ return NULL;
2195 ++ if (BFQQ_SEEKY(cur_bfqq))
2196 ++ return NULL;
2197 ++
2198 ++ /* If device has only one backlogged bfq_queue, don't search. */
2199 ++ if (bfqd->busy_queues == 1)
2200 ++ return NULL;
2201 ++
2202 ++ /*
2203 ++ * We should notice if some of the queues are cooperating, e.g.
2204 ++ * working closely on the same area of the disk. In that case,
2205 ++ * we can group them together and not waste time idling.
2206 ++ */
2207 ++ bfqq = bfqq_close(bfqd);
2208 ++ if (bfqq == NULL || bfqq == cur_bfqq)
2209 ++ return NULL;
2210 ++
2211 ++ /*
2212 ++ * Do not merge queues from different bfq_groups.
2213 ++ */
2214 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2215 ++ return NULL;
2216 ++
2217 ++ /*
2218 ++ * It only makes sense to merge sync queues.
2219 ++ */
2220 ++ if (!bfq_bfqq_sync(bfqq))
2221 ++ return NULL;
2222 ++ if (BFQQ_SEEKY(bfqq))
2223 ++ return NULL;
2224 ++
2225 ++ /*
2226 ++ * Do not merge queues of different priority classes.
2227 ++ */
2228 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2229 ++ return NULL;
2230 ++
2231 ++ return bfqq;
2232 ++}
2233 ++
2234 ++/*
2235 ++ * If enough samples have been computed, return the current max budget
2236 ++ * stored in bfqd, which is dynamically updated according to the
2237 ++ * estimated disk peak rate; otherwise return the default max budget
2238 ++ */
2239 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2240 ++{
2241 ++ if (bfqd->budgets_assigned < 194)
2242 ++ return bfq_default_max_budget;
2243 ++ else
2244 ++ return bfqd->bfq_max_budget;
2245 ++}
2246 ++
2247 ++/*
2248 ++ * Return min budget, which is a fraction of the current or default
2249 ++ * max budget (trying with 1/32)
2250 ++ */
2251 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2252 ++{
2253 ++ if (bfqd->budgets_assigned < 194)
2254 ++ return bfq_default_max_budget / 32;
2255 ++ else
2256 ++ return bfqd->bfq_max_budget / 32;
2257 ++}
2258 ++
2259 ++/*
2260 ++ * Decides whether idling should be done for given device and
2261 ++ * given in-service queue.
2262 ++ */
2263 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2264 ++ struct bfq_queue *in_service_bfqq)
2265 ++{
2266 ++ if (in_service_bfqq == NULL)
2267 ++ return false;
2268 ++ /*
2269 ++ * If device is SSD it has no seek penalty, disable idling; but
2270 ++ * do so only if:
2271 ++ * - device does not support queuing, otherwise we still have
2272 ++ * a problem with sync vs async workloads;
2273 ++ * - the queue is not weight-raised, to preserve guarantees.
2274 ++ */
2275 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2276 ++ in_service_bfqq->raising_coeff == 1);
2277 ++}
2278 ++
2279 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2280 ++{
2281 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
2282 ++ struct bfq_io_cq *bic;
2283 ++ unsigned long sl;
2284 ++
2285 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2286 ++
2287 ++ /* Tasks have exited, don't wait. */
2288 ++ bic = bfqd->in_service_bic;
2289 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2290 ++ return;
2291 ++
2292 ++ bfq_mark_bfqq_wait_request(bfqq);
2293 ++
2294 ++ /*
2295 ++ * We don't want to idle for seeks, but we do want to allow
2296 ++ * fair distribution of slice time for a process doing back-to-back
2297 ++ * seeks. So allow a little bit of time for it to submit a new rq.
2298 ++ *
2299 ++ * To prevent processes with (partly) seeky workloads from
2300 ++ * being too ill-treated, grant them a small fraction of the
2301 ++ * assigned budget before reducing the waiting time to
2302 ++ * BFQ_MIN_TT. This happened to help reduce latency.
2303 ++ */
2304 ++ sl = bfqd->bfq_slice_idle;
2305 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2306 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2307 ++ bfqq->raising_coeff == 1)
2308 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2309 ++ else if (bfqq->raising_coeff > 1)
2310 ++ sl = sl * 3;
2311 ++ bfqd->last_idling_start = ktime_get();
2312 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2313 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
2314 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2315 ++}
2316 ++
2317 ++/*
2318 ++ * Set the maximum time for the in-service queue to consume its
2319 ++ * budget. This prevents seeky processes from lowering the disk
2320 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
2321 ++ */
2322 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2323 ++{
2324 ++ struct bfq_queue *bfqq = bfqd->in_service_queue;
2325 ++ unsigned int timeout_coeff;
2326 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2327 ++ timeout_coeff = 1;
2328 ++ else
2329 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2330 ++
2331 ++ bfqd->last_budget_start = ktime_get();
2332 ++
2333 ++ bfq_clear_bfqq_budget_new(bfqq);
2334 ++ bfqq->budget_timeout = jiffies +
2335 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2336 ++
2337 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2338 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2339 ++ timeout_coeff));
2340 ++}
2341 ++
2342 ++/*
2343 ++ * Move request from internal lists to the request queue dispatch list.
2344 ++ */
2345 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2346 ++{
2347 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
2348 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2349 ++
2350 ++ bfq_remove_request(rq);
2351 ++ bfqq->dispatched++;
2352 ++ elv_dispatch_sort(q, rq);
2353 ++
2354 ++ if (bfq_bfqq_sync(bfqq))
2355 ++ bfqd->sync_flight++;
2356 ++}
2357 ++
2358 ++/*
2359 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
2360 ++ */
2361 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2362 ++{
2363 ++ struct request *rq = NULL;
2364 ++
2365 ++ if (bfq_bfqq_fifo_expire(bfqq))
2366 ++ return NULL;
2367 ++
2368 ++ bfq_mark_bfqq_fifo_expire(bfqq);
2369 ++
2370 ++ if (list_empty(&bfqq->fifo))
2371 ++ return NULL;
2372 ++
2373 ++ rq = rq_entry_fifo(bfqq->fifo.next);
2374 ++
2375 ++ if (time_before(jiffies, rq_fifo_time(rq)))
2376 ++ return NULL;
2377 ++
2378 ++ return rq;
2379 ++}
2380 ++
2381 ++/*
2382 ++ * Must be called with the queue_lock held.
2383 ++ */
2384 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
2385 ++{
2386 ++ int process_refs, io_refs;
2387 ++
2388 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2389 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2390 ++ BUG_ON(process_refs < 0);
2391 ++ return process_refs;
2392 ++}
2393 ++
2394 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2395 ++{
2396 ++ int process_refs, new_process_refs;
2397 ++ struct bfq_queue *__bfqq;
2398 ++
2399 ++ /*
2400 ++ * If there are no process references on the new_bfqq, then it is
2401 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2402 ++ * may have dropped their last reference (not just their last process
2403 ++ * reference).
2404 ++ */
2405 ++ if (!bfqq_process_refs(new_bfqq))
2406 ++ return;
2407 ++
2408 ++ /* Avoid a circular list and skip interim queue merges. */
2409 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
2410 ++ if (__bfqq == bfqq)
2411 ++ return;
2412 ++ new_bfqq = __bfqq;
2413 ++ }
2414 ++
2415 ++ process_refs = bfqq_process_refs(bfqq);
2416 ++ new_process_refs = bfqq_process_refs(new_bfqq);
2417 ++ /*
2418 ++ * If the process for the bfqq has gone away, there is no
2419 ++ * sense in merging the queues.
2420 ++ */
2421 ++ if (process_refs == 0 || new_process_refs == 0)
2422 ++ return;
2423 ++
2424 ++ /*
2425 ++ * Merge in the direction of the lesser amount of work.
2426 ++ */
2427 ++ if (new_process_refs >= process_refs) {
2428 ++ bfqq->new_bfqq = new_bfqq;
2429 ++ atomic_add(process_refs, &new_bfqq->ref);
2430 ++ } else {
2431 ++ new_bfqq->new_bfqq = bfqq;
2432 ++ atomic_add(new_process_refs, &bfqq->ref);
2433 ++ }
2434 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2435 ++ new_bfqq->pid);
2436 ++}
2437 ++
2438 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2439 ++{
2440 ++ struct bfq_entity *entity = &bfqq->entity;
2441 ++ return entity->budget - entity->service;
2442 ++}
2443 ++
2444 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2445 ++{
2446 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2447 ++
2448 ++ __bfq_bfqd_reset_in_service(bfqd);
2449 ++
2450 ++ /*
2451 ++ * If this bfqq is shared between multiple processes, check
2452 ++ * to make sure that those processes are still issuing I/Os
2453 ++ * within the mean seek distance. If not, it may be time to
2454 ++ * break the queues apart again.
2455 ++ */
2456 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2457 ++ bfq_mark_bfqq_split_coop(bfqq);
2458 ++
2459 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2460 ++ /*
2461 ++ * overloading budget_timeout field to store the time at
2462 ++ * which the queue is left with no backlog, used by
2463 ++ * the weight-raising mechanism
2464 ++ */
2465 ++ bfqq->budget_timeout = jiffies;
2466 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2467 ++ } else {
2468 ++ bfq_activate_bfqq(bfqd, bfqq);
2469 ++ /*
2470 ++ * Resort priority tree of potential close cooperators.
2471 ++ */
2472 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
2473 ++ }
2474 ++}
2475 ++
2476 ++/**
2477 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2478 ++ * @bfqd: device data.
2479 ++ * @bfqq: queue to update.
2480 ++ * @reason: reason for expiration.
2481 ++ *
2482 ++ * Handle the feedback on @bfqq budget. See the body for detailed
2483 ++ * comments.
2484 ++ */
2485 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2486 ++ struct bfq_queue *bfqq,
2487 ++ enum bfqq_expiration reason)
2488 ++{
2489 ++ struct request *next_rq;
2490 ++ unsigned long budget, min_budget;
2491 ++
2492 ++ budget = bfqq->max_budget;
2493 ++ min_budget = bfq_min_budget(bfqd);
2494 ++
2495 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2496 ++
2497 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2498 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2499 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2500 ++ budget, bfq_min_budget(bfqd));
2501 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2502 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2503 ++
2504 ++ if (bfq_bfqq_sync(bfqq)) {
2505 ++ switch (reason) {
2506 ++ /*
2507 ++ * Caveat: in all the following cases we trade latency
2508 ++ * for throughput.
2509 ++ */
2510 ++ case BFQ_BFQQ_TOO_IDLE:
2511 ++ /*
2512 ++ * This is the only case where we may reduce
2513 ++ * the budget: if the process has no request
2514 ++ * still waiting for completion, then
2515 ++ * we assume (tentatively) that the timer has
2516 ++ * expired because the batch of requests of
2517 ++ * the process could have been served with a
2518 ++ * smaller budget. Hence, betting that the
2519 ++ * process will behave in the same way when it
2520 ++ * becomes backlogged again, we reduce its
2521 ++ * next budget. As long as we guess right,
2522 ++ * this budget cut reduces the latency
2523 ++ * experienced by the process.
2524 ++ *
2525 ++ * However, if there are still outstanding
2526 ++ * requests, then the process may have not yet
2527 ++ * issued its next request just because it is
2528 ++ * still waiting for the completion of some of
2529 ++ * the still outstanding ones. So in this
2530 ++ * subcase we do not reduce its budget, on the
2531 ++ * contrary we increase it to possibly boost
2532 ++ * the throughput, as discussed in the
2533 ++ * comments to the BUDGET_TIMEOUT case.
2534 ++ */
2535 ++ if (bfqq->dispatched > 0) /* still outstanding reqs */
2536 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2537 ++ else {
2538 ++ if (budget > 5 * min_budget)
2539 ++ budget -= 4 * min_budget;
2540 ++ else
2541 ++ budget = min_budget;
2542 ++ }
2543 ++ break;
2544 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
2545 ++ /*
2546 ++ * We double the budget here because: 1) it
2547 ++ * gives the chance to boost the throughput if
2548 ++ * this is not a seeky process (which may have
2549 ++ * bumped into this timeout because of, e.g.,
2550 ++ * ZBR), 2) together with charge_full_budget
2551 ++ * it helps give seeky processes higher
2552 ++ * timestamps, and hence be served less
2553 ++ * frequently.
2554 ++ */
2555 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
2556 ++ break;
2557 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2558 ++ /*
2559 ++ * The process still has backlog, and did not
2560 ++ * let either the budget timeout or the disk
2561 ++ * idling timeout expire. Hence it is not
2562 ++ * seeky, has a short thinktime and may be
2563 ++ * happy with a higher budget too. So
2564 ++ * definitely increase the budget of this good
2565 ++ * candidate to boost the disk throughput.
2566 ++ */
2567 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
2568 ++ break;
2569 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
2570 ++ /*
2571 ++ * Leave the budget unchanged.
2572 ++ */
2573 ++ default:
2574 ++ return;
2575 ++ }
2576 ++ } else /* async queue */
2577 ++ /* async queues always get the maximum possible budget
2578 ++ * (their ability to dispatch is limited by
2579 ++ * @bfqd->bfq_max_budget_async_rq).
2580 ++ */
2581 ++ budget = bfqd->bfq_max_budget;
2582 ++
2583 ++ bfqq->max_budget = budget;
2584 ++
2585 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2586 ++ bfqq->max_budget > bfqd->bfq_max_budget)
2587 ++ bfqq->max_budget = bfqd->bfq_max_budget;
2588 ++
2589 ++ /*
2590 ++ * Make sure that we have enough budget for the next request.
2591 ++ * Since the finish time of the bfqq must be kept in sync with
2592 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
2593 ++ * update.
2594 ++ */
2595 ++ next_rq = bfqq->next_rq;
2596 ++ if (next_rq != NULL)
2597 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2598 ++ bfq_serv_to_charge(next_rq, bfqq));
2599 ++ else
2600 ++ bfqq->entity.budget = bfqq->max_budget;
2601 ++
2602 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2603 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2604 ++ bfqq->entity.budget);
2605 ++}
2606 ++
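For sync queues, the budget feedback above shrinks the budget only when the queue idled with nothing in flight, doubles it on idling-with-outstanding-requests or budget timeout, and quadruples it on budget exhaustion, always clamped by the min and max budgets. The standalone sketch below replays those rules with made-up MIN_BUDGET and MAX_BUDGET values standing in for bfq_min_budget() and the device max budget.

#include <stdio.h>

#define MIN_BUDGET  512    /* hypothetical bfq_min_budget() */
#define MAX_BUDGET 16384   /* hypothetical bfqd->bfq_max_budget */

enum reason { TOO_IDLE_DRAINED, TOO_IDLE_OUTSTANDING,
              BUDGET_TIMEOUT, BUDGET_EXHAUSTED };

/* Same feedback as the sync branch of the switch above, for one expiration. */
static unsigned long next_budget(unsigned long budget, enum reason r)
{
        switch (r) {
        case TOO_IDLE_DRAINED:      /* idled with nothing in flight: shrink */
                return budget > 5 * MIN_BUDGET ?
                       budget - 4 * MIN_BUDGET : MIN_BUDGET;
        case TOO_IDLE_OUTSTANDING:  /* idled, requests still in flight: grow */
        case BUDGET_TIMEOUT:        /* timed out: double, capped at the max */
                return budget * 2 > MAX_BUDGET ? MAX_BUDGET : budget * 2;
        case BUDGET_EXHAUSTED:      /* used it all quickly: quadruple, capped */
                return budget * 4 > MAX_BUDGET ? MAX_BUDGET : budget * 4;
        }
        return budget;
}

int main(void)
{
        printf("4096 after TOO_IDLE (drained): %lu\n",
               next_budget(4096, TOO_IDLE_DRAINED));   /* 2048 */
        printf("4096 after BUDGET_TIMEOUT:     %lu\n",
               next_budget(4096, BUDGET_TIMEOUT));     /* 8192 */
        printf("4096 after BUDGET_EXHAUSTED:   %lu\n",
               next_budget(4096, BUDGET_EXHAUSTED));   /* 16384 */
        return 0;
}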
2607 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2608 ++{
2609 ++ unsigned long max_budget;
2610 ++
2611 ++ /*
2612 ++ * The max_budget calculated when autotuning is equal to the
2613 ++ * number of sectors transferred in timeout_sync at the
2614 ++ * estimated peak rate.
2615 ++ */
2616 ++ max_budget = (unsigned long)(peak_rate * 1000 *
2617 ++ timeout >> BFQ_RATE_SHIFT);
2618 ++
2619 ++ return max_budget;
2620 ++}
2621 ++
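The autotuned maximum budget is just peak_rate (sectors per usec, kept in fixed point) times the sync timeout converted from ms to usec, shifted back down. The sketch below assumes a hypothetical RATE_SHIFT of 16 bits and a made-up 125 ms timeout; the real BFQ_RATE_SHIFT and bfq_timeout values are defined elsewhere in the patch.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for BFQ_RATE_SHIFT (fixed-point fraction bits). */
#define RATE_SHIFT 16

/* peak_rate is stored as (sectors per usec) << RATE_SHIFT; timeout is in ms,
 * so multiplying by 1000 converts it to usec before applying the rate. */
static unsigned long calc_max_budget(uint64_t peak_rate, uint64_t timeout_ms)
{
        return (unsigned long)(peak_rate * 1000 * timeout_ms >> RATE_SHIFT);
}

int main(void)
{
        /* ~100 MB/s is about 0.2 sectors/usec with 512 B sectors; in fixed
         * point that is roughly 0.2 * 2^RATE_SHIFT ~= 13107. */
        uint64_t peak_rate = 13107;
        uint64_t timeout_ms = 125;   /* made-up sync budget timeout */

        printf("max_budget = %lu sectors\n",
               calc_max_budget(peak_rate, timeout_ms));
        /* ~25000 sectors, i.e. ~12.5 MB: what 100 MB/s delivers in 125 ms */
        return 0;
}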
2622 ++/*
2623 ++ * In addition to updating the peak rate, checks whether the process
2624 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
2625 ++ * to the budget timeout, to reduce the amount of service provided to
2626 ++ * seeky processes, and hence reduce their chances to lower the
2627 ++ * throughput. See the code for more details.
2628 ++ */
2629 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2630 ++ int compensate, enum bfqq_expiration reason)
2631 ++{
2632 ++ u64 bw, usecs, expected, timeout;
2633 ++ ktime_t delta;
2634 ++ int update = 0;
2635 ++
2636 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2637 ++ return 0;
2638 ++
2639 ++ if (compensate)
2640 ++ delta = bfqd->last_idling_start;
2641 ++ else
2642 ++ delta = ktime_get();
2643 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
2644 ++ usecs = ktime_to_us(delta);
2645 ++
2646 ++ /* Don't trust short/unrealistic values. */
2647 ++ if (usecs < 100 || usecs >= LONG_MAX)
2648 ++ return 0;
2649 ++
2650 ++ /*
2651 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
2652 ++ * value to store the peak rate, in sectors per usec in fixed
2653 ++ * point math. We do so to have enough precision in the estimate
2654 ++ * and to avoid overflows.
2655 ++ */
2656 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2657 ++ do_div(bw, (unsigned long)usecs);
2658 ++
2659 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2660 ++
2661 ++ /*
2662 ++ * Use only long (> 20ms) intervals to filter out spikes for
2663 ++ * the peak rate estimation.
2664 ++ */
2665 ++ if (usecs > 20000) {
2666 ++ if (bw > bfqd->peak_rate ||
2667 ++ (!BFQQ_SEEKY(bfqq) &&
2668 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2669 ++ bfq_log(bfqd, "measured bw =%llu", bw);
2670 ++ /*
2671 ++ * To smooth oscillations use a low-pass filter with
2672 ++ * alpha=7/8, i.e.,
2673 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
2674 ++ */
2675 ++ do_div(bw, 8);
2676 ++ if (bw == 0)
2677 ++ return 0;
2678 ++ bfqd->peak_rate *= 7;
2679 ++ do_div(bfqd->peak_rate, 8);
2680 ++ bfqd->peak_rate += bw;
2681 ++ update = 1;
2682 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2683 ++ }
2684 ++
2685 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2686 ++
2687 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2688 ++ bfqd->peak_rate_samples++;
2689 ++
2690 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2691 ++ update && bfqd->bfq_user_max_budget == 0) {
2692 ++ bfqd->bfq_max_budget =
2693 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2694 ++ bfq_log(bfqd, "new max_budget=%lu",
2695 ++ bfqd->bfq_max_budget);
2696 ++ }
2697 ++ }
2698 ++
2699 ++ /*
2700 ++ * If the process has been served for too short a time
2701 ++ * interval to let its possible sequential accesses prevail over
2702 ++ * the initial seek time needed to move the disk head to the
2703 ++ * first sector it requested, then give the process a chance
2704 ++ * and for the moment return false.
2705 ++ */
2706 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2707 ++ return 0;
2708 ++
2709 ++ /*
2710 ++ * A process is considered ``slow'' (i.e., seeky, so that we
2711 ++ * cannot treat it fairly in the service domain, as it would
2712 ++ * slow down the other processes too much) if, when a slice
2713 ++ * ends for whatever reason, it has received service at a
2714 ++ * rate that would not be high enough to complete the budget
2715 ++ * before the budget timeout expiration.
2716 ++ */
2717 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2718 ++
2719 ++ /*
2720 ++ * Caveat: processes doing IO in the slower disk zones will
2721 ++ * tend to be slow(er) even if not seeky. And the estimated
2722 ++ * peak rate will actually be an average over the disk
2723 ++ * surface. Hence, to not be too harsh with unlucky processes,
2724 ++ * we keep a budget/3 margin of safety before declaring a
2725 ++ * process slow.
2726 ++ */
2727 ++ return expected > (4 * bfqq->entity.budget) / 3;
2728 ++}
2729 ++
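The peak-rate update above is a classic integer low-pass filter, new_rate = (7/8) * old_rate + (1/8) * measured_bw, so a single spiky sample cannot drag the estimate around. A small standalone sketch of the same arithmetic, with arbitrary numbers:

#include <stdio.h>
#include <stdint.h>

/* Same integer filter as above; when bw / 8 rounds down to 0 the code above
 * skips the update, so the sketch leaves the old rate untouched too. */
static uint64_t lowpass(uint64_t old_rate, uint64_t bw)
{
        bw /= 8;
        if (bw == 0)
                return old_rate;
        return old_rate * 7 / 8 + bw;
}

int main(void)
{
        uint64_t rate = 10000;   /* hypothetical current peak_rate */

        /* Feed a series of higher measurements: the estimate moves towards
         * them smoothly instead of jumping on the first sample. */
        for (int i = 0; i < 5; i++) {
                rate = lowpass(rate, 20000);
                printf("sample %d: rate = %llu\n",
                       i + 1, (unsigned long long)rate);
        }
        return 0;
}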
2730 ++/*
2731 ++ * To be deemed as soft real-time, an application must meet two requirements.
2732 ++ * The first is that the application must not require an average bandwidth
2733 ++ * higher than the approximate bandwidth required to play back or record a
2734 ++ * compressed high-definition video.
2735 ++ * The next function is invoked on the completion of the last request of a
2736 ++ * batch, to compute the next-start time instant, soft_rt_next_start, such
2737 ++ * that, if the next request of the application does not arrive before
2738 ++ * soft_rt_next_start, then the above requirement on the bandwidth is met.
2739 ++ *
2740 ++ * The second requirement is that the request pattern of the application is
2741 ++ * isochronous, i.e., that, after issuing a request or a batch of requests, the
2742 ++ * application stops for a while, then issues a new batch, and so on. For this
2743 ++ * reason the next function is invoked to compute soft_rt_next_start only for
2744 ++ * applications that meet this requirement, whereas soft_rt_next_start is set
2745 ++ * to infinity for applications that do not.
2746 ++ *
2747 ++ * Unfortunately, even a greedy application may happen to behave in an
2748 ++ * isochronous way if several processes are competing for the CPUs. In fact,
2749 ++ * in this scenario the application stops issuing requests while the CPUs are
2750 ++ * busy serving other processes, then restarts, then stops again for a while,
2751 ++ * and so on. In addition, if the disk achieves a low enough throughput with
2752 ++ * the request pattern issued by the application, then the above bandwidth
2753 ++ * requirement may happen to be met too. To prevent such a greedy application
2754 ++ * from being deemed soft real-time, a further rule is used in the computation
2755 ++ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
2756 ++ * time plus the maximum time for which the arrival of a request is waited
2757 ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
2758 ++ * filters out greedy applications, as the latter issue instead their next
2759 ++ * request as soon as possible after the last one has been completed (in
2760 ++ * contrast, when a batch of requests is completed, a soft real-time
2761 ++ * application spends some time processing data).
2762 ++ *
2763 ++ * Actually, the last filter may easily generate false positives if: only
2764 ++ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
2765 ++ * both of the following cases occur:
2766 ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
2767 ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2768 ++ * HZ=100.
2769 ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2770 ++ * for a while, then suddenly 'jump' by several units to recover the lost
2771 ++ * increments. This seems to happen, e.g., inside virtual machines.
2772 ++ * To address this issue, we do not use as a reference time interval just
2773 ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2774 ++ * particular we add the minimum number of jiffies for which the filter seems
2775 ++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
2776 ++ */
2777 ++static inline u64 bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2778 ++ struct bfq_queue *bfqq)
2779 ++{
2780 ++ return max(bfqq->last_idle_bklogged +
2781 ++ HZ * bfqq->service_from_backlogged /
2782 ++ bfqd->bfq_raising_max_softrt_rate,
2783 ++ (u64)jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2784 ++}
2785 ++
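Putting the two requirements together, soft_rt_next_start is the later of (a) the instant by which serving service_from_backlogged sectors since last_idle_bklogged stays within bfq_raising_max_softrt_rate, and (b) jiffies plus the idle slice plus a few jiffies of slack. The sketch below plugs in made-up numbers, including a hypothetical HZ, to show the bandwidth term dominating.

#include <stdio.h>
#include <stdint.h>

#define HZ 250   /* hypothetical tick rate */

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

/* Sketch of the rule above: the first term enforces the bandwidth cap
 * (service in sectors divided by a rate in sectors/sec gives seconds,
 * times HZ gives jiffies), the second filters out greedy applications by
 * pushing next_start past the idle window plus a little slack. */
static uint64_t softrt_next_start(uint64_t last_idle_bklogged,
                                  uint64_t service_from_backlogged,
                                  uint64_t max_softrt_rate,
                                  uint64_t jiffies_now, uint64_t slice_idle)
{
        return max_u64(last_idle_bklogged +
                       HZ * service_from_backlogged / max_softrt_rate,
                       jiffies_now + slice_idle + 4);
}

int main(void)
{
        /* 7000 sectors served since the queue was last idle and backlogged,
         * against a made-up cap of 7000 sectors/sec: the next request must
         * not arrive before one second (HZ jiffies) after that instant. */
        printf("next_start = %llu\n",
               (unsigned long long)softrt_next_start(10000, 7000, 7000,
                                                     10100, 2));
        return 0;
}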
2786 ++/**
2787 ++ * bfq_bfqq_expire - expire a queue.
2788 ++ * @bfqd: device owning the queue.
2789 ++ * @bfqq: the queue to expire.
2790 ++ * @compensate: if true, compensate for the time spent idling.
2791 ++ * @reason: the reason causing the expiration.
2792 ++ *
2793 ++ *
2794 ++ * If the process associated with the queue is slow (i.e., seeky), or in
2795 ++ * case of budget timeout, or, finally, if it is async, we
2796 ++ * artificially charge it an entire budget (independently of the
2797 ++ * actual service it received). As a consequence, the queue will get
2798 ++ * higher timestamps than the correct ones upon reactivation, and
2799 ++ * hence it will be rescheduled as if it had received more service
2800 ++ * than what it actually received. In the end, this class of processes
2801 ++ * will receive less service in proportion to how slowly they consume
2802 ++ * their budgets (and hence how seriously they tend to lower the
2803 ++ * throughput).
2804 ++ *
2805 ++ * In contrast, when a queue expires because it has been idling for
2806 ++ * too long or because it exhausted its budget, we do not touch the
2807 ++ * amount of service it has received. Hence when the queue will be
2808 ++ * reactivated and its timestamps updated, the latter will be in sync
2809 ++ * with the actual service received by the queue until expiration.
2810 ++ *
2811 ++ * Charging a full budget to the first type of queues and the exact
2812 ++ * service to the others has the effect of using the WF2Q+ policy to
2813 ++ * schedule the former on a timeslice basis, without violating the
2814 ++ * service domain guarantees of the latter.
2815 ++ */
2816 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
2817 ++ struct bfq_queue *bfqq,
2818 ++ int compensate,
2819 ++ enum bfqq_expiration reason)
2820 ++{
2821 ++ int slow;
2822 ++ BUG_ON(bfqq != bfqd->in_service_queue);
2823 ++
2824 ++ /* Update disk peak rate for autotuning and check whether the
2825 ++ * process is slow (see bfq_update_peak_rate).
2826 ++ */
2827 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2828 ++
2829 ++ /*
2830 ++ * As explained above, 'punish' slow (i.e., seeky), timed-out
2831 ++ * and async queues, to favor sequential sync workloads.
2832 ++ *
2833 ++ * Processes doing IO in the slower disk zones will tend to be
2834 ++ * slow(er) even if not seeky. Hence, since the estimated peak
2835 ++ * rate is actually an average over the disk surface, these
2836 ++ * processes may timeout just for bad luck. To avoid punishing
2837 ++ * them we do not charge a full budget to a process that
2838 ++ * succeeded in consuming at least 2/3 of its budget.
2839 ++ */
2840 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2841 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2842 ++ bfq_bfqq_charge_full_budget(bfqq);
2843 ++
2844 ++ bfqq->service_from_backlogged += bfqq->entity.service;
2845 ++
2846 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2847 ++ bfqq->last_rais_start_finish = jiffies;
2848 ++
2849 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2850 ++ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
2851 ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) {
2852 ++ /*
2853 ++ * If we get here, then the request pattern is
2854 ++ * isochronous (see the comments to the function
2855 ++ * bfq_bfqq_softrt_next_start()). However, if the
2856 ++ * queue still has in-flight requests, then it is
2857 ++ * better to postpone the computation of next_start
2858 ++ * to the next request completion. In fact, if we
2859 ++ * computed it now, then the application might pass
2860 ++ * the greedy-application filter improperly, because
2861 ++ * the arrival time of its next request may happen to be
2862 ++ * later than (jiffies + bfqq->bfqd->bfq_slice_idle)
2863 ++ * not because the application is truly soft real-
2864 ++ * time, but just because the application is currently
2865 ++ * waiting for the completion of some request before
2866 ++ * issuing, as quickly as possible, its next request.
2867 ++ */
2868 ++ if (bfqq->dispatched > 0) {
2869 ++ bfqq->soft_rt_next_start = -1;
2870 ++ bfq_mark_bfqq_softrt_update(bfqq);
2871 ++ } else
2872 ++ bfqq->soft_rt_next_start =
2873 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
2874 ++ } else
2875 ++ bfqq->soft_rt_next_start = -1; /* infinity */
2876 ++ }
2877 ++
2878 ++ bfq_log_bfqq(bfqd, bfqq,
2879 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2880 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2881 ++
2882 ++ /* Increase, decrease or leave budget unchanged according to reason */
2883 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2884 ++ __bfq_bfqq_expire(bfqd, bfqq);
2885 ++}
2886 ++
2887 ++/*
2888 ++ * Budget timeout is not implemented through a dedicated timer, but
2889 ++ * just checked on request arrivals and completions, as well as on
2890 ++ * idle timer expirations.
2891 ++ */
2892 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2893 ++{
2894 ++ if (bfq_bfqq_budget_new(bfqq))
2895 ++ return 0;
2896 ++
2897 ++ if (time_before(jiffies, bfqq->budget_timeout))
2898 ++ return 0;
2899 ++
2900 ++ return 1;
2901 ++}
2902 ++
2903 ++/*
2904 ++ * If we expire a queue that is waiting for the arrival of a new
2905 ++ * request, we may prevent the fictitious timestamp backshifting that
2906 ++ * allows the guarantees of the queue to be preserved (see [1] for
2907 ++ * this tricky aspect). Hence we return true only if this condition
2908 ++ * does not hold, or if the queue is slow enough to deserve only to be
2909 ++ * kicked off for preserving a high throughput.
2910 ++ */
2911 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2912 ++{
2913 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
2914 ++ "may_budget_timeout: wr %d left %d timeout %d",
2915 ++ bfq_bfqq_wait_request(bfqq),
2916 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2917 ++ bfq_bfqq_budget_timeout(bfqq));
2918 ++
2919 ++ return (!bfq_bfqq_wait_request(bfqq) ||
2920 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2921 ++ &&
2922 ++ bfq_bfqq_budget_timeout(bfqq);
2923 ++}
2924 ++
2925 ++/*
2926 ++ * For weight-raised queues issuing sync requests, idling is always performed,
2927 ++ * as this is instrumental in guaranteeing a high fraction of the throughput
2928 ++ * to these queues, and hence in guaranteeing a lower latency for their
2929 ++ * requests. See [1] for details.
2930 ++ *
2931 ++ * For non-weight-raised queues, idling is instead disabled if the device is
2932 ++ * NCQ-enabled and non-rotational, as this boosts the throughput on such
2933 ++ * devices.
2934 ++ */
2935 ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2936 ++{
2937 ++ struct bfq_data *bfqd = bfqq->bfqd;
2938 ++
2939 ++ return bfq_bfqq_sync(bfqq) && (
2940 ++ bfqq->raising_coeff > 1 ||
2941 ++ (bfq_bfqq_idle_window(bfqq) &&
2942 ++ !(bfqd->hw_tag &&
2943 ++ (blk_queue_nonrot(bfqd->queue) ||
2944 ++ /*
2945 ++ * If there are weight-raised busy queues, then do not idle
2946 ++ * the disk for a sync non-weight-raised queue, and hence
2947 ++ * expire the queue immediately if empty. Combined with the
2948 ++ * timestamping rules of BFQ (see [1] for details), this
2949 ++ * causes sync non-weight-raised queues to get a lower
2950 ++ * fraction of the disk throughput, and hence reduces the rate
2951 ++ * at which the processes associated with these queues ask for
2952 ++ * requests from the request pool.
2953 ++ *
2954 ++ * This is beneficial for weight-raised processes, when the
2955 ++ * system operates in request-pool saturation conditions
2956 ++ * (e.g., in the presence of write hogs). In fact, if
2957 ++ * non-weight-raised processes ask for requests at a lower
2958 ++ * rate, then weight-raised processes have a higher
2959 ++ * probability to get a request from the pool immediately
2960 ++ * (or at least soon) when they need one. Hence they have a
2961 ++ * higher probability to actually get a fraction of the disk
2962 ++ * throughput proportional to their high weight. This is
2963 ++ * especially true with NCQ-enabled drives, which enqueue
2964 ++ * several requests in advance and further reorder
2965 ++ * internally-queued requests.
2966 ++ *
2967 ++ * Mistreating non-weight-raised queues in the above-described
2968 ++ * way, when there are busy weight-raised queues, seems to
2969 ++ * mitigate starvation problems in the presence of heavy write
2970 ++ * workloads and NCQ, and hence to guarantee a higher
2971 ++ * application and system responsiveness in these hostile
2972 ++ * scenarios.
2973 ++ */
2974 ++ bfqd->raised_busy_queues > 0)
2975 ++ )
2976 ++ )
2977 ++ );
2978 ++}
2979 ++
2980 ++/*
2981 ++ * If the in-service queue is empty, but it is sync and either of the following
2982 ++ * conditions holds, then: 1) the queue must remain in service and cannot be
2983 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
2984 ++ * of a new request for the queue. The conditions are:
2985 ++ * - the device is rotational and not performing NCQ, and the queue has its
2986 ++ * idle window set (in this case, waiting for a new request for the queue
2987 ++ * is likely to boost the disk throughput);
2988 ++ * - the queue is weight-raised (waiting for the request is necessary to
2989 ++ * provide the queue with fairness and latency guarantees, see [1] for
2990 ++ * details).
2991 ++ */
2992 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
2993 ++{
2994 ++ struct bfq_data *bfqd = bfqq->bfqd;
2995 ++
2996 ++ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
2997 ++ bfq_bfqq_must_not_expire(bfqq) &&
2998 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
2999 ++}
3000 ++
3001 ++/*
3002 ++ * Select a queue for service. If we have a current queue in service,
3003 ++ * check whether to continue servicing it, or retrieve and set a new one.
3004 ++ */
3005 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3006 ++{
3007 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
3008 ++ struct request *next_rq;
3009 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3010 ++
3011 ++ bfqq = bfqd->in_service_queue;
3012 ++ if (bfqq == NULL)
3013 ++ goto new_queue;
3014 ++
3015 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3016 ++
3017 ++ /*
3018 ++ * If another queue has a request waiting within our mean seek
3019 ++ * distance, let it run. The expire code will check for close
3020 ++ * cooperators and put the close queue at the front of the
3021 ++ * service tree. If possible, merge the expiring queue with the
3022 ++ * new bfqq.
3023 ++ */
3024 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
3025 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
3026 ++ bfq_setup_merge(bfqq, new_bfqq);
3027 ++
3028 ++ if (bfq_may_expire_for_budg_timeout(bfqq) &&
3029 ++ !timer_pending(&bfqd->idle_slice_timer) &&
3030 ++ !bfq_bfqq_must_idle(bfqq))
3031 ++ goto expire;
3032 ++
3033 ++ next_rq = bfqq->next_rq;
3034 ++ /*
3035 ++ * If bfqq has requests queued and it has enough budget left to
3036 ++ * serve them, keep the queue, otherwise expire it.
3037 ++ */
3038 ++ if (next_rq != NULL) {
3039 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
3040 ++ bfq_bfqq_budget_left(bfqq)) {
3041 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
3042 ++ goto expire;
3043 ++ } else {
3044 ++ /*
3045 ++ * The idle timer may be pending because we may not
3046 ++ * disable disk idling even when a new request arrives
3047 ++ */
3048 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
3049 ++ /*
3050 ++ * If we get here: 1) at least a new request
3051 ++ * has arrived but we have not disabled the
3052 ++ * timer because the request was too small,
3053 ++ * 2) then the block layer has unplugged the
3054 ++ * device, causing the dispatch to be invoked.
3055 ++ *
3056 ++ * Since the device is unplugged, now the
3057 ++ * requests are probably large enough to
3058 ++ * provide a reasonable throughput.
3059 ++ * So we disable idling.
3060 ++ */
3061 ++ bfq_clear_bfqq_wait_request(bfqq);
3062 ++ del_timer(&bfqd->idle_slice_timer);
3063 ++ }
3064 ++ if (new_bfqq == NULL)
3065 ++ goto keep_queue;
3066 ++ else
3067 ++ goto expire;
3068 ++ }
3069 ++ }
3070 ++
3071 ++ /*
3072 ++ * No requests pending. If the in-service queue has no cooperator and
3073 ++ * still has requests in flight (possibly waiting for a completion)
3074 ++ * or is idling for a new request, then keep it.
3075 ++ */
3076 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
3077 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
3078 ++ bfqq = NULL;
3079 ++ goto keep_queue;
3080 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
3081 ++ /*
3082 ++ * Expiring the queue because there is a close cooperator,
3083 ++ * cancel timer.
3084 ++ */
3085 ++ bfq_clear_bfqq_wait_request(bfqq);
3086 ++ del_timer(&bfqd->idle_slice_timer);
3087 ++ }
3088 ++
3089 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
3090 ++expire:
3091 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
3092 ++new_queue:
3093 ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
3094 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
3095 ++ bfqq != NULL ? bfqq->pid : 0);
3096 ++keep_queue:
3097 ++ return bfqq;
3098 ++}
3099 ++
3100 ++static void bfq_update_raising_data(struct bfq_data *bfqd,
3101 ++ struct bfq_queue *bfqq)
3102 ++{
3103 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
3104 ++ struct bfq_entity *entity = &bfqq->entity;
3105 ++
3106 ++ bfq_log_bfqq(bfqd, bfqq,
3107 ++ "raising period dur %u/%u msec, "
3108 ++ "old raising coeff %u, w %d(%d)",
3109 ++ jiffies_to_msecs(jiffies -
3110 ++ bfqq->last_rais_start_finish),
3111 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
3112 ++ bfqq->raising_coeff,
3113 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
3114 ++
3115 ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
3116 ++ entity->orig_weight * bfqq->raising_coeff);
3117 ++ if (entity->ioprio_changed)
3118 ++ bfq_log_bfqq(bfqd, bfqq,
3119 ++ "WARN: pending prio change");
3120 ++ /*
3121 ++ * If too much time has elapsed from the beginning
3122 ++ * of this weight-raising, stop it.
3123 ++ */
3124 ++ if (jiffies - bfqq->last_rais_start_finish >
3125 ++ bfqq->raising_cur_max_time) {
3126 ++ bfqq->last_rais_start_finish = jiffies;
3127 ++ bfq_log_bfqq(bfqd, bfqq,
3128 ++ "wrais ending at %llu msec, "
3129 ++ "rais_max_time %u",
3130 ++ bfqq->last_rais_start_finish,
3131 ++ jiffies_to_msecs(bfqq->
3132 ++ raising_cur_max_time));
3133 ++ bfq_bfqq_end_raising(bfqq);
3134 ++ __bfq_entity_update_weight_prio(
3135 ++ bfq_entity_service_tree(entity),
3136 ++ entity);
3137 ++ }
3138 ++ }
3139 ++}
3140 ++
3141 ++/*
3142 ++ * Dispatch one request from bfqq, moving it to the request queue
3143 ++ * dispatch list.
3144 ++ */
3145 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
3146 ++ struct bfq_queue *bfqq)
3147 ++{
3148 ++ int dispatched = 0;
3149 ++ struct request *rq;
3150 ++ unsigned long service_to_charge;
3151 ++
3152 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
3153 ++
3154 ++ /* Follow expired path, else get first next available. */
3155 ++ rq = bfq_check_fifo(bfqq);
3156 ++ if (rq == NULL)
3157 ++ rq = bfqq->next_rq;
3158 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
3159 ++
3160 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
3161 ++ /*
3162 ++ * This may happen if the next rq is chosen
3163 ++ * in fifo order instead of sector order.
3164 ++ * The budget is properly dimensioned
3165 ++ * to be always sufficient to serve the next request
3166 ++ * only if it is chosen in sector order. The reason is
3167 ++ * that it would be quite inefficient and of little use
3168 ++ * to always make sure that the budget is large enough
3169 ++ * to serve even the possible next rq in fifo order.
3170 ++ * In fact, requests are seldom served in fifo order.
3171 ++ *
3172 ++ * Expire the queue for budget exhaustion, and
3173 ++ * make sure that the next act_budget is enough
3174 ++ * to serve the next request, even if it comes
3175 ++ * from the fifo expired path.
3176 ++ */
3177 ++ bfqq->next_rq = rq;
3178 ++ /*
3179 ++ * Since this dispatch failed, make sure that
3180 ++ * a new one will be performed
3181 ++ */
3182 ++ if (!bfqd->rq_in_driver)
3183 ++ bfq_schedule_dispatch(bfqd);
3184 ++ goto expire;
3185 ++ }
3186 ++
3187 ++ /* Finally, insert request into driver dispatch list. */
3188 ++ bfq_bfqq_served(bfqq, service_to_charge);
3189 ++ bfq_dispatch_insert(bfqd->queue, rq);
3190 ++
3191 ++ bfq_update_raising_data(bfqd, bfqq);
3192 ++
3193 ++ bfq_log_bfqq(bfqd, bfqq,
3194 ++ "dispatched %u sec req (%llu), budg left %lu",
3195 ++ blk_rq_sectors(rq),
3196 ++ (long long unsigned)blk_rq_pos(rq),
3197 ++ bfq_bfqq_budget_left(bfqq));
3198 ++
3199 ++ dispatched++;
3200 ++
3201 ++ if (bfqd->in_service_bic == NULL) {
3202 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3203 ++ bfqd->in_service_bic = RQ_BIC(rq);
3204 ++ }
3205 ++
3206 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3207 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3208 ++ bfq_class_idle(bfqq)))
3209 ++ goto expire;
3210 ++
3211 ++ return dispatched;
3212 ++
3213 ++expire:
3214 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3215 ++ return dispatched;
3216 ++}
3217 ++
3218 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3219 ++{
3220 ++ int dispatched = 0;
3221 ++
3222 ++ while (bfqq->next_rq != NULL) {
3223 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3224 ++ dispatched++;
3225 ++ }
3226 ++
3227 ++ BUG_ON(!list_empty(&bfqq->fifo));
3228 ++ return dispatched;
3229 ++}
3230 ++
3231 ++/*
3232 ++ * Drain our current requests. Used for barriers and when switching
3233 ++ * io schedulers on-the-fly.
3234 ++ */
3235 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
3236 ++{
3237 ++ struct bfq_queue *bfqq, *n;
3238 ++ struct bfq_service_tree *st;
3239 ++ int dispatched = 0;
3240 ++
3241 ++ bfqq = bfqd->in_service_queue;
3242 ++ if (bfqq != NULL)
3243 ++ __bfq_bfqq_expire(bfqd, bfqq);
3244 ++
3245 ++ /*
3246 ++ * Loop through classes, and be careful to leave the scheduler
3247 ++ * in a consistent state, as feedback mechanisms and vtime
3248 ++ * updates cannot be disabled during the process.
3249 ++ */
3250 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3251 ++ st = bfq_entity_service_tree(&bfqq->entity);
3252 ++
3253 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3254 ++ bfqq->max_budget = bfq_max_budget(bfqd);
3255 ++
3256 ++ bfq_forget_idle(st);
3257 ++ }
3258 ++
3259 ++ BUG_ON(bfqd->busy_queues != 0);
3260 ++
3261 ++ return dispatched;
3262 ++}
3263 ++
3264 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
3265 ++{
3266 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3267 ++ struct bfq_queue *bfqq;
3268 ++ int max_dispatch;
3269 ++
3270 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3271 ++ if (bfqd->busy_queues == 0)
3272 ++ return 0;
3273 ++
3274 ++ if (unlikely(force))
3275 ++ return bfq_forced_dispatch(bfqd);
3276 ++
3277 ++ bfqq = bfq_select_queue(bfqd);
3278 ++ if (bfqq == NULL)
3279 ++ return 0;
3280 ++
3281 ++ max_dispatch = bfqd->bfq_quantum;
3282 ++ if (bfq_class_idle(bfqq))
3283 ++ max_dispatch = 1;
3284 ++
3285 ++ if (!bfq_bfqq_sync(bfqq))
3286 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
3287 ++
3288 ++ if (bfqq->dispatched >= max_dispatch) {
3289 ++ if (bfqd->busy_queues > 1)
3290 ++ return 0;
3291 ++ if (bfqq->dispatched >= 4 * max_dispatch)
3292 ++ return 0;
3293 ++ }
3294 ++
3295 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3296 ++ return 0;
3297 ++
3298 ++ bfq_clear_bfqq_wait_request(bfqq);
3299 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3300 ++
3301 ++ if (!bfq_dispatch_request(bfqd, bfqq))
3302 ++ return 0;
3303 ++
3304 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
3305 ++ bfqq->pid, max_dispatch);
3306 ++
3307 ++ return 1;
3308 ++}
3309 ++
3310 ++/*
3311 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
3312 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
3313 ++ *
3314 ++ * Queue lock must be held here.
3315 ++ */
3316 ++static void bfq_put_queue(struct bfq_queue *bfqq)
3317 ++{
3318 ++ struct bfq_data *bfqd = bfqq->bfqd;
3319 ++
3320 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3321 ++
3322 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3323 ++ atomic_read(&bfqq->ref));
3324 ++ if (!atomic_dec_and_test(&bfqq->ref))
3325 ++ return;
3326 ++
3327 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3328 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3329 ++ BUG_ON(bfqq->entity.tree != NULL);
3330 ++ BUG_ON(bfq_bfqq_busy(bfqq));
3331 ++ BUG_ON(bfqd->in_service_queue == bfqq);
3332 ++
3333 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3334 ++
3335 ++ kmem_cache_free(bfq_pool, bfqq);
3336 ++}
3337 ++
3338 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
3339 ++{
3340 ++ struct bfq_queue *__bfqq, *next;
3341 ++
3342 ++ /*
3343 ++ * If this queue was scheduled to merge with another queue, be
3344 ++ * sure to drop the reference taken on that queue (and others in
3345 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3346 ++ */
3347 ++ __bfqq = bfqq->new_bfqq;
3348 ++ while (__bfqq) {
3349 ++ if (__bfqq == bfqq) {
3350 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
3351 ++ break;
3352 ++ }
3353 ++ next = __bfqq->new_bfqq;
3354 ++ bfq_put_queue(__bfqq);
3355 ++ __bfqq = next;
3356 ++ }
3357 ++}
3358 ++
3359 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3360 ++{
3361 ++ if (bfqq == bfqd->in_service_queue) {
3362 ++ __bfq_bfqq_expire(bfqd, bfqq);
3363 ++ bfq_schedule_dispatch(bfqd);
3364 ++ }
3365 ++
3366 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3367 ++ atomic_read(&bfqq->ref));
3368 ++
3369 ++ bfq_put_cooperator(bfqq);
3370 ++
3371 ++ bfq_put_queue(bfqq);
3372 ++}
3373 ++
3374 ++static void bfq_init_icq(struct io_cq *icq)
3375 ++{
3376 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3377 ++
3378 ++ bic->ttime.last_end_request = jiffies;
3379 ++}
3380 ++
3381 ++static void bfq_exit_icq(struct io_cq *icq)
3382 ++{
3383 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
3384 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
3385 ++
3386 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
3387 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3388 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
3389 ++ }
3390 ++
3391 ++ if (bic->bfqq[BLK_RW_SYNC]) {
3392 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3393 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
3394 ++ }
3395 ++}
3396 ++
3397 ++/*
3398 ++ * Update the entity prio values; note that the new values will not
3399 ++ * be used until the next (re)activation.
3400 ++ */
3401 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3402 ++{
3403 ++ struct task_struct *tsk = current;
3404 ++ int ioprio_class;
3405 ++
3406 ++ if (!bfq_bfqq_prio_changed(bfqq))
3407 ++ return;
3408 ++
3409 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3410 ++ switch (ioprio_class) {
3411 ++ default:
3412 ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3413 ++ "bfq: bad prio %x\n", ioprio_class);
3414 ++ case IOPRIO_CLASS_NONE:
3415 ++ /*
3416 ++ * No prio set, inherit CPU scheduling settings.
3417 ++ */
3418 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3419 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3420 ++ break;
3421 ++ case IOPRIO_CLASS_RT:
3422 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3423 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3424 ++ break;
3425 ++ case IOPRIO_CLASS_BE:
3426 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3427 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3428 ++ break;
3429 ++ case IOPRIO_CLASS_IDLE:
3430 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3431 ++ bfqq->entity.new_ioprio = 7;
3432 ++ bfq_clear_bfqq_idle_window(bfqq);
3433 ++ break;
3434 ++ }
3435 ++
3436 ++ bfqq->entity.ioprio_changed = 1;
3437 ++
3438 ++ /*
3439 ++ * Keep track of original prio settings in case we have to temporarily
3440 ++ * elevate the priority of this queue.
3441 ++ */
3442 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3443 ++ bfq_clear_bfqq_prio_changed(bfqq);
3444 ++}
3445 ++
3446 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3447 ++{
3448 ++ struct bfq_data *bfqd;
3449 ++ struct bfq_queue *bfqq, *new_bfqq;
3450 ++ struct bfq_group *bfqg;
3451 ++ unsigned long uninitialized_var(flags);
3452 ++ int ioprio = bic->icq.ioc->ioprio;
3453 ++
3454 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3455 ++ &flags);
3456 ++ /*
3457 ++ * This condition may trigger on a newly created bic; be sure to drop
3458 ++ * the lock before returning.
3459 ++ */
3460 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3461 ++ goto out;
3462 ++
3463 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
3464 ++ if (bfqq != NULL) {
3465 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3466 ++ sched_data);
3467 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3468 ++ GFP_ATOMIC);
3469 ++ if (new_bfqq != NULL) {
3470 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3471 ++ bfq_log_bfqq(bfqd, bfqq,
3472 ++ "changed_ioprio: bfqq %p %d",
3473 ++ bfqq, atomic_read(&bfqq->ref));
3474 ++ bfq_put_queue(bfqq);
3475 ++ }
3476 ++ }
3477 ++
3478 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
3479 ++ if (bfqq != NULL)
3480 ++ bfq_mark_bfqq_prio_changed(bfqq);
3481 ++
3482 ++ bic->ioprio = ioprio;
3483 ++
3484 ++out:
3485 ++ bfq_put_bfqd_unlock(bfqd, &flags);
3486 ++}
3487 ++
3488 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3489 ++ pid_t pid, int is_sync)
3490 ++{
3491 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3492 ++ INIT_LIST_HEAD(&bfqq->fifo);
3493 ++
3494 ++ atomic_set(&bfqq->ref, 0);
3495 ++ bfqq->bfqd = bfqd;
3496 ++
3497 ++ bfq_mark_bfqq_prio_changed(bfqq);
3498 ++
3499 ++ if (is_sync) {
3500 ++ if (!bfq_class_idle(bfqq))
3501 ++ bfq_mark_bfqq_idle_window(bfqq);
3502 ++ bfq_mark_bfqq_sync(bfqq);
3503 ++ }
3504 ++
3505 ++ /* Tentative initial value to trade off between throughput and latency */
3506 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3507 ++ bfqq->pid = pid;
3508 ++
3509 ++ bfqq->raising_coeff = 1;
3510 ++ bfqq->last_rais_start_finish = 0;
3511 ++ bfqq->soft_rt_next_start = -1;
3512 ++}
3513 ++
3514 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3515 ++ struct bfq_group *bfqg,
3516 ++ int is_sync,
3517 ++ struct bfq_io_cq *bic,
3518 ++ gfp_t gfp_mask)
3519 ++{
3520 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
3521 ++
3522 ++retry:
3523 ++ /* bic always exists here */
3524 ++ bfqq = bic_to_bfqq(bic, is_sync);
3525 ++
3526 ++ /*
3527 ++ * Always try a new alloc if we fall back to the OOM bfqq
3528 ++ * originally, since it should just be a temporary situation.
3529 ++ */
3530 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3531 ++ bfqq = NULL;
3532 ++ if (new_bfqq != NULL) {
3533 ++ bfqq = new_bfqq;
3534 ++ new_bfqq = NULL;
3535 ++ } else if (gfp_mask & __GFP_WAIT) {
3536 ++ spin_unlock_irq(bfqd->queue->queue_lock);
3537 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3538 ++ gfp_mask | __GFP_ZERO,
3539 ++ bfqd->queue->node);
3540 ++ spin_lock_irq(bfqd->queue->queue_lock);
3541 ++ if (new_bfqq != NULL)
3542 ++ goto retry;
3543 ++ } else {
3544 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
3545 ++ gfp_mask | __GFP_ZERO,
3546 ++ bfqd->queue->node);
3547 ++ }
3548 ++
3549 ++ if (bfqq != NULL) {
3550 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3551 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
3552 ++ } else {
3553 ++ bfqq = &bfqd->oom_bfqq;
3554 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3555 ++ }
3556 ++
3557 ++ bfq_init_prio_data(bfqq, bic);
3558 ++ bfq_init_entity(&bfqq->entity, bfqg);
3559 ++ }
3560 ++
3561 ++ if (new_bfqq != NULL)
3562 ++ kmem_cache_free(bfq_pool, new_bfqq);
3563 ++
3564 ++ return bfqq;
3565 ++}
3566 ++
3567 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3568 ++ struct bfq_group *bfqg,
3569 ++ int ioprio_class, int ioprio)
3570 ++{
3571 ++ switch (ioprio_class) {
3572 ++ case IOPRIO_CLASS_RT:
3573 ++ return &bfqg->async_bfqq[0][ioprio];
3574 ++ case IOPRIO_CLASS_NONE:
3575 ++ ioprio = IOPRIO_NORM;
3576 ++ /* fall through */
3577 ++ case IOPRIO_CLASS_BE:
3578 ++ return &bfqg->async_bfqq[1][ioprio];
3579 ++ case IOPRIO_CLASS_IDLE:
3580 ++ return &bfqg->async_idle_bfqq;
3581 ++ default:
3582 ++ BUG();
3583 ++ }
3584 ++}
3585 ++
3586 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3587 ++ struct bfq_group *bfqg, int is_sync,
3588 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
3589 ++{
3590 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3591 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3592 ++ struct bfq_queue **async_bfqq = NULL;
3593 ++ struct bfq_queue *bfqq = NULL;
3594 ++
3595 ++ if (!is_sync) {
3596 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3597 ++ ioprio);
3598 ++ bfqq = *async_bfqq;
3599 ++ }
3600 ++
3601 ++ if (bfqq == NULL)
3602 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3603 ++
3604 ++ /*
3605 ++ * Pin the queue now that it's allocated; scheduler exit will prune it.
3606 ++ */
3607 ++ if (!is_sync && *async_bfqq == NULL) {
3608 ++ atomic_inc(&bfqq->ref);
3609 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3610 ++ bfqq, atomic_read(&bfqq->ref));
3611 ++ *async_bfqq = bfqq;
3612 ++ }
3613 ++
3614 ++ atomic_inc(&bfqq->ref);
3615 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3616 ++ atomic_read(&bfqq->ref));
3617 ++ return bfqq;
3618 ++}
3619 ++
3620 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3621 ++ struct bfq_io_cq *bic)
3622 ++{
3623 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3624 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3625 ++
3626 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3627 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3628 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3629 ++ bic->ttime.ttime_samples;
3630 ++}
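
The arithmetic above is a fixed-point exponentially weighted moving average: each new think-time sample contributes with weight 1/8, the history keeps weight 7/8, and both counters carry a scale factor of 256 so that integer division loses little precision (the sample counter converges towards 256). A minimal user-space sketch of the same update rule, with names chosen here only for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long samples = 0, total = 0, mean;
	unsigned long observed[] = { 4, 4, 4, 12, 12, 12, 12, 12 };
	unsigned int i;

	for (i = 0; i < sizeof(observed) / sizeof(observed[0]); i++) {
		/* Same recurrences as bfq_update_io_thinktime() above. */
		samples = (7 * samples + 256) / 8;
		total = (7 * total + 256 * observed[i]) / 8;
		mean = (total + 128) / samples;
		printf("sample %u (value %lu): mean ~ %lu\n",
		       i + 1, observed[i], mean);
	}
	return 0;
}

The printed mean tracks the shift in the observed values with a lag of a few samples, which is the "recent behaviour wins" property the idle-window logic below relies on.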
3631 ++
3632 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
3633 ++ struct bfq_queue *bfqq,
3634 ++ struct request *rq)
3635 ++{
3636 ++ sector_t sdist;
3637 ++ u64 total;
3638 ++
3639 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
3640 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3641 ++ else
3642 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3643 ++
3644 ++ /*
3645 ++ * Don't allow the seek distance to get too large from the
3646 ++ * odd fragment, pagein, etc.
3647 ++ */
3648 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3649 ++ sdist = 0;
3650 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
3651 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3652 ++ else
3653 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3654 ++
3655 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3656 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3657 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
3658 ++ do_div(total, bfqq->seek_samples);
3659 ++ bfqq->seek_mean = (sector_t)total;
3660 ++
3661 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3662 ++ (u64)bfqq->seek_mean);
3663 ++}
3664 ++
3665 ++/*
3666 ++ * Disable idle window if the process thinks too long or seeks so much that
3667 ++ * it doesn't matter.
3668 ++ */
3669 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
3670 ++ struct bfq_queue *bfqq,
3671 ++ struct bfq_io_cq *bic)
3672 ++{
3673 ++ int enable_idle;
3674 ++
3675 ++ /* Don't idle for async or idle io prio class. */
3676 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3677 ++ return;
3678 ++
3679 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
3680 ++
3681 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3682 ++ bfqd->bfq_slice_idle == 0 ||
3683 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3684 ++ bfqq->raising_coeff == 1))
3685 ++ enable_idle = 0;
3686 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3687 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3688 ++ bfqq->raising_coeff == 1)
3689 ++ enable_idle = 0;
3690 ++ else
3691 ++ enable_idle = 1;
3692 ++ }
3693 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3694 ++ enable_idle);
3695 ++
3696 ++ if (enable_idle)
3697 ++ bfq_mark_bfqq_idle_window(bfqq);
3698 ++ else
3699 ++ bfq_clear_bfqq_idle_window(bfqq);
3700 ++}
3701 ++
3702 ++/*
3703 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
3704 ++ * something we should do about it.
3705 ++ */
3706 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3707 ++ struct request *rq)
3708 ++{
3709 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
3710 ++
3711 ++ if (rq->cmd_flags & REQ_META)
3712 ++ bfqq->meta_pending++;
3713 ++
3714 ++ bfq_update_io_thinktime(bfqd, bic);
3715 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
3716 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3717 ++ !BFQQ_SEEKY(bfqq))
3718 ++ bfq_update_idle_window(bfqd, bfqq, bic);
3719 ++
3720 ++ bfq_log_bfqq(bfqd, bfqq,
3721 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3722 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3723 ++ (long long unsigned)bfqq->seek_mean);
3724 ++
3725 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3726 ++
3727 ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3728 ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3729 ++ blk_rq_sectors(rq) < 32;
3730 ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3731 ++
3732 ++ /*
3733 ++ * There is just this request queued: if the request
3734 ++ * is small and the queue is not to be expired, then
3735 ++ * just exit.
3736 ++ *
3737 ++ * In this way, if the disk is being idled to wait for
3738 ++ * a new request from the in-service queue, we avoid
3739 ++ * unplugging the device and committing the disk to serve
3740 ++ * just a small request. Instead, we wait for
3741 ++ * the block layer to decide when to unplug the device:
3742 ++ * hopefully, new requests will be merged to this one
3743 ++ * quickly, then the device will be unplugged and
3744 ++ * larger requests will be dispatched.
3745 ++ */
3746 ++ if (small_req && !budget_timeout)
3747 ++ return;
3748 ++
3749 ++ /*
3750 ++ * A large enough request arrived, or the queue is to
3751 ++ * be expired: in both cases disk idling is to be
3752 ++ * stopped, so clear wait_request flag and reset
3753 ++ * timer.
3754 ++ */
3755 ++ bfq_clear_bfqq_wait_request(bfqq);
3756 ++ del_timer(&bfqd->idle_slice_timer);
3757 ++
3758 ++ /*
3759 ++ * The queue is not empty, because a new request just
3760 ++ * arrived. Hence we can safely expire the queue, in
3761 ++ * case of budget timeout, without risking that the
3762 ++ * timestamps of the queue are not updated correctly.
3763 ++ * See [1] for more details.
3764 ++ */
3765 ++ if (budget_timeout)
3766 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3767 ++
3768 ++ /*
3769 ++ * Let the request rip immediately, or let a new queue be
3770 ++ * selected if bfqq has just been expired.
3771 ++ */
3772 ++ __blk_run_queue(bfqd->queue);
3773 ++ }
3774 ++}
3775 ++
3776 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
3777 ++{
3778 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3779 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3780 ++
3781 ++ assert_spin_locked(bfqd->queue->queue_lock);
3782 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3783 ++
3784 ++ bfq_add_rq_rb(rq);
3785 ++
3786 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3787 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
3788 ++
3789 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
3790 ++}
3791 ++
3792 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
3793 ++{
3794 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3795 ++ bfqd->rq_in_driver);
3796 ++
3797 ++ if (bfqd->hw_tag == 1)
3798 ++ return;
3799 ++
3800 ++ /*
3801 ++ * This sample is valid if the number of outstanding requests
3802 ++ * is large enough to allow a queueing behavior. Note that the
3803 ++ * sum is not exact, as it's not taking into account deactivated
3804 ++ * requests.
3805 ++ */
3806 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3807 ++ return;
3808 ++
3809 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3810 ++ return;
3811 ++
3812 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3813 ++ bfqd->max_rq_in_driver = 0;
3814 ++ bfqd->hw_tag_samples = 0;
3815 ++}
3816 ++
3817 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
3818 ++{
3819 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3820 ++ struct bfq_data *bfqd = bfqq->bfqd;
3821 ++ const int sync = rq_is_sync(rq);
3822 ++
3823 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3824 ++ blk_rq_sectors(rq), sync);
3825 ++
3826 ++ bfq_update_hw_tag(bfqd);
3827 ++
3828 ++ WARN_ON(!bfqd->rq_in_driver);
3829 ++ WARN_ON(!bfqq->dispatched);
3830 ++ bfqd->rq_in_driver--;
3831 ++ bfqq->dispatched--;
3832 ++
3833 ++ if (bfq_bfqq_sync(bfqq))
3834 ++ bfqd->sync_flight--;
3835 ++
3836 ++ if (sync)
3837 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3838 ++
3839 ++ /*
3840 ++ * The computation of softrt_next_start was scheduled for the next
3841 ++ * request completion: it is now time to compute it.
3842 ++ */
3843 ++ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
3844 ++ bfqq->soft_rt_next_start =
3845 ++ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3846 ++
3847 ++ /*
3848 ++ * If this is the in-service queue, check if it needs to be expired,
3849 ++ * or if we want to idle in case it has no pending requests.
3850 ++ */
3851 ++ if (bfqd->in_service_queue == bfqq) {
3852 ++ if (bfq_bfqq_budget_new(bfqq))
3853 ++ bfq_set_budget_timeout(bfqd);
3854 ++
3855 ++ if (bfq_bfqq_must_idle(bfqq)) {
3856 ++ bfq_arm_slice_timer(bfqd);
3857 ++ goto out;
3858 ++ } else if (bfq_may_expire_for_budg_timeout(bfqq))
3859 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3860 ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3861 ++ (bfqq->dispatched == 0 ||
3862 ++ !bfq_bfqq_must_not_expire(bfqq)))
3863 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
3864 ++ BFQ_BFQQ_NO_MORE_REQUESTS);
3865 ++ }
3866 ++
3867 ++ if (!bfqd->rq_in_driver)
3868 ++ bfq_schedule_dispatch(bfqd);
3869 ++
3870 ++out:
3871 ++ return;
3872 ++}
3873 ++
3874 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3875 ++{
3876 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3877 ++ bfq_clear_bfqq_must_alloc(bfqq);
3878 ++ return ELV_MQUEUE_MUST;
3879 ++ }
3880 ++
3881 ++ return ELV_MQUEUE_MAY;
3882 ++}
3883 ++
3884 ++static int bfq_may_queue(struct request_queue *q, int rw)
3885 ++{
3886 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3887 ++ struct task_struct *tsk = current;
3888 ++ struct bfq_io_cq *bic;
3889 ++ struct bfq_queue *bfqq;
3890 ++
3891 ++ /*
3892 ++ * Don't force setup of a queue from here, as a call to may_queue
3893 ++ * does not necessarily imply that a request actually will be queued.
3894 ++ * So just lookup a possibly existing queue, or return 'may queue'
3895 ++ * if that fails.
3896 ++ */
3897 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3898 ++ if (bic == NULL)
3899 ++ return ELV_MQUEUE_MAY;
3900 ++
3901 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3902 ++ if (bfqq != NULL) {
3903 ++ bfq_init_prio_data(bfqq, bic);
3904 ++
3905 ++ return __bfq_may_queue(bfqq);
3906 ++ }
3907 ++
3908 ++ return ELV_MQUEUE_MAY;
3909 ++}
3910 ++
3911 ++/*
3912 ++ * Queue lock held here.
3913 ++ */
3914 ++static void bfq_put_request(struct request *rq)
3915 ++{
3916 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3917 ++
3918 ++ if (bfqq != NULL) {
3919 ++ const int rw = rq_data_dir(rq);
3920 ++
3921 ++ BUG_ON(!bfqq->allocated[rw]);
3922 ++ bfqq->allocated[rw]--;
3923 ++
3924 ++ rq->elv.priv[0] = NULL;
3925 ++ rq->elv.priv[1] = NULL;
3926 ++
3927 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3928 ++ bfqq, atomic_read(&bfqq->ref));
3929 ++ bfq_put_queue(bfqq);
3930 ++ }
3931 ++}
3932 ++
3933 ++static struct bfq_queue *
3934 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3935 ++ struct bfq_queue *bfqq)
3936 ++{
3937 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3938 ++ (long unsigned)bfqq->new_bfqq->pid);
3939 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3940 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3941 ++ bfq_put_queue(bfqq);
3942 ++ return bic_to_bfqq(bic, 1);
3943 ++}
3944 ++
3945 ++/*
3946 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3947 ++ * was the last process referring to said bfqq.
3948 ++ */
3949 ++static struct bfq_queue *
3950 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3951 ++{
3952 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3953 ++ if (bfqq_process_refs(bfqq) == 1) {
3954 ++ bfqq->pid = current->pid;
3955 ++ bfq_clear_bfqq_coop(bfqq);
3956 ++ bfq_clear_bfqq_split_coop(bfqq);
3957 ++ return bfqq;
3958 ++ }
3959 ++
3960 ++ bic_set_bfqq(bic, NULL, 1);
3961 ++
3962 ++ bfq_put_cooperator(bfqq);
3963 ++
3964 ++ bfq_put_queue(bfqq);
3965 ++ return NULL;
3966 ++}
3967 ++
3968 ++/*
3969 ++ * Allocate bfq data structures associated with this request.
3970 ++ */
3971 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
3972 ++ struct bio *bio, gfp_t gfp_mask)
3973 ++{
3974 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
3975 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3976 ++ const int rw = rq_data_dir(rq);
3977 ++ const int is_sync = rq_is_sync(rq);
3978 ++ struct bfq_queue *bfqq;
3979 ++ struct bfq_group *bfqg;
3980 ++ unsigned long flags;
3981 ++
3982 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
3983 ++
3984 ++ bfq_changed_ioprio(bic);
3985 ++
3986 ++ spin_lock_irqsave(q->queue_lock, flags);
3987 ++
3988 ++ if (bic == NULL)
3989 ++ goto queue_fail;
3990 ++
3991 ++ bfqg = bfq_bic_update_cgroup(bic);
3992 ++
3993 ++new_queue:
3994 ++ bfqq = bic_to_bfqq(bic, is_sync);
3995 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3996 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3997 ++ bic_set_bfqq(bic, bfqq, is_sync);
3998 ++ } else {
3999 ++ /*
4000 ++ * If the queue was seeky for too long, break it apart.
4001 ++ */
4002 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4003 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4004 ++ bfqq = bfq_split_bfqq(bic, bfqq);
4005 ++ if (!bfqq)
4006 ++ goto new_queue;
4007 ++ }
4008 ++
4009 ++ /*
4010 ++ * Check to see if this queue is scheduled to merge with
4011 ++ * another closely cooperating queue. The merging of queues
4012 ++ * happens here as it must be done in process context.
4013 ++ * The reference on new_bfqq was taken in merge_bfqqs.
4014 ++ */
4015 ++ if (bfqq->new_bfqq != NULL)
4016 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
4017 ++ }
4018 ++
4019 ++ bfqq->allocated[rw]++;
4020 ++ atomic_inc(&bfqq->ref);
4021 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
4022 ++ atomic_read(&bfqq->ref));
4023 ++
4024 ++ rq->elv.priv[0] = bic;
4025 ++ rq->elv.priv[1] = bfqq;
4026 ++
4027 ++ spin_unlock_irqrestore(q->queue_lock, flags);
4028 ++
4029 ++ return 0;
4030 ++
4031 ++queue_fail:
4032 ++ bfq_schedule_dispatch(bfqd);
4033 ++ spin_unlock_irqrestore(q->queue_lock, flags);
4034 ++
4035 ++ return 1;
4036 ++}
4037 ++
4038 ++static void bfq_kick_queue(struct work_struct *work)
4039 ++{
4040 ++ struct bfq_data *bfqd =
4041 ++ container_of(work, struct bfq_data, unplug_work);
4042 ++ struct request_queue *q = bfqd->queue;
4043 ++
4044 ++ spin_lock_irq(q->queue_lock);
4045 ++ __blk_run_queue(q);
4046 ++ spin_unlock_irq(q->queue_lock);
4047 ++}
4048 ++
4049 ++/*
4050 ++ * Handler of the expiration of the timer running if the in-service queue
4051 ++ * is idling inside its time slice.
4052 ++ */
4053 ++static void bfq_idle_slice_timer(unsigned long data)
4054 ++{
4055 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
4056 ++ struct bfq_queue *bfqq;
4057 ++ unsigned long flags;
4058 ++ enum bfqq_expiration reason;
4059 ++
4060 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
4061 ++
4062 ++ bfqq = bfqd->in_service_queue;
4063 ++ /*
4064 ++ * Theoretical race here: the in-service queue can be NULL or different
4065 ++ * from the queue that was idling if the timer handler spins on
4066 ++ * the queue_lock and a new request arrives for the current
4067 ++ * queue and there is a full dispatch cycle that changes the
4068 ++ * in-service queue. This can hardly happen, but in the worst case
4069 ++ * we just expire a queue too early.
4070 ++ */
4071 ++ if (bfqq != NULL) {
4072 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
4073 ++ if (bfq_bfqq_budget_timeout(bfqq))
4074 ++ /*
4075 ++ * Also here the queue can be safely expired
4076 ++ * for budget timeout without wasting
4077 ++ * guarantees
4078 ++ */
4079 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
4080 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4081 ++ /*
4082 ++ * The queue may not be empty upon timer expiration,
4083 ++ * because we may not disable the timer when the first
4084 ++ * request of the in-service queue arrives during
4085 ++ * disk idling
4086 ++ */
4087 ++ reason = BFQ_BFQQ_TOO_IDLE;
4088 ++ else
4089 ++ goto schedule_dispatch;
4090 ++
4091 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
4092 ++ }
4093 ++
4094 ++schedule_dispatch:
4095 ++ bfq_schedule_dispatch(bfqd);
4096 ++
4097 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
4098 ++}
4099 ++
4100 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
4101 ++{
4102 ++ del_timer_sync(&bfqd->idle_slice_timer);
4103 ++ cancel_work_sync(&bfqd->unplug_work);
4104 ++}
4105 ++
4106 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4107 ++ struct bfq_queue **bfqq_ptr)
4108 ++{
4109 ++ struct bfq_group *root_group = bfqd->root_group;
4110 ++ struct bfq_queue *bfqq = *bfqq_ptr;
4111 ++
4112 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4113 ++ if (bfqq != NULL) {
4114 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
4115 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4116 ++ bfqq, atomic_read(&bfqq->ref));
4117 ++ bfq_put_queue(bfqq);
4118 ++ *bfqq_ptr = NULL;
4119 ++ }
4120 ++}
4121 ++
4122 ++/*
4123 ++ * Release all the bfqg references to its async queues. If we are
4124 ++ * deallocating the group, these queues may still contain requests, so
4125 ++ * we reparent them to the root cgroup (i.e., the only one that will
4126 ++ * exist for sure until all the requests on a device are gone).
4127 ++ */
4128 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4129 ++{
4130 ++ int i, j;
4131 ++
4132 ++ for (i = 0; i < 2; i++)
4133 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
4134 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4135 ++
4136 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4137 ++}
4138 ++
4139 ++static void bfq_exit_queue(struct elevator_queue *e)
4140 ++{
4141 ++ struct bfq_data *bfqd = e->elevator_data;
4142 ++ struct request_queue *q = bfqd->queue;
4143 ++ struct bfq_queue *bfqq, *n;
4144 ++
4145 ++ bfq_shutdown_timer_wq(bfqd);
4146 ++
4147 ++ spin_lock_irq(q->queue_lock);
4148 ++
4149 ++ BUG_ON(bfqd->in_service_queue != NULL);
4150 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4151 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
4152 ++
4153 ++ bfq_disconnect_groups(bfqd);
4154 ++ spin_unlock_irq(q->queue_lock);
4155 ++
4156 ++ bfq_shutdown_timer_wq(bfqd);
4157 ++
4158 ++ synchronize_rcu();
4159 ++
4160 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
4161 ++
4162 ++ bfq_free_root_group(bfqd);
4163 ++ kfree(bfqd);
4164 ++}
4165 ++
4166 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4167 ++{
4168 ++ struct bfq_group *bfqg;
4169 ++ struct bfq_data *bfqd;
4170 ++ struct elevator_queue *eq;
4171 ++
4172 ++ eq = elevator_alloc(q, e);
4173 ++ if (eq == NULL)
4174 ++ return -ENOMEM;
4175 ++
4176 ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4177 ++ if (bfqd == NULL) {
4178 ++ kobject_put(&eq->kobj);
4179 ++ return -ENOMEM;
4180 ++ }
4181 ++ eq->elevator_data = bfqd;
4182 ++
4183 ++ /*
4184 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4185 ++ * Grab a permanent reference to it, so that the normal code flow
4186 ++ * will not attempt to free it.
4187 ++ */
4188 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
4189 ++ atomic_inc(&bfqd->oom_bfqq.ref);
4190 ++
4191 ++ bfqd->queue = q;
4192 ++
4193 ++ spin_lock_irq(q->queue_lock);
4194 ++ q->elevator = eq;
4195 ++ spin_unlock_irq(q->queue_lock);
4196 ++
4197 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
4198 ++ if (bfqg == NULL) {
4199 ++ kfree(bfqd);
4200 ++ kobject_put(&eq->kobj);
4201 ++ return -ENOMEM;
4202 ++ }
4203 ++
4204 ++ bfqd->root_group = bfqg;
4205 ++
4206 ++ init_timer(&bfqd->idle_slice_timer);
4207 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4208 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4209 ++
4210 ++ bfqd->rq_pos_tree = RB_ROOT;
4211 ++
4212 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4213 ++
4214 ++ INIT_LIST_HEAD(&bfqd->active_list);
4215 ++ INIT_LIST_HEAD(&bfqd->idle_list);
4216 ++
4217 ++ bfqd->hw_tag = -1;
4218 ++
4219 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
4220 ++
4221 ++ bfqd->bfq_quantum = bfq_quantum;
4222 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4223 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4224 ++ bfqd->bfq_back_max = bfq_back_max;
4225 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
4226 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
4227 ++ bfqd->bfq_class_idle_last_service = 0;
4228 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4229 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4230 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4231 ++
4232 ++ bfqd->low_latency = true;
4233 ++
4234 ++ bfqd->bfq_raising_coeff = 20;
4235 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4236 ++ bfqd->bfq_raising_max_time = 0;
4237 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4238 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4239 ++ bfqd->bfq_raising_max_softrt_rate = 7000; /*
4240 ++ * Approximate rate required
4241 ++ * to play back or record a
4242 ++ * high-definition compressed
4243 ++ * video.
4244 ++ */
4245 ++ bfqd->raised_busy_queues = 0;
4246 ++
4247 ++ /* Initially estimate the device's peak rate as the reference rate */
4248 ++ if (blk_queue_nonrot(bfqd->queue)) {
4249 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
4250 ++ bfqd->peak_rate = R_nonrot;
4251 ++ } else {
4252 ++ bfqd->RT_prod = R_rot * T_rot;
4253 ++ bfqd->peak_rate = R_rot;
4254 ++ }
4255 ++
4256 ++ return 0;
4257 ++}
4258 ++
4259 ++static void bfq_slab_kill(void)
4260 ++{
4261 ++ if (bfq_pool != NULL)
4262 ++ kmem_cache_destroy(bfq_pool);
4263 ++}
4264 ++
4265 ++static int __init bfq_slab_setup(void)
4266 ++{
4267 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4268 ++ if (bfq_pool == NULL)
4269 ++ return -ENOMEM;
4270 ++ return 0;
4271 ++}
4272 ++
4273 ++static ssize_t bfq_var_show(unsigned int var, char *page)
4274 ++{
4275 ++ return sprintf(page, "%d\n", var);
4276 ++}
4277 ++
4278 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4279 ++{
4280 ++ unsigned long new_val;
4281 ++ int ret = kstrtoul(page, 10, &new_val);
4282 ++
4283 ++ if (ret == 0)
4284 ++ *var = new_val;
4285 ++
4286 ++ return count;
4287 ++}
4288 ++
4289 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4290 ++{
4291 ++ struct bfq_data *bfqd = e->elevator_data;
4292 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4293 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4294 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4295 ++}
4296 ++
4297 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4298 ++{
4299 ++ struct bfq_queue *bfqq;
4300 ++ struct bfq_data *bfqd = e->elevator_data;
4301 ++ ssize_t num_char = 0;
4302 ++
4303 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4304 ++ bfqd->queued);
4305 ++
4306 ++ spin_lock_irq(bfqd->queue->queue_lock);
4307 ++
4308 ++ num_char += sprintf(page + num_char, "Active:\n");
4309 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4310 ++ num_char += sprintf(page + num_char,
4311 ++ "pid%d: weight %hu, nr_queued %d %d,"
4312 ++ " dur %d/%u\n",
4313 ++ bfqq->pid,
4314 ++ bfqq->entity.weight,
4315 ++ bfqq->queued[0],
4316 ++ bfqq->queued[1],
4317 ++ jiffies_to_msecs(jiffies -
4318 ++ bfqq->last_rais_start_finish),
4319 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
4320 ++ }
4321 ++
4322 ++ num_char += sprintf(page + num_char, "Idle:\n");
4323 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4324 ++ num_char += sprintf(page + num_char,
4325 ++ "pid%d: weight %hu, dur %d/%u\n",
4326 ++ bfqq->pid,
4327 ++ bfqq->entity.weight,
4328 ++ jiffies_to_msecs(jiffies -
4329 ++ bfqq->last_rais_start_finish),
4330 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
4331 ++ }
4332 ++
4333 ++ spin_unlock_irq(bfqd->queue->queue_lock);
4334 ++
4335 ++ return num_char;
4336 ++}
4337 ++
4338 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4339 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4340 ++{ \
4341 ++ struct bfq_data *bfqd = e->elevator_data; \
4342 ++ unsigned int __data = __VAR; \
4343 ++ if (__CONV) \
4344 ++ __data = jiffies_to_msecs(__data); \
4345 ++ return bfq_var_show(__data, (page)); \
4346 ++}
4347 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4348 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4349 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4350 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4351 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4352 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4353 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4354 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4355 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4356 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4357 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4358 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4359 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4360 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4361 ++ 1);
4362 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4363 ++ bfqd->bfq_raising_min_inter_arr_async,
4364 ++ 1);
4365 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4366 ++ bfqd->bfq_raising_max_softrt_rate, 0);
4367 ++#undef SHOW_FUNCTION
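
To make the generated sysfs handlers easier to follow, this is what one instance of the macro, SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0), expands to (hand-expanded, whitespace aside):

static ssize_t bfq_quantum_show(struct elevator_queue *e, char *page)
{
	struct bfq_data *bfqd = e->elevator_data;
	unsigned int __data = bfqd->bfq_quantum;
	if (0)
		__data = jiffies_to_msecs(__data);
	return bfq_var_show(__data, (page));
}

The __CONV argument only decides whether the stored jiffies value is converted to milliseconds before being shown; the compiler drops the dead branch when it is 0.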
4368 ++
4369 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4370 ++static ssize_t \
4371 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4372 ++{ \
4373 ++ struct bfq_data *bfqd = e->elevator_data; \
4374 ++ unsigned long uninitialized_var(__data); \
4375 ++ int ret = bfq_var_store(&__data, (page), count); \
4376 ++ if (__data < (MIN)) \
4377 ++ __data = (MIN); \
4378 ++ else if (__data > (MAX)) \
4379 ++ __data = (MAX); \
4380 ++ if (__CONV) \
4381 ++ *(__PTR) = msecs_to_jiffies(__data); \
4382 ++ else \
4383 ++ *(__PTR) = __data; \
4384 ++ return ret; \
4385 ++}
4386 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4387 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4388 ++ INT_MAX, 1);
4389 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4390 ++ INT_MAX, 1);
4391 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4392 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4393 ++ INT_MAX, 0);
4394 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4395 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4396 ++ 1, INT_MAX, 0);
4397 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4398 ++ INT_MAX, 1);
4399 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4400 ++ INT_MAX, 0);
4401 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4402 ++ INT_MAX, 1);
4403 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4404 ++ INT_MAX, 1);
4405 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
4406 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4407 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4408 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4409 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4410 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4411 ++#undef STORE_FUNCTION
4412 ++
4413 ++/* do nothing for the moment */
4414 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
4415 ++ const char *page, size_t count)
4416 ++{
4417 ++ return count;
4418 ++}
4419 ++
4420 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4421 ++{
4422 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4423 ++
4424 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4425 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4426 ++ else
4427 ++ return bfq_default_max_budget;
4428 ++}
4429 ++
4430 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4431 ++ const char *page, size_t count)
4432 ++{
4433 ++ struct bfq_data *bfqd = e->elevator_data;
4434 ++ unsigned long uninitialized_var(__data);
4435 ++ int ret = bfq_var_store(&__data, (page), count);
4436 ++
4437 ++ if (__data == 0)
4438 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4439 ++ else {
4440 ++ if (__data > INT_MAX)
4441 ++ __data = INT_MAX;
4442 ++ bfqd->bfq_max_budget = __data;
4443 ++ }
4444 ++
4445 ++ bfqd->bfq_user_max_budget = __data;
4446 ++
4447 ++ return ret;
4448 ++}
4449 ++
4450 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4451 ++ const char *page, size_t count)
4452 ++{
4453 ++ struct bfq_data *bfqd = e->elevator_data;
4454 ++ unsigned long uninitialized_var(__data);
4455 ++ int ret = bfq_var_store(&__data, (page), count);
4456 ++
4457 ++ if (__data < 1)
4458 ++ __data = 1;
4459 ++ else if (__data > INT_MAX)
4460 ++ __data = INT_MAX;
4461 ++
4462 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4463 ++ if (bfqd->bfq_user_max_budget == 0)
4464 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4465 ++
4466 ++ return ret;
4467 ++}
4468 ++
4469 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4470 ++ const char *page, size_t count)
4471 ++{
4472 ++ struct bfq_data *bfqd = e->elevator_data;
4473 ++ unsigned long uninitialized_var(__data);
4474 ++ int ret = bfq_var_store(&__data, (page), count);
4475 ++
4476 ++ if (__data > 1)
4477 ++ __data = 1;
4478 ++ if (__data == 0 && bfqd->low_latency != 0)
4479 ++ bfq_end_raising(bfqd);
4480 ++ bfqd->low_latency = __data;
4481 ++
4482 ++ return ret;
4483 ++}
4484 ++
4485 ++#define BFQ_ATTR(name) \
4486 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4487 ++
4488 ++static struct elv_fs_entry bfq_attrs[] = {
4489 ++ BFQ_ATTR(quantum),
4490 ++ BFQ_ATTR(fifo_expire_sync),
4491 ++ BFQ_ATTR(fifo_expire_async),
4492 ++ BFQ_ATTR(back_seek_max),
4493 ++ BFQ_ATTR(back_seek_penalty),
4494 ++ BFQ_ATTR(slice_idle),
4495 ++ BFQ_ATTR(max_budget),
4496 ++ BFQ_ATTR(max_budget_async_rq),
4497 ++ BFQ_ATTR(timeout_sync),
4498 ++ BFQ_ATTR(timeout_async),
4499 ++ BFQ_ATTR(low_latency),
4500 ++ BFQ_ATTR(raising_coeff),
4501 ++ BFQ_ATTR(raising_max_time),
4502 ++ BFQ_ATTR(raising_rt_max_time),
4503 ++ BFQ_ATTR(raising_min_idle_time),
4504 ++ BFQ_ATTR(raising_min_inter_arr_async),
4505 ++ BFQ_ATTR(raising_max_softrt_rate),
4506 ++ BFQ_ATTR(weights),
4507 ++ __ATTR_NULL
4508 ++};
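
Each BFQ_ATTR(name) entry wires the matching bfq_<name>_show/bfq_<name>_store pair defined above to a sysfs attribute; for instance BFQ_ATTR(quantum) expands to

	__ATTR(quantum, S_IRUGO|S_IWUSR, bfq_quantum_show, bfq_quantum_store)

so, once bfq is the active elevator, the tunable appears as a read/write file named "quantum" in the scheduler's iosched directory of the device's queue sysfs tree (typically /sys/block/<dev>/queue/iosched/).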
4509 ++
4510 ++static struct elevator_type iosched_bfq = {
4511 ++ .ops = {
4512 ++ .elevator_merge_fn = bfq_merge,
4513 ++ .elevator_merged_fn = bfq_merged_request,
4514 ++ .elevator_merge_req_fn = bfq_merged_requests,
4515 ++ .elevator_allow_merge_fn = bfq_allow_merge,
4516 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
4517 ++ .elevator_add_req_fn = bfq_insert_request,
4518 ++ .elevator_activate_req_fn = bfq_activate_request,
4519 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
4520 ++ .elevator_completed_req_fn = bfq_completed_request,
4521 ++ .elevator_former_req_fn = elv_rb_former_request,
4522 ++ .elevator_latter_req_fn = elv_rb_latter_request,
4523 ++ .elevator_init_icq_fn = bfq_init_icq,
4524 ++ .elevator_exit_icq_fn = bfq_exit_icq,
4525 ++ .elevator_set_req_fn = bfq_set_request,
4526 ++ .elevator_put_req_fn = bfq_put_request,
4527 ++ .elevator_may_queue_fn = bfq_may_queue,
4528 ++ .elevator_init_fn = bfq_init_queue,
4529 ++ .elevator_exit_fn = bfq_exit_queue,
4530 ++ },
4531 ++ .icq_size = sizeof(struct bfq_io_cq),
4532 ++ .icq_align = __alignof__(struct bfq_io_cq),
4533 ++ .elevator_attrs = bfq_attrs,
4534 ++ .elevator_name = "bfq",
4535 ++ .elevator_owner = THIS_MODULE,
4536 ++};
4537 ++
4538 ++static int __init bfq_init(void)
4539 ++{
4540 ++ /*
4541 ++ * Can be 0 on HZ < 1000 setups.
4542 ++ */
4543 ++ if (bfq_slice_idle == 0)
4544 ++ bfq_slice_idle = 1;
4545 ++
4546 ++ if (bfq_timeout_async == 0)
4547 ++ bfq_timeout_async = 1;
4548 ++
4549 ++ if (bfq_slab_setup())
4550 ++ return -ENOMEM;
4551 ++
4552 ++ elv_register(&iosched_bfq);
4553 ++ printk(KERN_INFO "BFQ I/O-scheduler version: v7\n");
4554 ++
4555 ++ return 0;
4556 ++}
4557 ++
4558 ++static void __exit bfq_exit(void)
4559 ++{
4560 ++ elv_unregister(&iosched_bfq);
4561 ++ bfq_slab_kill();
4562 ++}
4563 ++
4564 ++module_init(bfq_init);
4565 ++module_exit(bfq_exit);
4566 ++
4567 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4568 ++MODULE_LICENSE("GPL");
4569 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4570 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4571 +new file mode 100644
4572 +index 0000000..30df81c
4573 +--- /dev/null
4574 ++++ b/block/bfq-sched.c
4575 +@@ -0,0 +1,1077 @@
4576 ++/*
4577 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
4578 ++ *
4579 ++ * Based on ideas and code from CFQ:
4580 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4581 ++ *
4582 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4583 ++ * Paolo Valente <paolo.valente@×××××××.it>
4584 ++ *
4585 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4586 ++ */
4587 ++
4588 ++#ifdef CONFIG_CGROUP_BFQIO
4589 ++#define for_each_entity(entity) \
4590 ++ for (; entity != NULL; entity = entity->parent)
4591 ++
4592 ++#define for_each_entity_safe(entity, parent) \
4593 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4594 ++
4595 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4596 ++ int extract,
4597 ++ struct bfq_data *bfqd);
4598 ++
4599 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
4600 ++{
4601 ++ struct bfq_entity *bfqg_entity;
4602 ++ struct bfq_group *bfqg;
4603 ++ struct bfq_sched_data *group_sd;
4604 ++
4605 ++ BUG_ON(next_active == NULL);
4606 ++
4607 ++ group_sd = next_active->sched_data;
4608 ++
4609 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4610 ++ /*
4611 ++ * bfq_group's my_entity field is not NULL only if the group
4612 ++ * is not the root group. We must not touch the root entity
4613 ++ * as it must never become an active entity.
4614 ++ */
4615 ++ bfqg_entity = bfqg->my_entity;
4616 ++ if (bfqg_entity != NULL)
4617 ++ bfqg_entity->budget = next_active->budget;
4618 ++}
4619 ++
4620 ++static int bfq_update_next_active(struct bfq_sched_data *sd)
4621 ++{
4622 ++ struct bfq_entity *next_active;
4623 ++
4624 ++ if (sd->active_entity != NULL)
4625 ++ /* will update/requeue at the end of service */
4626 ++ return 0;
4627 ++
4628 ++ /*
4629 ++ * NOTE: this can be improved in many ways, such as returning
4630 ++ * 1 (and thus propagating upwards the update) only when the
4631 ++ * budget changes, or caching the bfqq that will be scheduled
4632 ++ * next from this subtree. For now we worry more about
4633 ++ * correctness than about performance...
4634 ++ */
4635 ++ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4636 ++ sd->next_active = next_active;
4637 ++
4638 ++ if (next_active != NULL)
4639 ++ bfq_update_budget(next_active);
4640 ++
4641 ++ return 1;
4642 ++}
4643 ++
4644 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4645 ++ struct bfq_entity *entity)
4646 ++{
4647 ++ BUG_ON(sd->next_active != entity);
4648 ++}
4649 ++#else
4650 ++#define for_each_entity(entity) \
4651 ++ for (; entity != NULL; entity = NULL)
4652 ++
4653 ++#define for_each_entity_safe(entity, parent) \
4654 ++ for (parent = NULL; entity != NULL; entity = parent)
4655 ++
4656 ++static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4657 ++{
4658 ++ return 0;
4659 ++}
4660 ++
4661 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4662 ++ struct bfq_entity *entity)
4663 ++{
4664 ++}
4665 ++
4666 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
4667 ++{
4668 ++}
4669 ++#endif
4670 ++
4671 ++/*
4672 ++ * Shift for timestamp calculations. This actually limits the maximum
4673 ++ * service allowed in one timestamp delta (small shift values increase it),
4674 ++ * the maximum total weight that can be used for the queues in the system
4675 ++ * (big shift values increase it), and the period of virtual time wraparounds.
4676 ++ */
4677 ++#define WFQ_SERVICE_SHIFT 22
4678 ++
4679 ++/**
4680 ++ * bfq_gt - compare two timestamps.
4681 ++ * @a: first ts.
4682 ++ * @b: second ts.
4683 ++ *
4684 ++ * Return @a > @b, dealing with wrapping correctly.
4685 ++ */
4686 ++static inline int bfq_gt(u64 a, u64 b)
4687 ++{
4688 ++ return (s64)(a - b) > 0;
4689 ++}
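
bfq_gt() uses the usual wrap-safe ordering trick: take the difference in unsigned arithmetic and reinterpret it as signed, so "a is later than b" stays correct even across a wraparound of the 64-bit virtual time. A standalone user-space sketch (ts_gt is just an illustrative stand-in for bfq_gt):

#include <stdint.h>
#include <stdio.h>

static int ts_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

int main(void)
{
	uint64_t before_wrap = UINT64_MAX - 5;	/* timestamp just before wraparound */
	uint64_t after_wrap = 10;		/* timestamp just after wraparound */

	printf("%d\n", ts_gt(after_wrap, before_wrap));	/* 1: still ordered correctly */
	printf("%d\n", ts_gt(before_wrap, after_wrap));	/* 0 */
	return 0;
}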
4690 ++
4691 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4692 ++{
4693 ++ struct bfq_queue *bfqq = NULL;
4694 ++
4695 ++ BUG_ON(entity == NULL);
4696 ++
4697 ++ if (entity->my_sched_data == NULL)
4698 ++ bfqq = container_of(entity, struct bfq_queue, entity);
4699 ++
4700 ++ return bfqq;
4701 ++}
4702 ++
4703 ++
4704 ++/**
4705 ++ * bfq_delta - map service into the virtual time domain.
4706 ++ * @service: amount of service.
4707 ++ * @weight: scale factor (weight of an entity or weight sum).
4708 ++ */
4709 ++static inline u64 bfq_delta(unsigned long service,
4710 ++ unsigned long weight)
4711 ++{
4712 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4713 ++
4714 ++ do_div(d, weight);
4715 ++ return d;
4716 ++}
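
bfq_delta() is the B-WF2Q+ mapping from service to virtual time: the charged service is scaled by 2^WFQ_SERVICE_SHIFT and divided by the weight, so for the same amount of service an entity with twice the weight advances its timestamps half as fast, which is what grants it twice the share. A standalone sketch (user-space C, delta() is an illustrative stand-in):

#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22

static uint64_t delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	/* Same service, two different weights. */
	printf("weight 10: %llu\n", (unsigned long long)delta(100, 10));
	printf("weight 20: %llu\n", (unsigned long long)delta(100, 20));
	/* The second number is exactly half the first. */
	return 0;
}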
4717 ++
4718 ++/**
4719 ++ * bfq_calc_finish - assign the finish time to an entity.
4720 ++ * @entity: the entity to act upon.
4721 ++ * @service: the service to be charged to the entity.
4722 ++ */
4723 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
4724 ++ unsigned long service)
4725 ++{
4726 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4727 ++
4728 ++ BUG_ON(entity->weight == 0);
4729 ++
4730 ++ entity->finish = entity->start +
4731 ++ bfq_delta(service, entity->weight);
4732 ++
4733 ++ if (bfqq != NULL) {
4734 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
4735 ++ "calc_finish: serv %lu, w %d",
4736 ++ service, entity->weight);
4737 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
4738 ++ "calc_finish: start %llu, finish %llu, delta %llu",
4739 ++ entity->start, entity->finish,
4740 ++ bfq_delta(service, entity->weight));
4741 ++ }
4742 ++}
4743 ++
4744 ++/**
4745 ++ * bfq_entity_of - get an entity from a node.
4746 ++ * @node: the node field of the entity.
4747 ++ *
4748 ++ * Convert a node pointer to the corresponding entity. This is used only
4749 ++ * to simplify the logic of some functions and not as the generic
4750 ++ * conversion mechanism because, e.g., in the tree walking functions,
4751 ++ * the check for a %NULL value would be redundant.
4752 ++ */
4753 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4754 ++{
4755 ++ struct bfq_entity *entity = NULL;
4756 ++
4757 ++ if (node != NULL)
4758 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
4759 ++
4760 ++ return entity;
4761 ++}
4762 ++
4763 ++/**
4764 ++ * bfq_extract - remove an entity from a tree.
4765 ++ * @root: the tree root.
4766 ++ * @entity: the entity to remove.
4767 ++ */
4768 ++static inline void bfq_extract(struct rb_root *root,
4769 ++ struct bfq_entity *entity)
4770 ++{
4771 ++ BUG_ON(entity->tree != root);
4772 ++
4773 ++ entity->tree = NULL;
4774 ++ rb_erase(&entity->rb_node, root);
4775 ++}
4776 ++
4777 ++/**
4778 ++ * bfq_idle_extract - extract an entity from the idle tree.
4779 ++ * @st: the service tree of the owning @entity.
4780 ++ * @entity: the entity being removed.
4781 ++ */
4782 ++static void bfq_idle_extract(struct bfq_service_tree *st,
4783 ++ struct bfq_entity *entity)
4784 ++{
4785 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4786 ++ struct rb_node *next;
4787 ++
4788 ++ BUG_ON(entity->tree != &st->idle);
4789 ++
4790 ++ if (entity == st->first_idle) {
4791 ++ next = rb_next(&entity->rb_node);
4792 ++ st->first_idle = bfq_entity_of(next);
4793 ++ }
4794 ++
4795 ++ if (entity == st->last_idle) {
4796 ++ next = rb_prev(&entity->rb_node);
4797 ++ st->last_idle = bfq_entity_of(next);
4798 ++ }
4799 ++
4800 ++ bfq_extract(&st->idle, entity);
4801 ++
4802 ++ if (bfqq != NULL)
4803 ++ list_del(&bfqq->bfqq_list);
4804 ++}
4805 ++
4806 ++/**
4807 ++ * bfq_insert - generic tree insertion.
4808 ++ * @root: tree root.
4809 ++ * @entity: entity to insert.
4810 ++ *
4811 ++ * This is used for the idle and the active tree, since they are both
4812 ++ * ordered by finish time.
4813 ++ */
4814 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4815 ++{
4816 ++ struct bfq_entity *entry;
4817 ++ struct rb_node **node = &root->rb_node;
4818 ++ struct rb_node *parent = NULL;
4819 ++
4820 ++ BUG_ON(entity->tree != NULL);
4821 ++
4822 ++ while (*node != NULL) {
4823 ++ parent = *node;
4824 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
4825 ++
4826 ++ if (bfq_gt(entry->finish, entity->finish))
4827 ++ node = &parent->rb_left;
4828 ++ else
4829 ++ node = &parent->rb_right;
4830 ++ }
4831 ++
4832 ++ rb_link_node(&entity->rb_node, parent, node);
4833 ++ rb_insert_color(&entity->rb_node, root);
4834 ++
4835 ++ entity->tree = root;
4836 ++}
4837 ++
4838 ++/**
4839 ++ * bfq_update_min - update the min_start field of an entity.
4840 ++ * @entity: the entity to update.
4841 ++ * @node: one of its children.
4842 ++ *
4843 ++ * This function is called when @entity may store an invalid value for
4844 ++ * min_start due to updates to the active tree. The function assumes
4845 ++ * that the subtree rooted at @node (which may be its left or its right
4846 ++ * child) has a valid min_start value.
4847 ++ */
4848 ++static inline void bfq_update_min(struct bfq_entity *entity,
4849 ++ struct rb_node *node)
4850 ++{
4851 ++ struct bfq_entity *child;
4852 ++
4853 ++ if (node != NULL) {
4854 ++ child = rb_entry(node, struct bfq_entity, rb_node);
4855 ++ if (bfq_gt(entity->min_start, child->min_start))
4856 ++ entity->min_start = child->min_start;
4857 ++ }
4858 ++}
4859 ++
4860 ++/**
4861 ++ * bfq_update_active_node - recalculate min_start.
4862 ++ * @node: the node to update.
4863 ++ *
4864 ++ * @node may have changed position or one of its children may have moved;
4865 ++ * this function updates its min_start value. The left and right subtrees
4866 ++ * are assumed to hold a correct min_start value.
4867 ++ */
4868 ++static inline void bfq_update_active_node(struct rb_node *node)
4869 ++{
4870 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4871 ++
4872 ++ entity->min_start = entity->start;
4873 ++ bfq_update_min(entity, node->rb_right);
4874 ++ bfq_update_min(entity, node->rb_left);
4875 ++}
4876 ++
4877 ++/**
4878 ++ * bfq_update_active_tree - update min_start for the whole active tree.
4879 ++ * @node: the starting node.
4880 ++ *
4881 ++ * @node must be the deepest modified node after an update. This function
4882 ++ * updates its min_start using the values held by its children, assuming
4883 ++ * that they did not change, and then updates all the nodes that may have
4884 ++ * changed in the path to the root. The only nodes that may have changed
4885 ++ * are the ones in the path or their siblings.
4886 ++ */
4887 ++static void bfq_update_active_tree(struct rb_node *node)
4888 ++{
4889 ++ struct rb_node *parent;
4890 ++
4891 ++up:
4892 ++ bfq_update_active_node(node);
4893 ++
4894 ++ parent = rb_parent(node);
4895 ++ if (parent == NULL)
4896 ++ return;
4897 ++
4898 ++ if (node == parent->rb_left && parent->rb_right != NULL)
4899 ++ bfq_update_active_node(parent->rb_right);
4900 ++ else if (parent->rb_left != NULL)
4901 ++ bfq_update_active_node(parent->rb_left);
4902 ++
4903 ++ node = parent;
4904 ++ goto up;
4905 ++}
4906 ++
4907 ++/**
4908 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
4909 ++ * @st: the service tree of the entity.
4910 ++ * @entity: the entity being inserted.
4911 ++ *
4912 ++ * The active tree is ordered by finish time, but an extra key is kept
4913 ++ * for each node, containing the minimum value for the start times of
4914 ++ * its children (and the node itself), so it's possible to search for
4915 ++ * the eligible node with the lowest finish time in logarithmic time.
4916 ++ */
4917 ++static void bfq_active_insert(struct bfq_service_tree *st,
4918 ++ struct bfq_entity *entity)
4919 ++{
4920 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4921 ++ struct rb_node *node = &entity->rb_node;
4922 ++
4923 ++ bfq_insert(&st->active, entity);
4924 ++
4925 ++ if (node->rb_left != NULL)
4926 ++ node = node->rb_left;
4927 ++ else if (node->rb_right != NULL)
4928 ++ node = node->rb_right;
4929 ++
4930 ++ bfq_update_active_tree(node);
4931 ++
4932 ++ if (bfqq != NULL)
4933 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4934 ++}
4935 ++
4936 ++/**
4937 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4938 ++ * @ioprio: the ioprio value to convert.
4939 ++ */
4940 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
4941 ++{
4942 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4943 ++ return IOPRIO_BE_NR - ioprio;
4944 ++}
4945 ++
4946 ++/**
4947 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4948 ++ * @weight: the weight value to convert.
4949 ++ *
4950 ++ * To preserve as much as possible the old only-ioprio user interface,
4951 ++ * 0 is used as an escape ioprio value for weights (numerically) equal
4952 ++ * to or larger than IOPRIO_BE_NR.
4953 ++ */
4954 ++static unsigned short bfq_weight_to_ioprio(int weight)
4955 ++{
4956 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4957 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4958 ++}
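/*
 * Mapping sketch (assuming the usual IOPRIO_BE_NR of 8): ioprios 0..7 map
 * to weights 8..1, so bfq_ioprio_to_weight(0) == 8 (highest priority,
 * largest share) and bfq_ioprio_to_weight(7) == 1. In the other
 * direction, bfq_weight_to_ioprio(6) == 2, while any weight greater than
 * or equal to IOPRIO_BE_NR collapses to the escape ioprio 0, e.g.,
 * bfq_weight_to_ioprio(100) == 0.
 */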
4959 ++
4960 ++static inline void bfq_get_entity(struct bfq_entity *entity)
4961 ++{
4962 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4963 ++ struct bfq_sched_data *sd;
4964 ++
4965 ++ if (bfqq != NULL) {
4966 ++ sd = entity->sched_data;
4967 ++ atomic_inc(&bfqq->ref);
4968 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4969 ++ bfqq, atomic_read(&bfqq->ref));
4970 ++ }
4971 ++}
4972 ++
4973 ++/**
4974 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
4975 ++ * @node: the node being removed.
4976 ++ *
4977 ++ * Do the first step of an extraction in an rb tree, looking for the
4978 ++ * node that will replace @node, and returning the deepest node that
4979 ++ * the following modifications to the tree can touch. If @node is the
4980 ++ * last node in the tree return %NULL.
4981 ++ */
4982 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
4983 ++{
4984 ++ struct rb_node *deepest;
4985 ++
4986 ++ if (node->rb_right == NULL && node->rb_left == NULL)
4987 ++ deepest = rb_parent(node);
4988 ++ else if (node->rb_right == NULL)
4989 ++ deepest = node->rb_left;
4990 ++ else if (node->rb_left == NULL)
4991 ++ deepest = node->rb_right;
4992 ++ else {
4993 ++ deepest = rb_next(node);
4994 ++ if (deepest->rb_right != NULL)
4995 ++ deepest = deepest->rb_right;
4996 ++ else if (rb_parent(deepest) != node)
4997 ++ deepest = rb_parent(deepest);
4998 ++ }
4999 ++
5000 ++ return deepest;
5001 ++}
5002 ++
5003 ++/**
5004 ++ * bfq_active_extract - remove an entity from the active tree.
5005 ++ * @st: the service_tree containing the tree.
5006 ++ * @entity: the entity being removed.
5007 ++ */
5008 ++static void bfq_active_extract(struct bfq_service_tree *st,
5009 ++ struct bfq_entity *entity)
5010 ++{
5011 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5012 ++ struct rb_node *node;
5013 ++
5014 ++ node = bfq_find_deepest(&entity->rb_node);
5015 ++ bfq_extract(&st->active, entity);
5016 ++
5017 ++ if (node != NULL)
5018 ++ bfq_update_active_tree(node);
5019 ++
5020 ++ if (bfqq != NULL)
5021 ++ list_del(&bfqq->bfqq_list);
5022 ++}
5023 ++
5024 ++/**
5025 ++ * bfq_idle_insert - insert an entity into the idle tree.
5026 ++ * @st: the service tree containing the tree.
5027 ++ * @entity: the entity to insert.
5028 ++ */
5029 ++static void bfq_idle_insert(struct bfq_service_tree *st,
5030 ++ struct bfq_entity *entity)
5031 ++{
5032 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5033 ++ struct bfq_entity *first_idle = st->first_idle;
5034 ++ struct bfq_entity *last_idle = st->last_idle;
5035 ++
5036 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
5037 ++ st->first_idle = entity;
5038 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
5039 ++ st->last_idle = entity;
5040 ++
5041 ++ bfq_insert(&st->idle, entity);
5042 ++
5043 ++ if (bfqq != NULL)
5044 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
5045 ++}
5046 ++
5047 ++/**
5048 ++ * bfq_forget_entity - remove an entity from the wfq trees.
5049 ++ * @st: the service tree.
5050 ++ * @entity: the entity being removed.
5051 ++ *
5052 ++ * Update the device status and forget everything about @entity, putting
5053 ++ * the device reference to it, if it is a queue. Entities belonging to
5054 ++ * groups are not refcounted.
5055 ++ */
5056 ++static void bfq_forget_entity(struct bfq_service_tree *st,
5057 ++ struct bfq_entity *entity)
5058 ++{
5059 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5060 ++ struct bfq_sched_data *sd;
5061 ++
5062 ++ BUG_ON(!entity->on_st);
5063 ++
5064 ++ entity->on_st = 0;
5065 ++ st->wsum -= entity->weight;
5066 ++ if (bfqq != NULL) {
5067 ++ sd = entity->sched_data;
5068 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
5069 ++ bfqq, atomic_read(&bfqq->ref));
5070 ++ bfq_put_queue(bfqq);
5071 ++ }
5072 ++}
5073 ++
5074 ++/**
5075 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
5076 ++ * @st: service tree for the entity.
5077 ++ * @entity: the entity being released.
5078 ++ */
5079 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
5080 ++ struct bfq_entity *entity)
5081 ++{
5082 ++ bfq_idle_extract(st, entity);
5083 ++ bfq_forget_entity(st, entity);
5084 ++}
5085 ++
5086 ++/**
5087 ++ * bfq_forget_idle - update the idle tree if necessary.
5088 ++ * @st: the service tree to act upon.
5089 ++ *
5090 ++ * To preserve the global O(log N) complexity we only remove one entry here;
5091 ++ * as the idle tree will not grow indefinitely this can be done safely.
5092 ++ */
5093 ++static void bfq_forget_idle(struct bfq_service_tree *st)
5094 ++{
5095 ++ struct bfq_entity *first_idle = st->first_idle;
5096 ++ struct bfq_entity *last_idle = st->last_idle;
5097 ++
5098 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
5099 ++ !bfq_gt(last_idle->finish, st->vtime)) {
5100 ++ /*
5101 ++ * Forget the whole idle tree, increasing the vtime past
5102 ++ * the last finish time of idle entities.
5103 ++ */
5104 ++ st->vtime = last_idle->finish;
5105 ++ }
5106 ++
5107 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
5108 ++ bfq_put_idle_entity(st, first_idle);
5109 ++}
5110 ++
5111 ++static struct bfq_service_tree *
5112 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
5113 ++ struct bfq_entity *entity)
5114 ++{
5115 ++ struct bfq_service_tree *new_st = old_st;
5116 ++
5117 ++ if (entity->ioprio_changed) {
5118 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5119 ++
5120 ++ BUG_ON(old_st->wsum < entity->weight);
5121 ++ old_st->wsum -= entity->weight;
5122 ++
5123 ++ if (entity->new_weight != entity->orig_weight) {
5124 ++ entity->orig_weight = entity->new_weight;
5125 ++ entity->ioprio =
5126 ++ bfq_weight_to_ioprio(entity->orig_weight);
5127 ++ } else if (entity->new_ioprio != entity->ioprio) {
5128 ++ entity->ioprio = entity->new_ioprio;
5129 ++ entity->orig_weight =
5130 ++ bfq_ioprio_to_weight(entity->ioprio);
5131 ++ } else
5132 ++ entity->new_weight = entity->orig_weight =
5133 ++ bfq_ioprio_to_weight(entity->ioprio);
5134 ++
5135 ++ entity->ioprio_class = entity->new_ioprio_class;
5136 ++ entity->ioprio_changed = 0;
5137 ++
5138 ++ /*
5139 ++ * NOTE: here we may be changing the weight too early;
5140 ++ * this will cause unfairness. The correct approach
5141 ++ * would have required additional complexity to defer
5142 ++ * weight changes to the proper time instants (i.e.,
5143 ++ * when entity->finish <= old_st->vtime).
5144 ++ */
5145 ++ new_st = bfq_entity_service_tree(entity);
5146 ++ entity->weight = entity->orig_weight *
5147 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
5148 ++ new_st->wsum += entity->weight;
5149 ++
5150 ++ if (new_st != old_st)
5151 ++ entity->start = new_st->vtime;
5152 ++ }
5153 ++
5154 ++ return new_st;
5155 ++}
5156 ++
5157 ++/**
5158 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
5159 ++ * @bfqq: the queue being served.
5160 ++ * @served: bytes to transfer.
5161 ++ *
5162 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
5163 ++ * are synchronized every time a new bfqq is selected for service. For now,
5164 ++ * we keep it to better check consistency.
5165 ++ */
5166 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
5167 ++{
5168 ++ struct bfq_entity *entity = &bfqq->entity;
5169 ++ struct bfq_service_tree *st;
5170 ++
5171 ++ for_each_entity(entity) {
5172 ++ st = bfq_entity_service_tree(entity);
5173 ++
5174 ++ entity->service += served;
5175 ++ BUG_ON(entity->service > entity->budget);
5176 ++ BUG_ON(st->wsum == 0);
5177 ++
5178 ++ st->vtime += bfq_delta(served, st->wsum);
5179 ++ bfq_forget_idle(st);
5180 ++ }
5181 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
5182 ++}
5183 ++
5184 ++/**
5185 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
5186 ++ * @bfqq: the queue that needs a service update.
5187 ++ *
5188 ++ * When it's not possible to be fair in the service domain, because
5189 ++ * a queue is not consuming its budget fast enough (the meaning of
5190 ++ * fast depends on the timeout parameter), we charge it a full
5191 ++ * budget. In this way we should obtain a sort of time-domain
5192 ++ * fairness among all the seeky/slow queues.
5193 ++ */
5194 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5195 ++{
5196 ++ struct bfq_entity *entity = &bfqq->entity;
5197 ++
5198 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5199 ++
5200 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5201 ++}
5202 ++
5203 ++/**
5204 ++ * __bfq_activate_entity - activate an entity.
5205 ++ * @entity: the entity being activated.
5206 ++ *
5207 ++ * Called whenever an entity is activated, i.e., it is not active and one
5208 ++ * of its children receives a new request, or has to be reactivated due to
5209 ++ * budget exhaustion. It uses the current budget of the entity (and the
5210 ++ * service received, if @entity is in service) to calculate its
5211 ++ * timestamps.
5212 ++ */
5213 ++static void __bfq_activate_entity(struct bfq_entity *entity)
5214 ++{
5215 ++ struct bfq_sched_data *sd = entity->sched_data;
5216 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5217 ++
5218 ++ if (entity == sd->active_entity) {
5219 ++ BUG_ON(entity->tree != NULL);
5220 ++ /*
5221 ++ * If we are requeueing the current entity we have
5222 ++ * to take care not to charge it for service it
5223 ++ * has not received.
5224 ++ */
5225 ++ bfq_calc_finish(entity, entity->service);
5226 ++ entity->start = entity->finish;
5227 ++ sd->active_entity = NULL;
5228 ++ } else if (entity->tree == &st->active) {
5229 ++ /*
5230 ++ * Requeueing an entity due to a change of some
5231 ++ * next_active entity below it. We reuse the old
5232 ++ * start time.
5233 ++ */
5234 ++ bfq_active_extract(st, entity);
5235 ++ } else if (entity->tree == &st->idle) {
5236 ++ /*
5237 ++ * Must be on the idle tree, bfq_idle_extract() will
5238 ++ * check for that.
5239 ++ */
5240 ++ bfq_idle_extract(st, entity);
5241 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
5242 ++ st->vtime : entity->finish;
5243 ++ } else {
5244 ++ /*
5245 ++ * The finish time of the entity may be invalid, and
5246 ++ * it is in the past for sure, otherwise the queue
5247 ++ * would have been on the idle tree.
5248 ++ */
5249 ++ entity->start = st->vtime;
5250 ++ st->wsum += entity->weight;
5251 ++ bfq_get_entity(entity);
5252 ++
5253 ++ BUG_ON(entity->on_st);
5254 ++ entity->on_st = 1;
5255 ++ }
5256 ++
5257 ++ st = __bfq_entity_update_weight_prio(st, entity);
5258 ++ bfq_calc_finish(entity, entity->budget);
5259 ++ bfq_active_insert(st, entity);
5260 ++}
5261 ++
5262 ++/**
5263 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5264 ++ * @entity: the entity to activate.
5265 ++ *
5266 ++ * Activate @entity and all the entities on the path from it to the root.
5267 ++ */
5268 ++static void bfq_activate_entity(struct bfq_entity *entity)
5269 ++{
5270 ++ struct bfq_sched_data *sd;
5271 ++
5272 ++ for_each_entity(entity) {
5273 ++ __bfq_activate_entity(entity);
5274 ++
5275 ++ sd = entity->sched_data;
5276 ++ if (!bfq_update_next_active(sd))
5277 ++ /*
5278 ++ * No need to propagate the activation to the
5279 ++ * upper entities, as they will be updated when
5280 ++ * the active entity is rescheduled.
5281 ++ */
5282 ++ break;
5283 ++ }
5284 ++}
5285 ++
5286 ++/**
5287 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5288 ++ * @entity: the entity to deactivate.
5289 ++ * @requeue: if false, the entity will not be put into the idle tree.
5290 ++ *
5291 ++ * Deactivate an entity, independently from its previous state. If the
5292 ++ * entity was not on a service tree just return, otherwise if it is on
5293 ++ * any scheduler tree, extract it from that tree, and if necessary,
5294 ++ * i.e., if the caller specified @requeue, put it on the idle tree.
5295 ++ *
5296 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
5297 ++ * if the entity was under service or if it was the next_active for
5298 ++ * its sched_data; return %0 otherwise.
5299 ++ */
5300 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5301 ++{
5302 ++ struct bfq_sched_data *sd = entity->sched_data;
5303 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5304 ++ int was_active = entity == sd->active_entity;
5305 ++ int ret = 0;
5306 ++
5307 ++ if (!entity->on_st)
5308 ++ return 0;
5309 ++
5310 ++ BUG_ON(was_active && entity->tree != NULL);
5311 ++
5312 ++ if (was_active) {
5313 ++ bfq_calc_finish(entity, entity->service);
5314 ++ sd->active_entity = NULL;
5315 ++ } else if (entity->tree == &st->active)
5316 ++ bfq_active_extract(st, entity);
5317 ++ else if (entity->tree == &st->idle)
5318 ++ bfq_idle_extract(st, entity);
5319 ++ else if (entity->tree != NULL)
5320 ++ BUG();
5321 ++
5322 ++ if (was_active || sd->next_active == entity)
5323 ++ ret = bfq_update_next_active(sd);
5324 ++
5325 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5326 ++ bfq_forget_entity(st, entity);
5327 ++ else
5328 ++ bfq_idle_insert(st, entity);
5329 ++
5330 ++ BUG_ON(sd->active_entity == entity);
5331 ++ BUG_ON(sd->next_active == entity);
5332 ++
5333 ++ return ret;
5334 ++}
5335 ++
5336 ++/**
5337 ++ * bfq_deactivate_entity - deactivate an entity.
5338 ++ * @entity: the entity to deactivate.
5339 ++ * @requeue: true if the entity can be put on the idle tree
5340 ++ */
5341 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5342 ++{
5343 ++ struct bfq_sched_data *sd;
5344 ++ struct bfq_entity *parent;
5345 ++
5346 ++ for_each_entity_safe(entity, parent) {
5347 ++ sd = entity->sched_data;
5348 ++
5349 ++ if (!__bfq_deactivate_entity(entity, requeue))
5350 ++ /*
5351 ++ * The parent entity is still backlogged, and
5352 ++ * we don't need to update it as it is still
5353 ++ * under service.
5354 ++ */
5355 ++ break;
5356 ++
5357 ++ if (sd->next_active != NULL)
5358 ++ /*
5359 ++ * The parent entity is still backlogged and
5360 ++ * the budgets on the path towards the root
5361 ++ * need to be updated.
5362 ++ */
5363 ++ goto update;
5364 ++
5365 ++ /*
5366 ++ * If we get here, the parent is no longer backlogged and
5367 ++ * we want to propagate the dequeue upwards.
5368 ++ */
5369 ++ requeue = 1;
5370 ++ }
5371 ++
5372 ++ return;
5373 ++
5374 ++update:
5375 ++ entity = parent;
5376 ++ for_each_entity(entity) {
5377 ++ __bfq_activate_entity(entity);
5378 ++
5379 ++ sd = entity->sched_data;
5380 ++ if (!bfq_update_next_active(sd))
5381 ++ break;
5382 ++ }
5383 ++}
5384 ++
5385 ++/**
5386 ++ * bfq_update_vtime - update vtime if necessary.
5387 ++ * @st: the service tree to act upon.
5388 ++ *
5389 ++ * If necessary update the service tree vtime to have at least one
5390 ++ * eligible entity, skipping to its start time. Assumes that the
5391 ++ * active tree of the device is not empty.
5392 ++ *
5393 ++ * NOTE: this hierarchical implementation updates vtimes quite often, so
5394 ++ * we may end up with reactivated tasks getting timestamps after a
5395 ++ * vtime skip done because we needed a ->first_active entity on some
5396 ++ * intermediate node.
5397 ++ */
5398 ++static void bfq_update_vtime(struct bfq_service_tree *st)
5399 ++{
5400 ++ struct bfq_entity *entry;
5401 ++ struct rb_node *node = st->active.rb_node;
5402 ++
5403 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
5404 ++ if (bfq_gt(entry->min_start, st->vtime)) {
5405 ++ st->vtime = entry->min_start;
5406 ++ bfq_forget_idle(st);
5407 ++ }
5408 ++}
5409 ++
5410 ++/**
5411 ++ * bfq_first_active_entity - find the eligible entity with the smallest finish time.
5412 ++ * @st: the service tree to select from.
5413 ++ *
5414 ++ * This function searches for the first schedulable entity, starting from
5415 ++ * the root of the tree and going left whenever the left subtree contains
5416 ++ * at least one eligible (start <= vtime) entity. The path on the right is
5417 ++ * followed only if a) the left subtree contains no eligible entities and
5418 ++ * b) no eligible entity has been found yet.
5419 ++ */
5420 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5421 ++{
5422 ++ struct bfq_entity *entry, *first = NULL;
5423 ++ struct rb_node *node = st->active.rb_node;
5424 ++
5425 ++ while (node != NULL) {
5426 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
5427 ++left:
5428 ++ if (!bfq_gt(entry->start, st->vtime))
5429 ++ first = entry;
5430 ++
5431 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5432 ++
5433 ++ if (node->rb_left != NULL) {
5434 ++ entry = rb_entry(node->rb_left,
5435 ++ struct bfq_entity, rb_node);
5436 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
5437 ++ node = node->rb_left;
5438 ++ goto left;
5439 ++ }
5440 ++ }
5441 ++ if (first != NULL)
5442 ++ break;
5443 ++ node = node->rb_right;
5444 ++ }
5445 ++
5446 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5447 ++ return first;
5448 ++}
5449 ++
5450 ++/**
5451 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5452 ++ * @st: the service tree.
5453 ++ *
5454 ++ * Update the virtual time in @st and return the first eligible entity
5455 ++ * it contains.
5456 ++ */
5457 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5458 ++ bool force)
5459 ++{
5460 ++ struct bfq_entity *entity, *new_next_active = NULL;
5461 ++
5462 ++ if (RB_EMPTY_ROOT(&st->active))
5463 ++ return NULL;
5464 ++
5465 ++ bfq_update_vtime(st);
5466 ++ entity = bfq_first_active_entity(st);
5467 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
5468 ++
5469 ++ /*
5470 ++ * If the chosen entity does not match with the sched_data's
5471 ++ * next_active and we are forcedly serving the IDLE priority
5472 ++ * class tree, bubble up budget update.
5473 ++ */
5474 ++ if (unlikely(force && entity != entity->sched_data->next_active)) {
5475 ++ new_next_active = entity;
5476 ++ for_each_entity(new_next_active)
5477 ++ bfq_update_budget(new_next_active);
5478 ++ }
5479 ++
5480 ++ return entity;
5481 ++}
5482 ++
5483 ++/**
5484 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5485 ++ * @sd: the sched_data.
5486 ++ * @extract: if true the returned entity will be also extracted from @sd.
5487 ++ *
5488 ++ * NOTE: since we cache the next_active entity at each level of the
5489 ++ * hierarchy, the complexity of the lookup can be decreased with
5490 ++ * absolutely no effort just returning the cached next_active value;
5491 ++ * we prefer to do full lookups to test the consistency of the data
5492 ++ * structures.
5493 ++ */
5494 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5495 ++ int extract,
5496 ++ struct bfq_data *bfqd)
5497 ++{
5498 ++ struct bfq_service_tree *st = sd->service_tree;
5499 ++ struct bfq_entity *entity;
5500 ++ int i = 0;
5501 ++
5502 ++ BUG_ON(sd->active_entity != NULL);
5503 ++
5504 ++ if (bfqd != NULL &&
5505 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5506 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
5507 ++ true);
5508 ++ if (entity != NULL) {
5509 ++ i = BFQ_IOPRIO_CLASSES - 1;
5510 ++ bfqd->bfq_class_idle_last_service = jiffies;
5511 ++ sd->next_active = entity;
5512 ++ }
5513 ++ }
5514 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5515 ++ entity = __bfq_lookup_next_entity(st + i, false);
5516 ++ if (entity != NULL) {
5517 ++ if (extract) {
5518 ++ bfq_check_next_active(sd, entity);
5519 ++ bfq_active_extract(st + i, entity);
5520 ++ sd->active_entity = entity;
5521 ++ sd->next_active = NULL;
5522 ++ }
5523 ++ break;
5524 ++ }
5525 ++ }
5526 ++
5527 ++ return entity;
5528 ++}
5529 ++
5530 ++/*
5531 ++ * Get next queue for service.
5532 ++ */
5533 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5534 ++{
5535 ++ struct bfq_entity *entity = NULL;
5536 ++ struct bfq_sched_data *sd;
5537 ++ struct bfq_queue *bfqq;
5538 ++
5539 ++ BUG_ON(bfqd->in_service_queue != NULL);
5540 ++
5541 ++ if (bfqd->busy_queues == 0)
5542 ++ return NULL;
5543 ++
5544 ++ sd = &bfqd->root_group->sched_data;
5545 ++ for (; sd != NULL; sd = entity->my_sched_data) {
5546 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5547 ++ BUG_ON(entity == NULL);
5548 ++ entity->service = 0;
5549 ++ }
5550 ++
5551 ++ bfqq = bfq_entity_to_bfqq(entity);
5552 ++ BUG_ON(bfqq == NULL);
5553 ++
5554 ++ return bfqq;
5555 ++}
5556 ++
5557 ++/*
5558 ++ * Forced extraction of the given queue.
5559 ++ */
5560 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5561 ++ struct bfq_queue *bfqq)
5562 ++{
5563 ++ struct bfq_entity *entity;
5564 ++ struct bfq_sched_data *sd;
5565 ++
5566 ++ BUG_ON(bfqd->in_service_queue != NULL);
5567 ++
5568 ++ entity = &bfqq->entity;
5569 ++ /*
5570 ++ * Bubble up extraction/update from the leaf to the root.
5571 ++ */
5572 ++ for_each_entity(entity) {
5573 ++ sd = entity->sched_data;
5574 ++ bfq_update_budget(entity);
5575 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
5576 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5577 ++ sd->active_entity = entity;
5578 ++ sd->next_active = NULL;
5579 ++ entity->service = 0;
5580 ++ }
5581 ++
5582 ++ return;
5583 ++}
5584 ++
5585 ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
5586 ++{
5587 ++ if (bfqd->in_service_bic != NULL) {
5588 ++ put_io_context(bfqd->in_service_bic->icq.ioc);
5589 ++ bfqd->in_service_bic = NULL;
5590 ++ }
5591 ++
5592 ++ bfqd->in_service_queue = NULL;
5593 ++ del_timer(&bfqd->idle_slice_timer);
5594 ++}
5595 ++
5596 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5597 ++ int requeue)
5598 ++{
5599 ++ struct bfq_entity *entity = &bfqq->entity;
5600 ++
5601 ++ if (bfqq == bfqd->in_service_queue)
5602 ++ __bfq_bfqd_reset_in_service(bfqd);
5603 ++
5604 ++ bfq_deactivate_entity(entity, requeue);
5605 ++}
5606 ++
5607 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5608 ++{
5609 ++ struct bfq_entity *entity = &bfqq->entity;
5610 ++
5611 ++ bfq_activate_entity(entity);
5612 ++}
5613 ++
5614 ++/*
5615 ++ * Called when the bfqq no longer has requests pending, remove it from
5616 ++ * the service tree.
5617 ++ */
5618 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5619 ++ int requeue)
5620 ++{
5621 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
5622 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5623 ++
5624 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5625 ++
5626 ++ bfq_clear_bfqq_busy(bfqq);
5627 ++
5628 ++ BUG_ON(bfqd->busy_queues == 0);
5629 ++ bfqd->busy_queues--;
5630 ++ if (bfqq->raising_coeff > 1)
5631 ++ bfqd->raised_busy_queues--;
5632 ++
5633 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5634 ++}
5635 ++
5636 ++/*
5637 ++ * Called when an inactive queue receives a new request.
5638 ++ */
5639 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5640 ++{
5641 ++ BUG_ON(bfq_bfqq_busy(bfqq));
5642 ++ BUG_ON(bfqq == bfqd->in_service_queue);
5643 ++
5644 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5645 ++
5646 ++ bfq_activate_bfqq(bfqd, bfqq);
5647 ++
5648 ++ bfq_mark_bfqq_busy(bfqq);
5649 ++ bfqd->busy_queues++;
5650 ++ if (bfqq->raising_coeff > 1)
5651 ++ bfqd->raised_busy_queues++;
5652 ++}
5653 +diff --git a/block/bfq.h b/block/bfq.h
5654 +new file mode 100644
5655 +index 0000000..68b28e3
5656 +--- /dev/null
5657 ++++ b/block/bfq.h
5658 +@@ -0,0 +1,614 @@
5659 ++/*
5660 ++ * BFQ-v7 for 3.13.0: data structures and common functions prototypes.
5661 ++ *
5662 ++ * Based on ideas and code from CFQ:
5663 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5664 ++ *
5665 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5666 ++ * Paolo Valente <paolo.valente@×××××××.it>
5667 ++ *
5668 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5669 ++ */
5670 ++
5671 ++#ifndef _BFQ_H
5672 ++#define _BFQ_H
5673 ++
5674 ++#include <linux/blktrace_api.h>
5675 ++#include <linux/hrtimer.h>
5676 ++#include <linux/ioprio.h>
5677 ++#include <linux/rbtree.h>
5678 ++
5679 ++#define BFQ_IOPRIO_CLASSES 3
5680 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5681 ++
5682 ++#define BFQ_MIN_WEIGHT 1
5683 ++#define BFQ_MAX_WEIGHT 1000
5684 ++
5685 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
5686 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
5687 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5688 ++
5689 ++struct bfq_entity;
5690 ++
5691 ++/**
5692 ++ * struct bfq_service_tree - per ioprio_class service tree.
5693 ++ * @active: tree for active entities (i.e., those backlogged).
5694 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5695 ++ * @first_idle: idle entity with minimum F_i.
5696 ++ * @last_idle: idle entity with maximum F_i.
5697 ++ * @vtime: scheduler virtual time.
5698 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5699 ++ *
5700 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5701 ++ * ioprio_class has its own independent scheduler, and so its own
5702 ++ * bfq_service_tree. All the fields are protected by the queue lock
5703 ++ * of the containing bfqd.
5704 ++ */
5705 ++struct bfq_service_tree {
5706 ++ struct rb_root active;
5707 ++ struct rb_root idle;
5708 ++
5709 ++ struct bfq_entity *first_idle;
5710 ++ struct bfq_entity *last_idle;
5711 ++
5712 ++ u64 vtime;
5713 ++ unsigned long wsum;
5714 ++};
5715 ++
5716 ++/**
5717 ++ * struct bfq_sched_data - multi-class scheduler.
5718 ++ * @active_entity: entity under service.
5719 ++ * @next_active: head-of-the-line entity in the scheduler.
5720 ++ * @service_tree: array of service trees, one per ioprio_class.
5721 ++ *
5722 ++ * bfq_sched_data is the basic scheduler queue. It supports three
5723 ++ * ioprio_classes, and can be used either as a toplevel queue or as
5724 ++ * an intermediate queue on a hierarchical setup.
5725 ++ * @next_active points to the active entity of the sched_data service
5726 ++ * trees that will be scheduled next.
5727 ++ *
5728 ++ * The supported ioprio_classes are the same as in CFQ, in descending
5729 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5730 ++ * Requests from higher priority queues are served before all the
5731 ++ * requests from lower priority queues; among queues of the same
5732 ++ * class, requests are served according to B-WF2Q+.
5733 ++ * All the fields are protected by the queue lock of the containing bfqd.
5734 ++ */
5735 ++struct bfq_sched_data {
5736 ++ struct bfq_entity *active_entity;
5737 ++ struct bfq_entity *next_active;
5738 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5739 ++};
5740 ++
5741 ++/**
5742 ++ * struct bfq_entity - schedulable entity.
5743 ++ * @rb_node: service_tree member.
5744 ++ * @on_st: flag, true if the entity is on a tree (either the active or
5745 ++ * the idle one of its service_tree).
5746 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5747 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
5748 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5749 ++ * @min_start: minimum start time of the (active) subtree rooted at
5750 ++ * this entity; used for O(log N) lookups into active trees.
5751 ++ * @service: service received during the last round of service.
5752 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5753 ++ * @weight: weight of the queue
5754 ++ * @parent: parent entity, for hierarchical scheduling.
5755 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5756 ++ * associated scheduler queue, %NULL on leaf nodes.
5757 ++ * @sched_data: the scheduler queue this entity belongs to.
5758 ++ * @ioprio: the ioprio in use.
5759 ++ * @new_weight: when a weight change is requested, the new weight value.
5760 ++ * @orig_weight: original weight, used to implement weight boosting
5761 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5762 ++ * @ioprio_class: the ioprio_class in use.
5763 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
5764 ++ * ioprio_class value.
5765 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5766 ++ * ioprio_class change.
5767 ++ *
5768 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5769 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5770 ++ * entity belongs to the sched_data of the parent group in the cgroup
5771 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
5772 ++ * in @my_sched_data.
5773 ++ *
5774 ++ * Each entity stores independently its priority values; this would
5775 ++ * allow different weights on different devices, but this
5776 ++ * functionality is not exported to userspace for now. Priorities and
5777 ++ * weights are updated lazily, first storing the new values into the
5778 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
5779 ++ * there is a transition in the entity state that allows the priority
5780 ++ * update to take place the effective and the requested priority
5781 ++ * values are synchronized.
5782 ++ *
5783 ++ * Unless cgroups are used, the weight value is calculated from the
5784 ++ * ioprio to export the same interface as CFQ. When dealing with
5785 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
5786 ++ * time to consume their budget and have true sequential behavior, and
5787 ++ * when there are no external factors breaking anticipation) the
5788 ++ * relative weights at each level of the cgroups hierarchy should be
5789 ++ * guaranteed. All the fields are protected by the queue lock of the
5790 ++ * containing bfqd.
5791 ++ */
5792 ++struct bfq_entity {
5793 ++ struct rb_node rb_node;
5794 ++
5795 ++ int on_st;
5796 ++
5797 ++ u64 finish;
5798 ++ u64 start;
5799 ++
5800 ++ struct rb_root *tree;
5801 ++
5802 ++ u64 min_start;
5803 ++
5804 ++ unsigned long service, budget;
5805 ++ unsigned short weight, new_weight;
5806 ++ unsigned short orig_weight;
5807 ++
5808 ++ struct bfq_entity *parent;
5809 ++
5810 ++ struct bfq_sched_data *my_sched_data;
5811 ++ struct bfq_sched_data *sched_data;
5812 ++
5813 ++ unsigned short ioprio, new_ioprio;
5814 ++ unsigned short ioprio_class, new_ioprio_class;
5815 ++
5816 ++ int ioprio_changed;
5817 ++};
5818 ++
5819 ++struct bfq_group;
5820 ++
5821 ++/**
5822 ++ * struct bfq_queue - leaf schedulable entity.
5823 ++ * @ref: reference counter.
5824 ++ * @bfqd: parent bfq_data.
5825 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
5826 ++ * one or more other queues.
5827 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5828 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5829 ++ * @sort_list: sorted list of pending requests.
5830 ++ * @next_rq: if fifo isn't expired, next request to serve.
5831 ++ * @queued: nr of requests queued in @sort_list.
5832 ++ * @allocated: currently allocated requests.
5833 ++ * @meta_pending: pending metadata requests.
5834 ++ * @fifo: fifo list of requests in sort_list.
5835 ++ * @entity: entity representing this queue in the scheduler.
5836 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
5837 ++ * @budget_timeout: budget expiration (in jiffies).
5838 ++ * @dispatched: number of requests on the dispatch list or inside driver.
5839 ++ * @org_ioprio: saved ioprio during boosted periods.
5840 ++ * @flags: status flags.
5841 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5842 ++ * @seek_samples: number of seeks sampled
5843 ++ * @seek_total: sum of the distances of the seeks sampled
5844 ++ * @seek_mean: mean seek distance
5845 ++ * @last_request_pos: position of the last request enqueued
5846 ++ * @pid: pid of the process owning the queue, used for logging purposes.
5847 ++ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
5848 ++ * @raising_cur_max_time: current max raising time for this queue
5849 ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
5850 ++ * idle to backlogged
5851 ++ * @service_from_backlogged: cumulative service received from the @bfq_queue
5852 ++ * since the last transition from idle to backlogged
5853 ++ *
5854 ++ * A bfq_queue is a leaf request queue; it can be associated with one io_context
5855 ++ * or more (if it is an async one). @cgroup holds a reference to the
5856 ++ * cgroup, to be sure that it does not disappear while a bfqq still
5857 ++ * references it (mostly to avoid races between request issuing and task
5858 ++ * migration followed by cgroup destruction).
5859 ++ * All the fields are protected by the queue lock of the containing bfqd.
5860 ++ */
5861 ++struct bfq_queue {
5862 ++ atomic_t ref;
5863 ++ struct bfq_data *bfqd;
5864 ++
5865 ++ /* fields for cooperating queues handling */
5866 ++ struct bfq_queue *new_bfqq;
5867 ++ struct rb_node pos_node;
5868 ++ struct rb_root *pos_root;
5869 ++
5870 ++ struct rb_root sort_list;
5871 ++ struct request *next_rq;
5872 ++ int queued[2];
5873 ++ int allocated[2];
5874 ++ int meta_pending;
5875 ++ struct list_head fifo;
5876 ++
5877 ++ struct bfq_entity entity;
5878 ++
5879 ++ unsigned long max_budget;
5880 ++ unsigned long budget_timeout;
5881 ++
5882 ++ int dispatched;
5883 ++
5884 ++ unsigned short org_ioprio;
5885 ++
5886 ++ unsigned int flags;
5887 ++
5888 ++ struct list_head bfqq_list;
5889 ++
5890 ++ unsigned int seek_samples;
5891 ++ u64 seek_total;
5892 ++ sector_t seek_mean;
5893 ++ sector_t last_request_pos;
5894 ++
5895 ++ pid_t pid;
5896 ++
5897 ++ /* weight-raising fields */
5898 ++ unsigned int raising_cur_max_time;
5899 ++ unsigned long soft_rt_next_start;
5900 ++ u64 last_rais_start_finish;
5901 ++ unsigned int raising_coeff;
5902 ++ u64 last_idle_bklogged;
5903 ++ unsigned long service_from_backlogged;
5904 ++};
5905 ++
5906 ++/**
5907 ++ * struct bfq_ttime - per process thinktime stats.
5908 ++ * @ttime_total: total process thinktime
5909 ++ * @ttime_samples: number of thinktime samples
5910 ++ * @ttime_mean: average process thinktime
5911 ++ */
5912 ++struct bfq_ttime {
5913 ++ unsigned long last_end_request;
5914 ++
5915 ++ unsigned long ttime_total;
5916 ++ unsigned long ttime_samples;
5917 ++ unsigned long ttime_mean;
5918 ++};
5919 ++
5920 ++/**
5921 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
5922 ++ * @icq: associated io_cq structure
5923 ++ * @bfqq: array of two process queues, the sync and the async
5924 ++ * @ttime: associated @bfq_ttime struct
5925 ++ */
5926 ++struct bfq_io_cq {
5927 ++ struct io_cq icq; /* must be the first member */
5928 ++ struct bfq_queue *bfqq[2];
5929 ++ struct bfq_ttime ttime;
5930 ++ int ioprio;
5931 ++};
5932 ++
5933 ++/**
5934 ++ * struct bfq_data - per device data structure.
5935 ++ * @queue: request queue for the managed device.
5936 ++ * @root_group: root bfq_group for the device.
5937 ++ * @rq_pos_tree: rbtree sorted by next_request position,
5938 ++ * used when determining if two or more queues
5939 ++ * have interleaving requests (see bfq_close_cooperator).
5940 ++ * @busy_queues: number of bfq_queues containing requests (including the
5941 ++ * queue under service, even if it is idling).
5942 ++ * @raised_busy_queues: number of weight-raised busy bfq_queues.
5943 ++ * @queued: number of queued requests.
5944 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
5945 ++ * @sync_flight: number of sync requests in the driver.
5946 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5947 ++ * completed requests.
5948 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5949 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5950 ++ * @budgets_assigned: number of budgets assigned.
5951 ++ * @idle_slice_timer: timer set when idling for the next sequential request
5952 ++ * from the queue under service.
5953 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
5954 ++ * @in_service_queue: bfq_queue under service.
5955 ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
5956 ++ * @last_position: on-disk position of the last served request.
5957 ++ * @last_budget_start: beginning of the last budget.
5958 ++ * @last_idling_start: beginning of the last idle slice.
5959 ++ * @peak_rate: peak transfer rate observed for a budget.
5960 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5961 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5962 ++ * @group_list: list of all the bfq_groups active on the device.
5963 ++ * @active_list: list of all the bfq_queues active on the device.
5964 ++ * @idle_list: list of all the bfq_queues idle on the device.
5965 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
5966 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5967 ++ * requests are served in fifo order.
5968 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5969 ++ * @bfq_back_max: maximum allowed backward seek.
5970 ++ * @bfq_slice_idle: maximum idling time.
5971 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5972 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5973 ++ * async queues.
5974 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used
5975 ++ * to prevent seeky queues from imposing long latencies on
5976 ++ * well-behaved ones (this also implies that seeky queues cannot
5977 ++ * receive guarantees in the service domain; after a timeout
5978 ++ * they are charged for the whole allocated budget, to try
5979 ++ * to preserve a behavior reasonably fair among them, but
5980 ++ * without service-domain guarantees).
5981 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5982 ++ * queue is multiplied
5983 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5984 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5985 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5986 ++ * may be reactivated for a queue (in jiffies)
5987 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5988 ++ * after which weight-raising may be
5989 ++ * reactivated for an already busy queue
5990 ++ * (in jiffies)
5991 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5992 ++ * in sectors per second
5993 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
5994 ++ * duration of the weight raising automatically
5995 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5996 ++ *
5997 ++ * All the fields are protected by the @queue lock.
5998 ++ */
5999 ++struct bfq_data {
6000 ++ struct request_queue *queue;
6001 ++
6002 ++ struct bfq_group *root_group;
6003 ++
6004 ++ struct rb_root rq_pos_tree;
6005 ++
6006 ++ int busy_queues;
6007 ++ int raised_busy_queues;
6008 ++ int queued;
6009 ++ int rq_in_driver;
6010 ++ int sync_flight;
6011 ++
6012 ++ int max_rq_in_driver;
6013 ++ int hw_tag_samples;
6014 ++ int hw_tag;
6015 ++
6016 ++ int budgets_assigned;
6017 ++
6018 ++ struct timer_list idle_slice_timer;
6019 ++ struct work_struct unplug_work;
6020 ++
6021 ++ struct bfq_queue *in_service_queue;
6022 ++ struct bfq_io_cq *in_service_bic;
6023 ++
6024 ++ sector_t last_position;
6025 ++
6026 ++ ktime_t last_budget_start;
6027 ++ ktime_t last_idling_start;
6028 ++ int peak_rate_samples;
6029 ++ u64 peak_rate;
6030 ++ unsigned long bfq_max_budget;
6031 ++
6032 ++ struct hlist_head group_list;
6033 ++ struct list_head active_list;
6034 ++ struct list_head idle_list;
6035 ++
6036 ++ unsigned int bfq_quantum;
6037 ++ unsigned int bfq_fifo_expire[2];
6038 ++ unsigned int bfq_back_penalty;
6039 ++ unsigned int bfq_back_max;
6040 ++ unsigned int bfq_slice_idle;
6041 ++ u64 bfq_class_idle_last_service;
6042 ++
6043 ++ unsigned int bfq_user_max_budget;
6044 ++ unsigned int bfq_max_budget_async_rq;
6045 ++ unsigned int bfq_timeout[2];
6046 ++
6047 ++ bool low_latency;
6048 ++
6049 ++ /* parameters of the low_latency heuristics */
6050 ++ unsigned int bfq_raising_coeff;
6051 ++ unsigned int bfq_raising_max_time;
6052 ++ unsigned int bfq_raising_rt_max_time;
6053 ++ unsigned int bfq_raising_min_idle_time;
6054 ++ unsigned long bfq_raising_min_inter_arr_async;
6055 ++ unsigned int bfq_raising_max_softrt_rate;
6056 ++ u64 RT_prod;
6057 ++
6058 ++ struct bfq_queue oom_bfqq;
6059 ++};
6060 ++
6061 ++enum bfqq_state_flags {
6062 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
6063 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
6064 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
6065 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
6066 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
6067 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
6068 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
6069 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
6070 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6071 ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
6072 ++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
6073 ++};
6074 ++
6075 ++#define BFQ_BFQQ_FNS(name) \
6076 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
6077 ++{ \
6078 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
6079 ++} \
6080 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
6081 ++{ \
6082 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
6083 ++} \
6084 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
6085 ++{ \
6086 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
6087 ++}
6088 ++
6089 ++BFQ_BFQQ_FNS(busy);
6090 ++BFQ_BFQQ_FNS(wait_request);
6091 ++BFQ_BFQQ_FNS(must_alloc);
6092 ++BFQ_BFQQ_FNS(fifo_expire);
6093 ++BFQ_BFQQ_FNS(idle_window);
6094 ++BFQ_BFQQ_FNS(prio_changed);
6095 ++BFQ_BFQQ_FNS(sync);
6096 ++BFQ_BFQQ_FNS(budget_new);
6097 ++BFQ_BFQQ_FNS(coop);
6098 ++BFQ_BFQQ_FNS(split_coop);
6099 ++BFQ_BFQQ_FNS(softrt_update);
6100 ++#undef BFQ_BFQQ_FNS
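/*
 * For illustration, BFQ_BFQQ_FNS(busy) above expands roughly to:
 *
 *	static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
 *	{
 *		bfqq->flags |= (1 << BFQ_BFQQ_FLAG_busy);
 *	}
 *
 * plus the matching bfq_clear_bfqq_busy() and bfq_bfqq_busy() helpers,
 * which the scheduler uses to set, clear and test the per-queue state
 * bits declared in enum bfqq_state_flags.
 */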
6101 ++
6102 ++/* Logging facilities. */
6103 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
6104 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
6105 ++
6106 ++#define bfq_log(bfqd, fmt, args...) \
6107 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
6108 ++
6109 ++/* Expiration reasons. */
6110 ++enum bfqq_expiration {
6111 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
6112 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
6113 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
6114 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
6115 ++};
6116 ++
6117 ++#ifdef CONFIG_CGROUP_BFQIO
6118 ++/**
6119 ++ * struct bfq_group - per (device, cgroup) data structure.
6120 ++ * @entity: schedulable entity to insert into the parent group sched_data.
6121 ++ * @sched_data: own sched_data, to contain child entities (they may be
6122 ++ * both bfq_queues and bfq_groups).
6123 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
6124 ++ * list of the containing cgroup's bfqio_cgroup.
6125 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
6126 ++ * of the groups active on the same device; used for cleanup.
6127 ++ * @bfqd: the bfq_data for the device this group acts upon.
6128 ++ * @async_bfqq: array of async queues for all the tasks belonging to
6129 ++ * the group, one queue per ioprio value per ioprio_class,
6130 ++ * except for the idle class that has only one queue.
6131 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
6132 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
6133 ++ * to avoid too many special cases during group creation/migration.
6134 ++ *
6135 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
6136 ++ * there is a set of bfq_groups, each one collecting the lower-level
6137 ++ * entities belonging to the group that are acting on the same device.
6138 ++ *
6139 ++ * Locking works as follows:
6140 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
6141 ++ * via RCU from its readers.
6142 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
6143 ++ * from the readers.
6144 ++ * o All the other fields are protected by the @bfqd queue lock.
6145 ++ */
6146 ++struct bfq_group {
6147 ++ struct bfq_entity entity;
6148 ++ struct bfq_sched_data sched_data;
6149 ++
6150 ++ struct hlist_node group_node;
6151 ++ struct hlist_node bfqd_node;
6152 ++
6153 ++ void *bfqd;
6154 ++
6155 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6156 ++ struct bfq_queue *async_idle_bfqq;
6157 ++
6158 ++ struct bfq_entity *my_entity;
6159 ++};
6160 ++
6161 ++/**
6162 ++ * struct bfqio_cgroup - bfq cgroup data structure.
6163 ++ * @css: subsystem state for bfq in the containing cgroup.
6164 ++ * @online: flag marked when the subsystem is inserted.
6165 ++ * @weight: cgroup weight.
6166 ++ * @ioprio: cgroup ioprio.
6167 ++ * @ioprio_class: cgroup ioprio_class.
6168 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
6169 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
6170 ++ *
6171 ++ * @group_data is accessed using RCU, with @lock protecting the updates,
6172 ++ * @ioprio and @ioprio_class are protected by @lock.
6173 ++ */
6174 ++struct bfqio_cgroup {
6175 ++ struct cgroup_subsys_state css;
6176 ++ bool online;
6177 ++
6178 ++ unsigned short weight, ioprio, ioprio_class;
6179 ++
6180 ++ spinlock_t lock;
6181 ++ struct hlist_head group_data;
6182 ++};
6183 ++#else
6184 ++struct bfq_group {
6185 ++ struct bfq_sched_data sched_data;
6186 ++
6187 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6188 ++ struct bfq_queue *async_idle_bfqq;
6189 ++};
6190 ++#endif
6191 ++
6192 ++static inline struct bfq_service_tree *
6193 ++bfq_entity_service_tree(struct bfq_entity *entity)
6194 ++{
6195 ++ struct bfq_sched_data *sched_data = entity->sched_data;
6196 ++ unsigned int idx = entity->ioprio_class - 1;
6197 ++
6198 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
6199 ++ BUG_ON(sched_data == NULL);
6200 ++
6201 ++ return sched_data->service_tree + idx;
6202 ++}
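/*
 * Index sketch (with the standard Linux ioprio classes, where
 * IOPRIO_CLASS_RT, IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE are 1, 2 and 3):
 * RT entities map to service_tree[0], BE entities to service_tree[1] and
 * IDLE entities to service_tree[2], matching BFQ_IOPRIO_CLASSES == 3.
 */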
6203 ++
6204 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
6205 ++ int is_sync)
6206 ++{
6207 ++ return bic->bfqq[!!is_sync];
6208 ++}
6209 ++
6210 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
6211 ++ struct bfq_queue *bfqq, int is_sync)
6212 ++{
6213 ++ bic->bfqq[!!is_sync] = bfqq;
6214 ++}
6215 ++
6216 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
6217 ++{
6218 ++ return bic->icq.q->elevator->elevator_data;
6219 ++}
6220 ++
6221 ++/**
6222 ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
6223 ++ * @ptr: a pointer to a bfqd.
6224 ++ * @flags: storage for the flags to be saved.
6225 ++ *
6226 ++ * This function allows bfqg->bfqd to be protected by the
6227 ++ * queue lock of the bfqd it references; the pointer is dereferenced
6228 ++ * under RCU, so the storage for bfqd is assured to be safe as long
6229 ++ * as the RCU read side critical section does not end. After the
6230 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
6231 ++ * sure that no other writer accessed it. If we raced with a writer,
6232 ++ * the function returns NULL, with the queue unlocked, otherwise it
6233 ++ * returns the dereferenced pointer, with the queue locked.
6234 ++ */
6235 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
6236 ++ unsigned long *flags)
6237 ++{
6238 ++ struct bfq_data *bfqd;
6239 ++
6240 ++ rcu_read_lock();
6241 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
6242 ++
6243 ++ if (bfqd != NULL) {
6244 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
6245 ++ if (*ptr == bfqd)
6246 ++ goto out;
6247 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6248 ++ }
6249 ++
6250 ++ bfqd = NULL;
6251 ++out:
6252 ++ rcu_read_unlock();
6253 ++ return bfqd;
6254 ++}
6255 ++
6256 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6257 ++ unsigned long *flags)
6258 ++{
6259 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6260 ++}
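/*
 * Typical usage sketch (hypothetical caller, for illustration): the two
 * helpers above are meant to bracket any access to fields protected by
 * the queue lock, e.g.:
 *
 *	unsigned long flags;
 *	struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
 *
 *	if (bfqd != NULL) {
 *		... access bfqd under bfqd->queue->queue_lock ...
 *		bfq_put_bfqd_unlock(bfqd, &flags);
 *	}
 *
 * A %NULL return means the lookup raced with a writer, and the caller
 * must simply back off without touching bfqd.
 */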
6261 ++
6262 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6263 ++static void bfq_put_queue(struct bfq_queue *bfqq);
6264 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6265 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6266 ++ struct bfq_group *bfqg, int is_sync,
6267 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
6268 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6269 ++ struct bfq_group *bfqg);
6270 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6271 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6272 ++#endif
6273 +--
6274 +1.8.5.2
6275 +
6276
6277 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
6278 ===================================================================
6279 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2014-01-29 14:41:45 UTC (rev 2660)
6280 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2014-01-30 16:49:47 UTC (rev 2661)
6281 @@ -1,5773 +0,0 @@
6282 -From 009b78bafe1763f71e6bdbb4f536b564a73b7db5 Mon Sep 17 00:00:00 2001
6283 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
6284 -Date: Thu, 9 May 2013 19:10:02 +0200
6285 -Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.11
6286 -
6287 -Add the BFQ-v6r2 I/O scheduler to 3.11.
6288 -The general structure is borrowed from CFQ, as is much of the code. A (bfq_)queue
6289 -is associated to each task doing I/O on a device, and each time a
6290 -scheduling decision has to be made a queue is selected and served until
6291 -it expires.
6292 -
6293 - - Slices are given in the service domain: tasks are assigned
6294 - budgets, measured in number of sectors. Once got the disk, a task
6295 - must however consume its assigned budget within a configurable
6296 - maximum time (by default, the maximum possible value of the
6297 - budgets is automatically computed to comply with this timeout).
6298 - This allows the desired latency vs "throughput boosting" tradeoff
6299 - to be set.
6300 -
6301 - - Budgets are scheduled according to a variant of WF2Q+, implemented
6302 - using an augmented rb-tree to take eligibility into account while
6303 - preserving an O(log N) overall complexity.
6304 -
6305 - - A low-latency tunable is provided; if enabled, both interactive
6306 - and soft real-time applications are guaranteed very low latency.
6307 -
6308 - - Latency guarantees are preserved also in presence of NCQ.
6309 -
6310 - - Also with flash-based devices, a high throughput is achieved while
6311 - still preserving latency guarantees.
6312 -
6313 - - Useful features borrowed from CFQ: cooperating-queues merging (with
6314 - some additional optimizations with respect to the original CFQ version),
6315 - static fallback queue for OOM.
6316 -
6317 - - BFQ supports full hierarchical scheduling, exporting a cgroups
6318 - interface. Each node has a full scheduler, so each group can
6319 - be assigned its own ioprio (mapped to a weight, see next point)
6320 - and an ioprio_class.
6321 -
6322 - - If the cgroups interface is used, weights can be explicitly
6323 - assigned, otherwise ioprio values are mapped to weights using the
6324 - relation weight = IOPRIO_BE_NR - ioprio (illustrated below).
6325 -
6326 - - ioprio classes are served in strict priority order, i.e., lower
6327 - priority queues are not served as long as there are higher
6328 - priority queues. Among queues in the same class the bandwidth is
6329 - distributed in proportion to the weight of each queue. A very
6330 - thin extra bandwidth is however guaranteed to the Idle class, to
6331 - prevent it from starving.
6332 -
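   As a rough illustration of the ioprio-to-weight mapping above (a minimal
   userspace sketch, assuming IOPRIO_BE_NR == 8 as in the mainline headers;
   not taken from the patch):

       #include <stdio.h>

       #define IOPRIO_BE_NR 8  /* assumption: mirrors the kernel definition */

       /* weight = IOPRIO_BE_NR - ioprio, as stated in the list above */
       static int ioprio_to_weight(int ioprio)
       {
               return IOPRIO_BE_NR - ioprio;
       }

       int main(void)
       {
               int prio;

               for (prio = 0; prio < IOPRIO_BE_NR; prio++)
                       printf("ioprio %d -> weight %d\n",
                              prio, ioprio_to_weight(prio));
               return 0;
       }
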
6333 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
6334 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6335 ----
6336 - block/bfq-cgroup.c | 881 +++++++++++++++
6337 - block/bfq-ioc.c | 36 +
6338 - block/bfq-iosched.c | 3082 +++++++++++++++++++++++++++++++++++++++++++++++++++
6339 - block/bfq-sched.c | 1072 ++++++++++++++++++
6340 - block/bfq.h | 603 ++++++++++
6341 - 5 files changed, 5674 insertions(+)
6342 - create mode 100644 block/bfq-cgroup.c
6343 - create mode 100644 block/bfq-ioc.c
6344 - create mode 100644 block/bfq-iosched.c
6345 - create mode 100644 block/bfq-sched.c
6346 - create mode 100644 block/bfq.h
6347 -
6348 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
6349 -new file mode 100644
6350 -index 0000000..bb9b851
6351 ---- /dev/null
6352 -+++ b/block/bfq-cgroup.c
6353 -@@ -0,0 +1,881 @@
6354 -+/*
6355 -+ * BFQ: CGROUPS support.
6356 -+ *
6357 -+ * Based on ideas and code from CFQ:
6358 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6359 -+ *
6360 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6361 -+ * Paolo Valente <paolo.valente@×××××××.it>
6362 -+ *
6363 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6364 -+ *
6365 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
6366 -+ */
6367 -+
6368 -+#ifdef CONFIG_CGROUP_BFQIO
6369 -+
6370 -+static DEFINE_MUTEX(bfqio_mutex);
6371 -+
6372 -+static bool bfqio_is_removed(struct cgroup *cgroup)
6373 -+{
6374 -+ return test_bit(CGRP_DEAD, &cgroup->flags);
6375 -+}
6376 -+
6377 -+static struct bfqio_cgroup bfqio_root_cgroup = {
6378 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
6379 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
6380 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
6381 -+};
6382 -+
6383 -+static inline void bfq_init_entity(struct bfq_entity *entity,
6384 -+ struct bfq_group *bfqg)
6385 -+{
6386 -+ entity->weight = entity->new_weight;
6387 -+ entity->orig_weight = entity->new_weight;
6388 -+ entity->ioprio = entity->new_ioprio;
6389 -+ entity->ioprio_class = entity->new_ioprio_class;
6390 -+ entity->parent = bfqg->my_entity;
6391 -+ entity->sched_data = &bfqg->sched_data;
6392 -+}
6393 -+
6394 -+static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
6395 -+{
6396 -+ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
6397 -+ struct bfqio_cgroup, css);
6398 -+}
6399 -+
6400 -+/*
6401 -+ * Search the bfq_group for bfqd in the hash table (for now only a list)
6402 -+ * of bgrp. Must be called under rcu_read_lock().
6403 -+ */
6404 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
6405 -+ struct bfq_data *bfqd)
6406 -+{
6407 -+ struct bfq_group *bfqg;
6408 -+ void *key;
6409 -+
6410 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
6411 -+ key = rcu_dereference(bfqg->bfqd);
6412 -+ if (key == bfqd)
6413 -+ return bfqg;
6414 -+ }
6415 -+
6416 -+ return NULL;
6417 -+}
6418 -+
6419 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
6420 -+ struct bfq_group *bfqg)
6421 -+{
6422 -+ struct bfq_entity *entity = &bfqg->entity;
6423 -+
6424 -+ /*
6425 -+ * If the weight of the entity has never been set via the sysfs
6426 -+ * interface, then bgrp->weight == 0. In this case we initialize
6427 -+ * the weight from the current ioprio value. Otherwise, the group
6428 -+ * weight, if set, has priority over the ioprio value.
6429 -+ */
6430 -+ if (bgrp->weight == 0) {
6431 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
6432 -+ entity->new_ioprio = bgrp->ioprio;
6433 -+ } else {
6434 -+ entity->new_weight = bgrp->weight;
6435 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
6436 -+ }
6437 -+ entity->orig_weight = entity->weight = entity->new_weight;
6438 -+ entity->ioprio = entity->new_ioprio;
6439 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
6440 -+ entity->my_sched_data = &bfqg->sched_data;
6441 -+}
6442 -+
6443 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
6444 -+ struct bfq_group *parent)
6445 -+{
6446 -+ struct bfq_entity *entity;
6447 -+
6448 -+ BUG_ON(parent == NULL);
6449 -+ BUG_ON(bfqg == NULL);
6450 -+
6451 -+ entity = &bfqg->entity;
6452 -+ entity->parent = parent->my_entity;
6453 -+ entity->sched_data = &parent->sched_data;
6454 -+}
6455 -+
6456 -+/**
6457 -+ * bfq_group_chain_alloc - allocate a chain of groups.
6458 -+ * @bfqd: queue descriptor.
6459 -+ * @cgroup: the leaf cgroup this chain starts from.
6460 -+ *
6461 -+ * Allocate a chain of groups starting from the one belonging to
6462 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
6463 -+ * to the root has already an allocated group on @bfqd.
6464 -+ */
6465 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
6466 -+ struct cgroup *cgroup)
6467 -+{
6468 -+ struct bfqio_cgroup *bgrp;
6469 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
6470 -+
6471 -+ for (; cgroup != NULL; cgroup = cgroup->parent) {
6472 -+ bgrp = cgroup_to_bfqio(cgroup);
6473 -+
6474 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6475 -+ if (bfqg != NULL) {
6476 -+ /*
6477 -+ * All the cgroups in the path from there to the
6478 -+ * root must have a bfq_group for bfqd, so we don't
6479 -+ * need any more allocations.
6480 -+ */
6481 -+ break;
6482 -+ }
6483 -+
6484 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
6485 -+ if (bfqg == NULL)
6486 -+ goto cleanup;
6487 -+
6488 -+ bfq_group_init_entity(bgrp, bfqg);
6489 -+ bfqg->my_entity = &bfqg->entity;
6490 -+
6491 -+ if (leaf == NULL) {
6492 -+ leaf = bfqg;
6493 -+ prev = leaf;
6494 -+ } else {
6495 -+ bfq_group_set_parent(prev, bfqg);
6496 -+ /*
6497 -+ * Build a list of allocated nodes using the bfqd
6498 -+ * field, which is still unused and will be initialized
6499 -+ * only after the node is connected.
6500 -+ */
6501 -+ prev->bfqd = bfqg;
6502 -+ prev = bfqg;
6503 -+ }
6504 -+ }
6505 -+
6506 -+ return leaf;
6507 -+
6508 -+cleanup:
6509 -+ while (leaf != NULL) {
6510 -+ prev = leaf;
6511 -+ leaf = leaf->bfqd;
6512 -+ kfree(prev);
6513 -+ }
6514 -+
6515 -+ return NULL;
6516 -+}
6517 -+
6518 -+/**
6519 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
6520 -+ * @bfqd: the queue descriptor.
6521 -+ * @cgroup: the leaf cgroup to start from.
6522 -+ * @leaf: the leaf group (to be associated to @cgroup).
6523 -+ *
6524 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
6525 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
6526 -+ * hierarchy that already has a group associated to @bfqd, all the nodes
6527 -+ * in the path to the root cgroup have one too.
6528 -+ *
6529 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
6530 -+ * per device) while the bfqio_cgroup lock protects the list of groups
6531 -+ * belonging to the same cgroup.
6532 -+ */
6533 -+static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
6534 -+ struct bfq_group *leaf)
6535 -+{
6536 -+ struct bfqio_cgroup *bgrp;
6537 -+ struct bfq_group *bfqg, *next, *prev = NULL;
6538 -+ unsigned long flags;
6539 -+
6540 -+ assert_spin_locked(bfqd->queue->queue_lock);
6541 -+
6542 -+ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
6543 -+ bgrp = cgroup_to_bfqio(cgroup);
6544 -+ next = leaf->bfqd;
6545 -+
6546 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6547 -+ BUG_ON(bfqg != NULL);
6548 -+
6549 -+ spin_lock_irqsave(&bgrp->lock, flags);
6550 -+
6551 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
6552 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
6553 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
6554 -+
6555 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
6556 -+
6557 -+ prev = leaf;
6558 -+ leaf = next;
6559 -+ }
6560 -+
6561 -+ BUG_ON(cgroup == NULL && leaf != NULL);
6562 -+ if (cgroup != NULL && prev != NULL) {
6563 -+ bgrp = cgroup_to_bfqio(cgroup);
6564 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6565 -+ bfq_group_set_parent(prev, bfqg);
6566 -+ }
6567 -+}
6568 -+
6569 -+/**
6570 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
6571 -+ * @bfqd: queue descriptor.
6572 -+ * @cgroup: cgroup being searched for.
6573 -+ *
6574 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
6575 -+ * necessary. When a group is returned all the cgroups in the path
6576 -+ * to the root have a group associated to @bfqd.
6577 -+ *
6578 -+ * If the allocation fails, return the root group: this breaks guarantees
6579 -+ * but is a safe fallback. If this loss becomes a problem it can be
6580 -+ * mitigated using the equivalent weight (given by the product of the
6581 -+ * weights of the groups in the path from @group to the root) in the
6582 -+ * root scheduler.
6583 -+ *
6584 -+ * We allocate all the missing nodes in the path from the leaf cgroup
6585 -+ * to the root and we connect the nodes only after all the allocations
6586 -+ * have been successful.
6587 -+ */
6588 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
6589 -+ struct cgroup *cgroup)
6590 -+{
6591 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
6592 -+ struct bfq_group *bfqg;
6593 -+
6594 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6595 -+ if (bfqg != NULL)
6596 -+ return bfqg;
6597 -+
6598 -+ bfqg = bfq_group_chain_alloc(bfqd, cgroup);
6599 -+ if (bfqg != NULL)
6600 -+ bfq_group_chain_link(bfqd, cgroup, bfqg);
6601 -+ else
6602 -+ bfqg = bfqd->root_group;
6603 -+
6604 -+ return bfqg;
6605 -+}
6606 -+
6607 -+/**
6608 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
6609 -+ * @bfqd: queue descriptor.
6610 -+ * @bfqq: the queue to move.
6611 -+ * @entity: @bfqq's entity.
6612 -+ * @bfqg: the group to move to.
6613 -+ *
6614 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
6615 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
6616 -+ *
6617 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
6618 -+ * not disappear (for now this just means that we are called under
6619 -+ * rcu_read_lock()).
6620 -+ */
6621 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6622 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
6623 -+{
6624 -+ int busy, resume;
6625 -+
6626 -+ busy = bfq_bfqq_busy(bfqq);
6627 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
6628 -+
6629 -+ BUG_ON(resume && !entity->on_st);
6630 -+ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
6631 -+
6632 -+ if (busy) {
6633 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
6634 -+
6635 -+ if (!resume)
6636 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
6637 -+ else
6638 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
6639 -+ } else if (entity->on_st)
6640 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
6641 -+
6642 -+ /*
6643 -+ * Here we use a reference to bfqg. We don't need a refcounter
6644 -+ * as the cgroup reference will not be dropped, so that its
6645 -+ * destroy() callback will not be invoked.
6646 -+ */
6647 -+ entity->parent = bfqg->my_entity;
6648 -+ entity->sched_data = &bfqg->sched_data;
6649 -+
6650 -+ if (busy && resume)
6651 -+ bfq_activate_bfqq(bfqd, bfqq);
6652 -+
6653 -+ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver)
6654 -+ bfq_schedule_dispatch(bfqd);
6655 -+}
6656 -+
6657 -+/**
6658 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
6659 -+ * @bfqd: the queue descriptor.
6660 -+ * @bic: the bic to move.
6661 -+ * @cgroup: the cgroup to move to.
6662 -+ *
6663 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
6664 -+ * has to make sure that the reference to cgroup is valid across the call.
6665 -+ *
6666 -+ * NOTE: an alternative approach might have been to store the current
6667 -+ * cgroup in bfqq and get a reference to it, reducing the lookup
6668 -+ * time here, at the price of slightly more complex code.
6669 -+ */
6670 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
6671 -+ struct bfq_io_cq *bic,
6672 -+ struct cgroup *cgroup)
6673 -+{
6674 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
6675 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
6676 -+ struct bfq_entity *entity;
6677 -+ struct bfq_group *bfqg;
6678 -+ struct bfqio_cgroup *bgrp;
6679 -+
6680 -+ bgrp = cgroup_to_bfqio(cgroup);
6681 -+
6682 -+ bfqg = bfq_find_alloc_group(bfqd, cgroup);
6683 -+ if (async_bfqq != NULL) {
6684 -+ entity = &async_bfqq->entity;
6685 -+
6686 -+ if (entity->sched_data != &bfqg->sched_data) {
6687 -+ bic_set_bfqq(bic, NULL, 0);
6688 -+ bfq_log_bfqq(bfqd, async_bfqq,
6689 -+ "bic_change_group: %p %d",
6690 -+ async_bfqq, atomic_read(&async_bfqq->ref));
6691 -+ bfq_put_queue(async_bfqq);
6692 -+ }
6693 -+ }
6694 -+
6695 -+ if (sync_bfqq != NULL) {
6696 -+ entity = &sync_bfqq->entity;
6697 -+ if (entity->sched_data != &bfqg->sched_data)
6698 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
6699 -+ }
6700 -+
6701 -+ return bfqg;
6702 -+}
6703 -+
6704 -+/**
6705 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
6706 -+ * @bic: the bic being migrated.
6707 -+ * @cgroup: the destination cgroup.
6708 -+ *
6709 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
6710 -+ * moved into its new parent group.
6711 -+ */
6712 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
6713 -+ struct cgroup *cgroup)
6714 -+{
6715 -+ struct bfq_data *bfqd;
6716 -+ unsigned long uninitialized_var(flags);
6717 -+
6718 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
6719 -+ if (bfqd != NULL) {
6720 -+ __bfq_bic_change_cgroup(bfqd, bic, cgroup);
6721 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6722 -+ }
6723 -+}
6724 -+
6725 -+/**
6726 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
6727 -+ * @bic: the @bic to update.
6728 -+ *
6729 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
6730 -+ * We need this in addition to moving bics during the cgroup attach
6731 -+ * phase because the task owning @bic could be at its first disk
6732 -+ * access or we may end up in the root cgroup as the result of a
6733 -+ * memory allocation failure and here we try to move to the right
6734 -+ * group.
6735 -+ *
6736 -+ * Must be called under the queue lock. It is safe to use the returned
6737 -+ * value even after the rcu_read_unlock() as the migration/destruction
6738 -+ * paths act under the queue lock too. IOW it is impossible to race with
6739 -+ * group migration/destruction and end up with an invalid group as:
6740 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
6741 -+ * has started execution, as current holds a reference to it,
6742 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
6743 -+ * migrated to a different cgroup] its attach() callback will have
6744 -+ * taken care of removing all the references to the old cgroup data.
6745 -+ */
6746 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
6747 -+{
6748 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
6749 -+ struct bfq_group *bfqg;
6750 -+ struct cgroup *cgroup;
6751 -+
6752 -+ BUG_ON(bfqd == NULL);
6753 -+
6754 -+ rcu_read_lock();
6755 -+ cgroup = task_cgroup(current, bfqio_subsys_id);
6756 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
6757 -+ rcu_read_unlock();
6758 -+
6759 -+ return bfqg;
6760 -+}
6761 -+
6762 -+/**
6763 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
6764 -+ * @st: the service tree being flushed.
6765 -+ */
6766 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
6767 -+{
6768 -+ struct bfq_entity *entity = st->first_idle;
6769 -+
6770 -+ for (; entity != NULL; entity = st->first_idle)
6771 -+ __bfq_deactivate_entity(entity, 0);
6772 -+}
6773 -+
6774 -+/**
6775 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
6776 -+ * @bfqd: the device data structure with the root group.
6777 -+ * @entity: the entity to move.
6778 -+ */
6779 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
6780 -+ struct bfq_entity *entity)
6781 -+{
6782 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
6783 -+
6784 -+ BUG_ON(bfqq == NULL);
6785 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
6786 -+ return;
6787 -+}
6788 -+
6789 -+/**
6790 -+ * bfq_reparent_active_entities - move to the root group all active entities.
6791 -+ * @bfqd: the device data structure with the root group.
6792 -+ * @bfqg: the group to move from.
6793 -+ * @st: the service tree with the entities.
6794 -+ *
6795 -+ * Needs queue_lock to be taken and reference to be valid over the call.
6796 -+ */
6797 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
6798 -+ struct bfq_group *bfqg,
6799 -+ struct bfq_service_tree *st)
6800 -+{
6801 -+ struct rb_root *active = &st->active;
6802 -+ struct bfq_entity *entity = NULL;
6803 -+
6804 -+ if (!RB_EMPTY_ROOT(&st->active))
6805 -+ entity = bfq_entity_of(rb_first(active));
6806 -+
6807 -+ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active)))
6808 -+ bfq_reparent_leaf_entity(bfqd, entity);
6809 -+
6810 -+ if (bfqg->sched_data.active_entity != NULL)
6811 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
6812 -+
6813 -+ return;
6814 -+}
6815 -+
6816 -+/**
6817 -+ * bfq_destroy_group - destroy @bfqg.
6818 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
6819 -+ * @bfqg: the group being destroyed.
6820 -+ *
6821 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
6822 -+ */
6823 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
6824 -+{
6825 -+ struct bfq_data *bfqd;
6826 -+ struct bfq_service_tree *st;
6827 -+ struct bfq_entity *entity = bfqg->my_entity;
6828 -+ unsigned long uninitialized_var(flags);
6829 -+ int i;
6830 -+
6831 -+ hlist_del(&bfqg->group_node);
6832 -+
6833 -+ /*
6834 -+ * Empty all service_trees belonging to this group before deactivating
6835 -+ * the group itself.
6836 -+ */
6837 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
6838 -+ st = bfqg->sched_data.service_tree + i;
6839 -+
6840 -+ /*
6841 -+ * The idle tree may still contain bfq_queues belonging
6842 -+ * to exited tasks because they never migrated to a different
6843 -+ * cgroup from the one being destroyed now. No one else
6844 -+ * can access them so it's safe to act without any lock.
6845 -+ */
6846 -+ bfq_flush_idle_tree(st);
6847 -+
6848 -+ /*
6849 -+ * It may happen that some queues are still active
6850 -+ * (busy) upon group destruction (if the corresponding
6851 -+ * processes have been forced to terminate). We move
6852 -+ * all the leaf entities corresponding to these queues
6853 -+ * to the root_group.
6854 -+ * Also, it may happen that the group has an entity
6855 -+ * under service, which is disconnected from the active
6856 -+ * tree: it must be moved, too.
6857 -+ * There is no need to put the sync queues, as the
6858 -+ * scheduler has taken no reference.
6859 -+ */
6860 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6861 -+ if (bfqd != NULL) {
6862 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
6863 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6864 -+ }
6865 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
6866 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
6867 -+ }
6868 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
6869 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
6870 -+
6871 -+ /*
6872 -+ * We may race with device destruction, take extra care when
6873 -+ * dereferencing bfqg->bfqd.
6874 -+ */
6875 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6876 -+ if (bfqd != NULL) {
6877 -+ hlist_del(&bfqg->bfqd_node);
6878 -+ __bfq_deactivate_entity(entity, 0);
6879 -+ bfq_put_async_queues(bfqd, bfqg);
6880 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6881 -+ }
6882 -+ BUG_ON(entity->tree != NULL);
6883 -+
6884 -+ /*
6885 -+ * No need to defer the kfree() to the end of the RCU grace
6886 -+ * period: we are called from the destroy() callback of our
6887 -+ * cgroup, so we can be sure that no one is a) still using
6888 -+ * this cgroup or b) doing lookups in it.
6889 -+ */
6890 -+ kfree(bfqg);
6891 -+}
6892 -+
6893 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
6894 -+{
6895 -+ struct hlist_node *tmp;
6896 -+ struct bfq_group *bfqg;
6897 -+
6898 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
6899 -+ bfq_end_raising_async_queues(bfqd, bfqg);
6900 -+}
6901 -+
6902 -+/**
6903 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
6904 -+ * @bfqd: the device descriptor being exited.
6905 -+ *
6906 -+ * When the device exits we just make sure that no lookup can return
6907 -+ * the now unused group structures. They will be deallocated on cgroup
6908 -+ * destruction.
6909 -+ */
6910 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
6911 -+{
6912 -+ struct hlist_node *tmp;
6913 -+ struct bfq_group *bfqg;
6914 -+
6915 -+ bfq_log(bfqd, "disconnect_groups beginning");
6916 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
6917 -+ hlist_del(&bfqg->bfqd_node);
6918 -+
6919 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
6920 -+
6921 -+ /*
6922 -+ * Don't remove from the group hash, just set an
6923 -+ * invalid key. No lookups can race with the
6924 -+ * assignment as bfqd is being destroyed; this
6925 -+ * implies also that new elements cannot be added
6926 -+ * to the list.
6927 -+ */
6928 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
6929 -+
6930 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
6931 -+ bfqg);
6932 -+ bfq_put_async_queues(bfqd, bfqg);
6933 -+ }
6934 -+}
6935 -+
6936 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
6937 -+{
6938 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
6939 -+ struct bfq_group *bfqg = bfqd->root_group;
6940 -+
6941 -+ bfq_put_async_queues(bfqd, bfqg);
6942 -+
6943 -+ spin_lock_irq(&bgrp->lock);
6944 -+ hlist_del_rcu(&bfqg->group_node);
6945 -+ spin_unlock_irq(&bgrp->lock);
6946 -+
6947 -+ /*
6948 -+ * No need to synchronize_rcu() here: since the device is gone
6949 -+ * there cannot be any read-side access to its root_group.
6950 -+ */
6951 -+ kfree(bfqg);
6952 -+}
6953 -+
6954 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
6955 -+{
6956 -+ struct bfq_group *bfqg;
6957 -+ struct bfqio_cgroup *bgrp;
6958 -+ int i;
6959 -+
6960 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
6961 -+ if (bfqg == NULL)
6962 -+ return NULL;
6963 -+
6964 -+ bfqg->entity.parent = NULL;
6965 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6966 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6967 -+
6968 -+ bgrp = &bfqio_root_cgroup;
6969 -+ spin_lock_irq(&bgrp->lock);
6970 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
6971 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
6972 -+ spin_unlock_irq(&bgrp->lock);
6973 -+
6974 -+ return bfqg;
6975 -+}
6976 -+
6977 -+#define SHOW_FUNCTION(__VAR) \
6978 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
6979 -+ struct cftype *cftype) \
6980 -+{ \
6981 -+ struct bfqio_cgroup *bgrp; \
6982 -+ u64 ret = -ENODEV; \
6983 -+ \
6984 -+ mutex_lock(&bfqio_mutex); \
6985 -+ if (bfqio_is_removed(cgroup)) \
6986 -+ goto out_unlock; \
6987 -+ \
6988 -+ bgrp = cgroup_to_bfqio(cgroup); \
6989 -+ spin_lock_irq(&bgrp->lock); \
6990 -+ ret = bgrp->__VAR; \
6991 -+ spin_unlock_irq(&bgrp->lock); \
6992 -+ \
6993 -+out_unlock: \
6994 -+ mutex_unlock(&bfqio_mutex); \
6995 -+ return ret; \
6996 -+}
6997 -+
6998 -+SHOW_FUNCTION(weight);
6999 -+SHOW_FUNCTION(ioprio);
7000 -+SHOW_FUNCTION(ioprio_class);
7001 -+#undef SHOW_FUNCTION
7002 -+
7003 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
7004 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
7005 -+ struct cftype *cftype, \
7006 -+ u64 val) \
7007 -+{ \
7008 -+ struct bfqio_cgroup *bgrp; \
7009 -+ struct bfq_group *bfqg; \
7010 -+ int ret = -EINVAL; \
7011 -+ \
7012 -+ if (val < (__MIN) || val > (__MAX)) \
7013 -+ return ret; \
7014 -+ \
7015 -+ ret = -ENODEV; \
7016 -+ mutex_lock(&bfqio_mutex); \
7017 -+ if (bfqio_is_removed(cgroup)) \
7018 -+ goto out_unlock; \
7019 -+ ret = 0; \
7020 -+ \
7021 -+ bgrp = cgroup_to_bfqio(cgroup); \
7022 -+ \
7023 -+ spin_lock_irq(&bgrp->lock); \
7024 -+ bgrp->__VAR = (unsigned short)val; \
7025 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
7026 -+ /* \
7027 -+ * Setting the ioprio_changed flag of the entity \
7028 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
7029 -+ * the value of the weight to its ioprio mapping. \
7030 -+ * Set the flag only if necessary. \
7031 -+ */ \
7032 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
7033 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
7034 -+ smp_wmb(); \
7035 -+ bfqg->entity.ioprio_changed = 1; \
7036 -+ } \
7037 -+ } \
7038 -+ spin_unlock_irq(&bgrp->lock); \
7039 -+ \
7040 -+out_unlock: \
7041 -+ mutex_unlock(&bfqio_mutex); \
7042 -+ return ret; \
7043 -+}
7044 -+
7045 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
7046 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
7047 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
7048 -+#undef STORE_FUNCTION
7049 -+
7050 -+static struct cftype bfqio_files[] = {
7051 -+ {
7052 -+ .name = "weight",
7053 -+ .read_u64 = bfqio_cgroup_weight_read,
7054 -+ .write_u64 = bfqio_cgroup_weight_write,
7055 -+ },
7056 -+ {
7057 -+ .name = "ioprio",
7058 -+ .read_u64 = bfqio_cgroup_ioprio_read,
7059 -+ .write_u64 = bfqio_cgroup_ioprio_write,
7060 -+ },
7061 -+ {
7062 -+ .name = "ioprio_class",
7063 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
7064 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
7065 -+ },
7066 -+ { }, /* terminate */
7067 -+};
7068 -+
7069 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
7070 -+{
7071 -+ struct bfqio_cgroup *bgrp;
7072 -+
7073 -+ if (cgroup->parent != NULL) {
7074 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
7075 -+ if (bgrp == NULL)
7076 -+ return ERR_PTR(-ENOMEM);
7077 -+ } else
7078 -+ bgrp = &bfqio_root_cgroup;
7079 -+
7080 -+ spin_lock_init(&bgrp->lock);
7081 -+ INIT_HLIST_HEAD(&bgrp->group_data);
7082 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
7083 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
7084 -+
7085 -+ return &bgrp->css;
7086 -+}
7087 -+
7088 -+/*
7089 -+ * We cannot support shared io contexts, as we have no means to support
7090 -+ * two tasks with the same ioc in two different groups without major rework
7091 -+ * of the main bic/bfqq data structures. For now we allow a task to change
7092 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
7093 -+ * behavior is that a group containing a task that forked using CLONE_IO
7094 -+ * will not be destroyed until the tasks sharing the ioc die.
7095 -+ */
7096 -+static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
7097 -+{
7098 -+ struct task_struct *task;
7099 -+ struct io_context *ioc;
7100 -+ int ret = 0;
7101 -+
7102 -+ cgroup_taskset_for_each(task, cgroup, tset) {
7103 -+ /* task_lock() is needed to avoid races with exit_io_context() */
7104 -+ task_lock(task);
7105 -+ ioc = task->io_context;
7106 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
7107 -+ /*
7108 -+ * ioc == NULL means that the task is either too young or
7109 -+ * exiting: if it has still no ioc the ioc can't be shared,
7110 -+ * if the task is exiting the attach will fail anyway, no
7111 -+ * matter what we return here.
7112 -+ */
7113 -+ ret = -EINVAL;
7114 -+ task_unlock(task);
7115 -+ if (ret)
7116 -+ break;
7117 -+ }
7118 -+
7119 -+ return ret;
7120 -+}
7121 -+
7122 -+static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
7123 -+{
7124 -+ struct task_struct *task;
7125 -+ struct io_context *ioc;
7126 -+ struct io_cq *icq;
7127 -+
7128 -+ /*
7129 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
7130 -+ * new group has not yet been tested.
7131 -+ */
7132 -+ cgroup_taskset_for_each(task, cgroup, tset) {
7133 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
7134 -+ if (ioc) {
7135 -+ /*
7136 -+ * Handle cgroup change here.
7137 -+ */
7138 -+ rcu_read_lock();
7139 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
7140 -+ if (!strncmp(icq->q->elevator->type->elevator_name,
7141 -+ "bfq", ELV_NAME_MAX))
7142 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
7143 -+ cgroup);
7144 -+ rcu_read_unlock();
7145 -+ put_io_context(ioc);
7146 -+ }
7147 -+ }
7148 -+}
7149 -+
7150 -+static void bfqio_destroy(struct cgroup *cgroup)
7151 -+{
7152 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
7153 -+ struct hlist_node *tmp;
7154 -+ struct bfq_group *bfqg;
7155 -+
7156 -+ /*
7157 -+ * Since we are destroying the cgroup, there are no more tasks
7158 -+ * referencing it, and all the RCU grace periods that may have
7159 -+ * referenced it are ended (as the destruction of the parent
7160 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
7161 -+ * anything else and we don't need any synchronization.
7162 -+ */
7163 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
7164 -+ bfq_destroy_group(bgrp, bfqg);
7165 -+
7166 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
7167 -+
7168 -+ kfree(bgrp);
7169 -+}
7170 -+
7171 -+struct cgroup_subsys bfqio_subsys = {
7172 -+ .name = "bfqio",
7173 -+ .css_alloc = bfqio_create,
7174 -+ .can_attach = bfqio_can_attach,
7175 -+ .attach = bfqio_attach,
7176 -+ .css_free = bfqio_destroy,
7177 -+ .subsys_id = bfqio_subsys_id,
7178 -+ .base_cftypes = bfqio_files,
7179 -+};
7180 -+#else
7181 -+static inline void bfq_init_entity(struct bfq_entity *entity,
7182 -+ struct bfq_group *bfqg)
7183 -+{
7184 -+ entity->weight = entity->new_weight;
7185 -+ entity->orig_weight = entity->new_weight;
7186 -+ entity->ioprio = entity->new_ioprio;
7187 -+ entity->ioprio_class = entity->new_ioprio_class;
7188 -+ entity->sched_data = &bfqg->sched_data;
7189 -+}
7190 -+
7191 -+static inline struct bfq_group *
7192 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7193 -+{
7194 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
7195 -+ return bfqd->root_group;
7196 -+}
7197 -+
7198 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
7199 -+ struct bfq_queue *bfqq,
7200 -+ struct bfq_entity *entity,
7201 -+ struct bfq_group *bfqg)
7202 -+{
7203 -+}
7204 -+
7205 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
7206 -+{
7207 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
7208 -+}
7209 -+
7210 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
7211 -+{
7212 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
7213 -+}
7214 -+
7215 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
7216 -+{
7217 -+ kfree(bfqd->root_group);
7218 -+}
7219 -+
7220 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7221 -+{
7222 -+ struct bfq_group *bfqg;
7223 -+ int i;
7224 -+
7225 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7226 -+ if (bfqg == NULL)
7227 -+ return NULL;
7228 -+
7229 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7230 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7231 -+
7232 -+ return bfqg;
7233 -+}
7234 -+#endif
7235 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
7236 -new file mode 100644
7237 -index 0000000..326e3ec
7238 ---- /dev/null
7239 -+++ b/block/bfq-ioc.c
7240 -@@ -0,0 +1,36 @@
7241 -+/*
7242 -+ * BFQ: I/O context handling.
7243 -+ *
7244 -+ * Based on ideas and code from CFQ:
7245 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7246 -+ *
7247 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7248 -+ * Paolo Valente <paolo.valente@×××××××.it>
7249 -+ *
7250 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7251 -+ */
7252 -+
7253 -+/**
7254 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
7255 -+ * @icq: the iocontext queue.
7256 -+ */
7257 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
7258 -+{
7259 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
7260 -+ return container_of(icq, struct bfq_io_cq, icq);
7261 -+}
7262 -+
7263 -+/**
7264 -+ * bfq_bic_lookup - search @ioc for a bic associated to @bfqd.
7265 -+ * @bfqd: the lookup key.
7266 -+ * @ioc: the io_context of the process doing I/O.
7267 -+ *
7268 -+ * Queue lock must be held.
7269 -+ */
7270 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
7271 -+ struct io_context *ioc)
7272 -+{
7273 -+ if (ioc)
7274 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
7275 -+ return NULL;
7276 -+}
7277 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7278 -new file mode 100644
7279 -index 0000000..0ed2746
7280 ---- /dev/null
7281 -+++ b/block/bfq-iosched.c
7282 -@@ -0,0 +1,3082 @@
7283 -+/*
7284 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
7285 -+ *
7286 -+ * Based on ideas and code from CFQ:
7287 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7288 -+ *
7289 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7290 -+ * Paolo Valente <paolo.valente@×××××××.it>
7291 -+ *
7292 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7293 -+ *
7294 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7295 -+ *
7296 -+ * BFQ is a proportional share disk scheduling algorithm based on the
7297 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
7298 -+ * measured in number of sectors, to tasks instead of time slices.
7299 -+ * The disk is not granted to the active task for a given time slice,
7300 -+ * but until it has exhausted its assigned budget. This change from
7301 -+ * the time to the service domain allows BFQ to distribute the disk
7302 -+ * bandwidth among tasks as desired, without any distortion due to
7303 -+ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
7304 -+ * internal scheduler, called B-WF2Q+, to schedule tasks according to
7305 -+ * their budgets. Thanks to this accurate scheduler, BFQ can afford
7306 -+ * to assign high budgets to disk-bound non-seeky tasks (to boost the
7307 -+ * throughput), and yet guarantee low latencies to interactive and
7308 -+ * soft real-time applications.
7309 -+ *
7310 -+ * BFQ has been introduced in [1], where the interested reader can
7311 -+ * find an accurate description of the algorithm, the bandwidth
7312 -+ * distribution and latency guarantees it provides, plus formal proofs
7313 -+ * of all the properties. With respect to the algorithm presented in
7314 -+ * the paper, this implementation adds several little heuristics, and
7315 -+ * a hierarchical extension, based on H-WF2Q+.
7316 -+ *
7317 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
7318 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
7319 -+ * complexity derives from the one introduced with EEVDF in [3].
7320 -+ *
7321 -+ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
7322 -+ * with Deterministic Guarantees on Bandwidth Distribution,'',
7323 -+ * IEEE Transactions on Computers, May 2010.
7324 -+ *
7325 -+ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
7326 -+ *
7327 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
7328 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
7329 -+ * Oct 1997.
7330 -+ *
7331 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
7332 -+ *
7333 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
7334 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
7335 -+ * Resource Allocation,'' technical report.
7336 -+ *
7337 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
7338 -+ */
7339 -+#include <linux/module.h>
7340 -+#include <linux/slab.h>
7341 -+#include <linux/blkdev.h>
7342 -+#include <linux/cgroup.h>
7343 -+#include <linux/elevator.h>
7344 -+#include <linux/jiffies.h>
7345 -+#include <linux/rbtree.h>
7346 -+#include <linux/ioprio.h>
7347 -+#include "bfq.h"
7348 -+#include "blk.h"
7349 -+
7350 -+/* Max number of dispatches in one round of service. */
7351 -+static const int bfq_quantum = 4;
7352 -+
7353 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
7354 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
7355 -+
7356 -+/* Maximum backwards seek, in KiB. */
7357 -+static const int bfq_back_max = 16 * 1024;
7358 -+
7359 -+/* Penalty of a backwards seek, in number of sectors. */
7360 -+static const int bfq_back_penalty = 2;
7361 -+
7362 -+/* Idling period duration, in jiffies. */
7363 -+static int bfq_slice_idle = HZ / 125;
7364 -+
7365 -+/* Default maximum budget values, in sectors and number of requests. */
7366 -+static const int bfq_default_max_budget = 16 * 1024;
7367 -+static const int bfq_max_budget_async_rq = 4;
7368 -+
7369 -+/*
7370 -+ * Async to sync throughput distribution is controlled as follows:
7371 -+ * when an async request is served, the entity is charged the number
7372 -+ * of sectors of the request, multiplied by the factor below
7373 -+ */
7374 -+static const int bfq_async_charge_factor = 10;
7375 -+
7376 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
7377 -+static const int bfq_timeout_sync = HZ / 8;
7378 -+static int bfq_timeout_async = HZ / 25;
7379 -+
7380 -+struct kmem_cache *bfq_pool;
7381 -+
7382 -+/* Below this threshold (in ms), we consider thinktime immediate. */
7383 -+#define BFQ_MIN_TT 2
7384 -+
7385 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
7386 -+#define BFQ_HW_QUEUE_THRESHOLD 4
7387 -+#define BFQ_HW_QUEUE_SAMPLES 32
7388 -+
7389 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
7390 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
7391 -+
7392 -+/* Min samples used for peak rate estimation (for autotuning). */
7393 -+#define BFQ_PEAK_RATE_SAMPLES 32
7394 -+
7395 -+/* Shift used for peak rate fixed precision calculations. */
7396 -+#define BFQ_RATE_SHIFT 16
7397 -+
7398 -+/*
7399 -+ * The duration of the weight raising for interactive applications is
7400 -+ * computed automatically (as default behaviour), using the following
7401 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
7402 -+ * disk, and R and T are two reference parameters. In particular, R is
7403 -+ * the peak rate of a reference disk, and T is about the maximum time
7404 -+ * for starting popular large applications on that disk, under BFQ and
7405 -+ * while reading two files in parallel. Finally, BFQ uses two
7406 -+ * different pairs (R, T) depending on whether the disk is rotational
7407 -+ * or non-rotational.
7408 -+ */
7409 -+#define T_rot (msecs_to_jiffies(5500))
7410 -+#define T_nonrot (msecs_to_jiffies(2000))
7411 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
7412 -+#define R_rot 17415
7413 -+#define R_nonrot 34791
7414 -+
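   As a rough sketch of the duration = (R / r) * T rule above (standalone and
   illustrative only; the peak rates below are made-up values, and
   bfq_wrais_duration() later in this file computes the same quotient from
   RT_prod, presumably the precomputed R * T, divided by the measured
   peak_rate):

       #include <stdio.h>

       #define R_ROT    17415ULL  /* reference rate from above */
       #define T_ROT_MS 5500ULL   /* reference time from above, in ms */

       /* duration = (R / r) * T, evaluated as R * T / r to keep precision */
       static unsigned long long raising_duration_ms(unsigned long long peak_rate)
       {
               return R_ROT * T_ROT_MS / peak_rate;
       }

       int main(void)
       {
               /* a disk half as fast as the reference gets twice the duration */
               printf("r = R/2: %llu ms\n", raising_duration_ms(R_ROT / 2));
               printf("r = R:   %llu ms\n", raising_duration_ms(R_ROT));
               return 0;
       }
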
7415 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
7416 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
7417 -+
7418 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
7419 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
7420 -+
7421 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
7422 -+
7423 -+#include "bfq-ioc.c"
7424 -+#include "bfq-sched.c"
7425 -+#include "bfq-cgroup.c"
7426 -+
7427 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
7428 -+ IOPRIO_CLASS_IDLE)
7429 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
7430 -+ IOPRIO_CLASS_RT)
7431 -+
7432 -+#define bfq_sample_valid(samples) ((samples) > 80)
7433 -+
7434 -+/*
7435 -+ * We regard a request as SYNC if it is either a read or has the SYNC bit
7436 -+ * set (in which case it could also be a direct WRITE).
7437 -+ */
7438 -+static inline int bfq_bio_sync(struct bio *bio)
7439 -+{
7440 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
7441 -+ return 1;
7442 -+
7443 -+ return 0;
7444 -+}
7445 -+
7446 -+/*
7447 -+ * Scheduler run of queue, if there are requests pending and no one in the
7448 -+ * driver that will restart queueing.
7449 -+ */
7450 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
7451 -+{
7452 -+ if (bfqd->queued != 0) {
7453 -+ bfq_log(bfqd, "schedule dispatch");
7454 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
7455 -+ }
7456 -+}
7457 -+
7458 -+/*
7459 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
7460 -+ * We choose the request that is closest to the head right now. Distance
7461 -+ * behind the head is penalized and only allowed to a certain extent.
7462 -+ */
7463 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
7464 -+ struct request *rq1,
7465 -+ struct request *rq2,
7466 -+ sector_t last)
7467 -+{
7468 -+ sector_t s1, s2, d1 = 0, d2 = 0;
7469 -+ unsigned long back_max;
7470 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
7471 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
7472 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
7473 -+
7474 -+ if (rq1 == NULL || rq1 == rq2)
7475 -+ return rq2;
7476 -+ if (rq2 == NULL)
7477 -+ return rq1;
7478 -+
7479 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
7480 -+ return rq1;
7481 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
7482 -+ return rq2;
7483 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
7484 -+ return rq1;
7485 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
7486 -+ return rq2;
7487 -+
7488 -+ s1 = blk_rq_pos(rq1);
7489 -+ s2 = blk_rq_pos(rq2);
7490 -+
7491 -+ /*
7492 -+ * By definition, 1KiB is 2 sectors.
7493 -+ */
7494 -+ back_max = bfqd->bfq_back_max * 2;
7495 -+
7496 -+ /*
7497 -+ * Strict one way elevator _except_ in the case where we allow
7498 -+ * short backward seeks which are biased as twice the cost of a
7499 -+ * similar forward seek.
7500 -+ */
7501 -+ if (s1 >= last)
7502 -+ d1 = s1 - last;
7503 -+ else if (s1 + back_max >= last)
7504 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
7505 -+ else
7506 -+ wrap |= BFQ_RQ1_WRAP;
7507 -+
7508 -+ if (s2 >= last)
7509 -+ d2 = s2 - last;
7510 -+ else if (s2 + back_max >= last)
7511 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
7512 -+ else
7513 -+ wrap |= BFQ_RQ2_WRAP;
7514 -+
7515 -+ /* Found required data */
7516 -+
7517 -+ /*
7518 -+ * By doing switch() on the bit mask "wrap" we avoid having to
7519 -+ * check two variables for all permutations: --> faster!
7520 -+ */
7521 -+ switch (wrap) {
7522 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
7523 -+ if (d1 < d2)
7524 -+ return rq1;
7525 -+ else if (d2 < d1)
7526 -+ return rq2;
7527 -+ else {
7528 -+ if (s1 >= s2)
7529 -+ return rq1;
7530 -+ else
7531 -+ return rq2;
7532 -+ }
7533 -+
7534 -+ case BFQ_RQ2_WRAP:
7535 -+ return rq1;
7536 -+ case BFQ_RQ1_WRAP:
7537 -+ return rq2;
7538 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
7539 -+ default:
7540 -+ /*
7541 -+ * Since both rqs are wrapped,
7542 -+ * start with the one that's further behind head
7543 -+ * (--> only *one* back seek required),
7544 -+ * since back seek takes more time than forward.
7545 -+ */
7546 -+ if (s1 <= s2)
7547 -+ return rq1;
7548 -+ else
7549 -+ return rq2;
7550 -+ }
7551 -+}
7552 -+
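   A rough, standalone illustration of the backward-seek penalty applied by
   bfq_choose_req() above, using the default bfq_back_penalty of 2 (the
   positions are made-up numbers, both within the default bfq_back_max):

       #include <stdio.h>

       int main(void)
       {
               unsigned long long last = 100000;        /* head position, sectors */
               unsigned long long s_fwd = last + 1000;  /* 1000 sectors ahead */
               unsigned long long s_back = last - 600;  /* 600 sectors behind */
               unsigned long back_penalty = 2;          /* default bfq_back_penalty */

               unsigned long long d_fwd = s_fwd - last;
               unsigned long long d_back = (last - s_back) * back_penalty;

               /* 1000 < 1200, so the forward request would be chosen */
               printf("forward distance %llu, penalized backward distance %llu\n",
                      d_fwd, d_back);
               return 0;
       }
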
7553 -+static struct bfq_queue *
7554 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
7555 -+ sector_t sector, struct rb_node **ret_parent,
7556 -+ struct rb_node ***rb_link)
7557 -+{
7558 -+ struct rb_node **p, *parent;
7559 -+ struct bfq_queue *bfqq = NULL;
7560 -+
7561 -+ parent = NULL;
7562 -+ p = &root->rb_node;
7563 -+ while (*p) {
7564 -+ struct rb_node **n;
7565 -+
7566 -+ parent = *p;
7567 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7568 -+
7569 -+ /*
7570 -+ * Sort strictly based on sector. Smallest to the left,
7571 -+ * largest to the right.
7572 -+ */
7573 -+ if (sector > blk_rq_pos(bfqq->next_rq))
7574 -+ n = &(*p)->rb_right;
7575 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
7576 -+ n = &(*p)->rb_left;
7577 -+ else
7578 -+ break;
7579 -+ p = n;
7580 -+ bfqq = NULL;
7581 -+ }
7582 -+
7583 -+ *ret_parent = parent;
7584 -+ if (rb_link)
7585 -+ *rb_link = p;
7586 -+
7587 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
7588 -+ (long long unsigned)sector,
7589 -+ bfqq != NULL ? bfqq->pid : 0);
7590 -+
7591 -+ return bfqq;
7592 -+}
7593 -+
7594 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7595 -+{
7596 -+ struct rb_node **p, *parent;
7597 -+ struct bfq_queue *__bfqq;
7598 -+
7599 -+ if (bfqq->pos_root != NULL) {
7600 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7601 -+ bfqq->pos_root = NULL;
7602 -+ }
7603 -+
7604 -+ if (bfq_class_idle(bfqq))
7605 -+ return;
7606 -+ if (!bfqq->next_rq)
7607 -+ return;
7608 -+
7609 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
7610 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
7611 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
7612 -+ if (__bfqq == NULL) {
7613 -+ rb_link_node(&bfqq->pos_node, parent, p);
7614 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
7615 -+ } else
7616 -+ bfqq->pos_root = NULL;
7617 -+}
7618 -+
7619 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
7620 -+ struct bfq_queue *bfqq,
7621 -+ struct request *last)
7622 -+{
7623 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
7624 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
7625 -+ struct request *next = NULL, *prev = NULL;
7626 -+
7627 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
7628 -+
7629 -+ if (rbprev != NULL)
7630 -+ prev = rb_entry_rq(rbprev);
7631 -+
7632 -+ if (rbnext != NULL)
7633 -+ next = rb_entry_rq(rbnext);
7634 -+ else {
7635 -+ rbnext = rb_first(&bfqq->sort_list);
7636 -+ if (rbnext && rbnext != &last->rb_node)
7637 -+ next = rb_entry_rq(rbnext);
7638 -+ }
7639 -+
7640 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
7641 -+}
7642 -+
7643 -+static void bfq_del_rq_rb(struct request *rq)
7644 -+{
7645 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7646 -+ struct bfq_data *bfqd = bfqq->bfqd;
7647 -+ const int sync = rq_is_sync(rq);
7648 -+
7649 -+ BUG_ON(bfqq->queued[sync] == 0);
7650 -+ bfqq->queued[sync]--;
7651 -+ bfqd->queued--;
7652 -+
7653 -+ elv_rb_del(&bfqq->sort_list, rq);
7654 -+
7655 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
7656 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
7657 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
7658 -+ /*
7659 -+ * Remove queue from request-position tree as it is empty.
7660 -+ */
7661 -+ if (bfqq->pos_root != NULL) {
7662 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7663 -+ bfqq->pos_root = NULL;
7664 -+ }
7665 -+ }
7666 -+}
7667 -+
7668 -+/* see the definition of bfq_async_charge_factor for details */
7669 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
7670 -+ struct bfq_queue *bfqq)
7671 -+{
7672 -+ return blk_rq_sectors(rq) *
7673 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
7674 -+ bfq_async_charge_factor));
7675 -+}
7676 -+
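   A rough restatement of the charge computed by bfq_serv_to_charge() above,
   as a standalone sketch using the default bfq_async_charge_factor of 10
   (the request sizes are arbitrary examples):

       #include <stdio.h>

       static unsigned long serv_to_charge(unsigned long sectors, int sync,
                                           int raising_coeff)
       {
               const int async_charge_factor = 10;  /* default from above */

               /* async requests of non-weight-raised queues cost 11x their size */
               return sectors * (1 + (!sync) * (raising_coeff == 1) *
                                 async_charge_factor);
       }

       int main(void)
       {
               printf("sync,  16 sectors: charge %lu\n", serv_to_charge(16, 1, 1));
               printf("async, 16 sectors: charge %lu\n", serv_to_charge(16, 0, 1));
               return 0;
       }
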
7677 -+/**
7678 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
7679 -+ * @bfqd: the device data the queue belongs to.
7680 -+ * @bfqq: the queue to update.
7681 -+ *
7682 -+ * If the first request of a queue changes we make sure that the queue
7683 -+ * has enough budget to serve at least its first request (if the
7684 -+ * request has grown). We do this because if the queue has not enough
7685 -+ * budget for its first request, it has to go through two dispatch
7686 -+ * rounds to actually get it dispatched.
7687 -+ */
7688 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
7689 -+ struct bfq_queue *bfqq)
7690 -+{
7691 -+ struct bfq_entity *entity = &bfqq->entity;
7692 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
7693 -+ struct request *next_rq = bfqq->next_rq;
7694 -+ unsigned long new_budget;
7695 -+
7696 -+ if (next_rq == NULL)
7697 -+ return;
7698 -+
7699 -+ if (bfqq == bfqd->active_queue)
7700 -+ /*
7701 -+ * In order not to break guarantees, budgets cannot be
7702 -+ * changed after an entity has been selected.
7703 -+ */
7704 -+ return;
7705 -+
7706 -+ BUG_ON(entity->tree != &st->active);
7707 -+ BUG_ON(entity == entity->sched_data->active_entity);
7708 -+
7709 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
7710 -+ bfq_serv_to_charge(next_rq, bfqq));
7711 -+ entity->budget = new_budget;
7712 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
7713 -+ bfq_activate_bfqq(bfqd, bfqq);
7714 -+}
7715 -+
7716 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
7717 -+{
7718 -+ u64 dur;
7719 -+
7720 -+ if (bfqd->bfq_raising_max_time > 0)
7721 -+ return bfqd->bfq_raising_max_time;
7722 -+
7723 -+ dur = bfqd->RT_prod;
7724 -+ do_div(dur, bfqd->peak_rate);
7725 -+
7726 -+ return dur;
7727 -+}
7728 -+
7729 -+static void bfq_add_rq_rb(struct request *rq)
7730 -+{
7731 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7732 -+ struct bfq_entity *entity = &bfqq->entity;
7733 -+ struct bfq_data *bfqd = bfqq->bfqd;
7734 -+ struct request *next_rq, *prev;
7735 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
7736 -+ int idle_for_long_time = bfqq->budget_timeout +
7737 -+ bfqd->bfq_raising_min_idle_time < jiffies;
7738 -+
7739 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
7740 -+ bfqq->queued[rq_is_sync(rq)]++;
7741 -+ bfqd->queued++;
7742 -+
7743 -+ elv_rb_add(&bfqq->sort_list, rq);
7744 -+
7745 -+ /*
7746 -+ * Check if this request is a better next-serve candidate.
7747 -+ */
7748 -+ prev = bfqq->next_rq;
7749 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
7750 -+ BUG_ON(next_rq == NULL);
7751 -+ bfqq->next_rq = next_rq;
7752 -+
7753 -+ /*
7754 -+ * Adjust priority tree position, if next_rq changes.
7755 -+ */
7756 -+ if (prev != bfqq->next_rq)
7757 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
7758 -+
7759 -+ if (!bfq_bfqq_busy(bfqq)) {
7760 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
7761 -+ bfqq->soft_rt_next_start < jiffies;
7762 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
7763 -+ bfq_serv_to_charge(next_rq, bfqq));
7764 -+
7765 -+ if (!bfqd->low_latency)
7766 -+ goto add_bfqq_busy;
7767 -+
7768 -+ /*
7769 -+ * If the queue is not being boosted and has been idle
7770 -+ * for enough time, start a weight-raising period
7771 -+ */
7772 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
7773 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7774 -+ if (idle_for_long_time)
7775 -+ bfqq->raising_cur_max_time =
7776 -+ bfq_wrais_duration(bfqd);
7777 -+ else
7778 -+ bfqq->raising_cur_max_time =
7779 -+ bfqd->bfq_raising_rt_max_time;
7780 -+ bfq_log_bfqq(bfqd, bfqq,
7781 -+ "wrais starting at %llu msec,"
7782 -+ "rais_max_time %u",
7783 -+ bfqq->last_rais_start_finish,
7784 -+ jiffies_to_msecs(bfqq->
7785 -+ raising_cur_max_time));
7786 -+ } else if (old_raising_coeff > 1) {
7787 -+ if (idle_for_long_time)
7788 -+ bfqq->raising_cur_max_time =
7789 -+ bfq_wrais_duration(bfqd);
7790 -+ else if (bfqq->raising_cur_max_time ==
7791 -+ bfqd->bfq_raising_rt_max_time &&
7792 -+ !soft_rt) {
7793 -+ bfqq->raising_coeff = 1;
7794 -+ bfq_log_bfqq(bfqd, bfqq,
7795 -+ "wrais ending at %llu msec,"
7796 -+ "rais_max_time %u",
7797 -+ bfqq->last_rais_start_finish,
7798 -+ jiffies_to_msecs(bfqq->
7799 -+ raising_cur_max_time));
7800 -+ }
7801 -+ }
7802 -+ if (old_raising_coeff != bfqq->raising_coeff)
7803 -+ entity->ioprio_changed = 1;
7804 -+add_bfqq_busy:
7805 -+ bfq_add_bfqq_busy(bfqd, bfqq);
7806 -+ } else {
7807 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
7808 -+ !rq_is_sync(rq) &&
7809 -+ bfqq->last_rais_start_finish +
7810 -+ bfqd->bfq_raising_min_inter_arr_async < jiffies) {
7811 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7812 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
7813 -+
7814 -+ entity->ioprio_changed = 1;
7815 -+ bfq_log_bfqq(bfqd, bfqq,
7816 -+ "non-idle wrais starting at %llu msec,"
7817 -+ "rais_max_time %u",
7818 -+ bfqq->last_rais_start_finish,
7819 -+ jiffies_to_msecs(bfqq->
7820 -+ raising_cur_max_time));
7821 -+ }
7822 -+ bfq_updated_next_req(bfqd, bfqq);
7823 -+ }
7824 -+
7825 -+ if (bfqd->low_latency &&
7826 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
7827 -+ idle_for_long_time))
7828 -+ bfqq->last_rais_start_finish = jiffies;
7829 -+}
7830 -+
7831 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
7832 -+{
7833 -+ elv_rb_del(&bfqq->sort_list, rq);
7834 -+ bfqq->queued[rq_is_sync(rq)]--;
7835 -+ bfqq->bfqd->queued--;
7836 -+ bfq_add_rq_rb(rq);
7837 -+}
7838 -+
7839 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
7840 -+ struct bio *bio)
7841 -+{
7842 -+ struct task_struct *tsk = current;
7843 -+ struct bfq_io_cq *bic;
7844 -+ struct bfq_queue *bfqq;
7845 -+
7846 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
7847 -+ if (bic == NULL)
7848 -+ return NULL;
7849 -+
7850 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7851 -+ if (bfqq != NULL) {
7852 -+ sector_t sector = bio->bi_sector + bio_sectors(bio);
7853 -+
7854 -+ return elv_rb_find(&bfqq->sort_list, sector);
7855 -+ }
7856 -+
7857 -+ return NULL;
7858 -+}
7859 -+
7860 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
7861 -+{
7862 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7863 -+
7864 -+ bfqd->rq_in_driver++;
7865 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
7866 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
7867 -+ (long long unsigned)bfqd->last_position);
7868 -+}
7869 -+
7870 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
7871 -+{
7872 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7873 -+
7874 -+ WARN_ON(bfqd->rq_in_driver == 0);
7875 -+ bfqd->rq_in_driver--;
7876 -+}
7877 -+
7878 -+static void bfq_remove_request(struct request *rq)
7879 -+{
7880 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7881 -+ struct bfq_data *bfqd = bfqq->bfqd;
7882 -+
7883 -+ if (bfqq->next_rq == rq) {
7884 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
7885 -+ bfq_updated_next_req(bfqd, bfqq);
7886 -+ }
7887 -+
7888 -+ list_del_init(&rq->queuelist);
7889 -+ bfq_del_rq_rb(rq);
7890 -+
7891 -+ if (rq->cmd_flags & REQ_META) {
7892 -+ WARN_ON(bfqq->meta_pending == 0);
7893 -+ bfqq->meta_pending--;
7894 -+ }
7895 -+}
7896 -+
7897 -+static int bfq_merge(struct request_queue *q, struct request **req,
7898 -+ struct bio *bio)
7899 -+{
7900 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7901 -+ struct request *__rq;
7902 -+
7903 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
7904 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
7905 -+ *req = __rq;
7906 -+ return ELEVATOR_FRONT_MERGE;
7907 -+ }
7908 -+
7909 -+ return ELEVATOR_NO_MERGE;
7910 -+}
7911 -+
7912 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
7913 -+ int type)
7914 -+{
7915 -+ if (type == ELEVATOR_FRONT_MERGE) {
7916 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
7917 -+
7918 -+ bfq_reposition_rq_rb(bfqq, req);
7919 -+ }
7920 -+}
7921 -+
7922 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
7923 -+ struct request *next)
7924 -+{
7925 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7926 -+
7927 -+ /*
7928 -+ * Reposition in fifo if next is older than rq.
7929 -+ */
7930 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
7931 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
7932 -+ list_move(&rq->queuelist, &next->queuelist);
7933 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
7934 -+ }
7935 -+
7936 -+ if (bfqq->next_rq == next)
7937 -+ bfqq->next_rq = rq;
7938 -+
7939 -+ bfq_remove_request(next);
7940 -+}
7941 -+
7942 -+/* Must be called with bfqq != NULL */
7943 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
7944 -+{
7945 -+ BUG_ON(bfqq == NULL);
7946 -+ bfqq->raising_coeff = 1;
7947 -+ bfqq->raising_cur_max_time = 0;
7948 -+ /* Trigger a weight change on the next activation of the queue */
7949 -+ bfqq->entity.ioprio_changed = 1;
7950 -+}
7951 -+
7952 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
7953 -+ struct bfq_group *bfqg)
7954 -+{
7955 -+ int i, j;
7956 -+
7957 -+ for (i = 0; i < 2; i++)
7958 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
7959 -+ if (bfqg->async_bfqq[i][j] != NULL)
7960 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
7961 -+ if (bfqg->async_idle_bfqq != NULL)
7962 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
7963 -+}
7964 -+
7965 -+static void bfq_end_raising(struct bfq_data *bfqd)
7966 -+{
7967 -+ struct bfq_queue *bfqq;
7968 -+
7969 -+ spin_lock_irq(bfqd->queue->queue_lock);
7970 -+
7971 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
7972 -+ bfq_bfqq_end_raising(bfqq);
7973 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
7974 -+ bfq_bfqq_end_raising(bfqq);
7975 -+ bfq_end_raising_async(bfqd);
7976 -+
7977 -+ spin_unlock_irq(bfqd->queue->queue_lock);
7978 -+}
7979 -+
7980 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
7981 -+ struct bio *bio)
7982 -+{
7983 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7984 -+ struct bfq_io_cq *bic;
7985 -+ struct bfq_queue *bfqq;
7986 -+
7987 -+ /*
7988 -+ * Disallow merge of a sync bio into an async request.
7989 -+ */
7990 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
7991 -+ return 0;
7992 -+
7993 -+ /*
7994 -+ * Lookup the bfqq that this bio will be queued with. Allow
7995 -+ * merge only if rq is queued there.
7996 -+ * Queue lock is held here.
7997 -+ */
7998 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
7999 -+ if (bic == NULL)
8000 -+ return 0;
8001 -+
8002 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8003 -+ return bfqq == RQ_BFQQ(rq);
8004 -+}
8005 -+
8006 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
8007 -+ struct bfq_queue *bfqq)
8008 -+{
8009 -+ if (bfqq != NULL) {
8010 -+ bfq_mark_bfqq_must_alloc(bfqq);
8011 -+ bfq_mark_bfqq_budget_new(bfqq);
8012 -+ bfq_clear_bfqq_fifo_expire(bfqq);
8013 -+
8014 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
8015 -+
8016 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
8017 -+ bfqq->entity.budget);
8018 -+ }
8019 -+
8020 -+ bfqd->active_queue = bfqq;
8021 -+}
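The budgets_assigned update above, (budgets_assigned*7 + 256) / 8, is a saturating counter that converges toward 256; bfq_max_budget() and bfq_min_budget() further down only trust the autotuned figures once it reaches 194, i.e. after enough queue activations have been observed. A standalone sketch of how quickly that threshold is reached (illustrative only, not part of the patch):

#include <stdio.h>

/* Same update as in __bfq_set_active_queue() above:
 * x <- (7*x + 256) / 8 converges toward 256; values below 194 are
 * treated as "too few samples" by bfq_max_budget()/bfq_min_budget(). */
int main(void)
{
	int budgets_assigned = 0, activations = 0;

	while (budgets_assigned < 194) {
		budgets_assigned = (budgets_assigned * 7 + 256) / 8;
		activations++;
	}
	printf("194 reached after %d activations (value now %d)\n",
	       activations, budgets_assigned); /* 11 activations with these constants */
	return 0;
}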
8022 -+
8023 -+/*
8024 -+ * Get and set a new active queue for service.
8025 -+ */
8026 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
8027 -+ struct bfq_queue *bfqq)
8028 -+{
8029 -+ if (!bfqq)
8030 -+ bfqq = bfq_get_next_queue(bfqd);
8031 -+ else
8032 -+ bfq_get_next_queue_forced(bfqd, bfqq);
8033 -+
8034 -+ __bfq_set_active_queue(bfqd, bfqq);
8035 -+ return bfqq;
8036 -+}
8037 -+
8038 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
8039 -+ struct request *rq)
8040 -+{
8041 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
8042 -+ return blk_rq_pos(rq) - bfqd->last_position;
8043 -+ else
8044 -+ return bfqd->last_position - blk_rq_pos(rq);
8045 -+}
8046 -+
8047 -+/*
8048 -+ * Return true if bfqq has no request pending and rq is close enough to
8049 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
8050 -+ * bfqq->next_rq
8051 -+ */
8052 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
8053 -+{
8054 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
8055 -+}
8056 -+
8057 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
8058 -+{
8059 -+ struct rb_root *root = &bfqd->rq_pos_tree;
8060 -+ struct rb_node *parent, *node;
8061 -+ struct bfq_queue *__bfqq;
8062 -+ sector_t sector = bfqd->last_position;
8063 -+
8064 -+ if (RB_EMPTY_ROOT(root))
8065 -+ return NULL;
8066 -+
8067 -+ /*
8068 -+ * First, if we find a request starting at the end of the last
8069 -+ * request, choose it.
8070 -+ */
8071 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8072 -+ if (__bfqq != NULL)
8073 -+ return __bfqq;
8074 -+
8075 -+ /*
8076 -+ * If the exact sector wasn't found, the parent of the NULL leaf
8077 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
8078 -+ * position).
8079 -+ */
8080 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8081 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8082 -+ return __bfqq;
8083 -+
8084 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
8085 -+ node = rb_next(&__bfqq->pos_node);
8086 -+ else
8087 -+ node = rb_prev(&__bfqq->pos_node);
8088 -+ if (node == NULL)
8089 -+ return NULL;
8090 -+
8091 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8092 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8093 -+ return __bfqq;
8094 -+
8095 -+ return NULL;
8096 -+}
8097 -+
8098 -+/*
8099 -+ * bfqd - obvious
8100 -+ * cur_bfqq - passed in so that we don't decide that the current queue
8101 -+ * is closely cooperating with itself.
8102 -+ *
8103 -+ * We are assuming that cur_bfqq has dispatched at least one request,
8104 -+ * and that bfqd->last_position reflects a position on the disk associated
8105 -+ * with the I/O issued by cur_bfqq.
8106 -+ */
8107 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
8108 -+ struct bfq_queue *cur_bfqq)
8109 -+{
8110 -+ struct bfq_queue *bfqq;
8111 -+
8112 -+ if (bfq_class_idle(cur_bfqq))
8113 -+ return NULL;
8114 -+ if (!bfq_bfqq_sync(cur_bfqq))
8115 -+ return NULL;
8116 -+ if (BFQQ_SEEKY(cur_bfqq))
8117 -+ return NULL;
8118 -+
8119 -+ /* If device has only one backlogged bfq_queue, don't search. */
8120 -+ if (bfqd->busy_queues == 1)
8121 -+ return NULL;
8122 -+
8123 -+ /*
8124 -+ * We should notice if some of the queues are cooperating, e.g.
8125 -+ * working closely on the same area of the disk. In that case,
8126 -+ * we can group them together and avoid wasting time idling.
8127 -+ */
8128 -+ bfqq = bfqq_close(bfqd);
8129 -+ if (bfqq == NULL || bfqq == cur_bfqq)
8130 -+ return NULL;
8131 -+
8132 -+ /*
8133 -+ * Do not merge queues from different bfq_groups.
8134 -+ */
8135 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
8136 -+ return NULL;
8137 -+
8138 -+ /*
8139 -+ * It only makes sense to merge sync queues.
8140 -+ */
8141 -+ if (!bfq_bfqq_sync(bfqq))
8142 -+ return NULL;
8143 -+ if (BFQQ_SEEKY(bfqq))
8144 -+ return NULL;
8145 -+
8146 -+ /*
8147 -+ * Do not merge queues of different priority classes.
8148 -+ */
8149 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
8150 -+ return NULL;
8151 -+
8152 -+ return bfqq;
8153 -+}
8154 -+
8155 -+/*
8156 -+ * If enough samples have been computed, return the current max budget
8157 -+ * stored in bfqd, which is dynamically updated according to the
8158 -+ * estimated disk peak rate; otherwise return the default max budget
8159 -+ */
8160 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
8161 -+{
8162 -+ if (bfqd->budgets_assigned < 194)
8163 -+ return bfq_default_max_budget;
8164 -+ else
8165 -+ return bfqd->bfq_max_budget;
8166 -+}
8167 -+
8168 -+/*
8169 -+ * Return min budget, which is a fraction of the current or default
8170 -+ * max budget (trying with 1/32)
8171 -+ */
8172 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
8173 -+{
8174 -+ if (bfqd->budgets_assigned < 194)
8175 -+ return bfq_default_max_budget / 32;
8176 -+ else
8177 -+ return bfqd->bfq_max_budget / 32;
8178 -+}
8179 -+
8180 -+/*
8181 -+ * Decides whether idling should be done for given device and
8182 -+ * given active queue.
8183 -+ */
8184 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
8185 -+ struct bfq_queue *active_bfqq)
8186 -+{
8187 -+ if (active_bfqq == NULL)
8188 -+ return false;
8189 -+ /*
8190 -+ * If device is SSD it has no seek penalty, disable idling; but
8191 -+ * do so only if:
8192 -+ * - the device supports queueing (NCQ); otherwise we would still have
8193 -+ * a problem with sync vs async workloads;
8194 -+ * - the queue is not weight-raised, to preserve guarantees.
8195 -+ */
8196 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
8197 -+ active_bfqq->raising_coeff == 1);
8198 -+}
8199 -+
8200 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
8201 -+{
8202 -+ struct bfq_queue *bfqq = bfqd->active_queue;
8203 -+ struct bfq_io_cq *bic;
8204 -+ unsigned long sl;
8205 -+
8206 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
8207 -+
8208 -+ /* Tasks have exited, don't wait. */
8209 -+ bic = bfqd->active_bic;
8210 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
8211 -+ return;
8212 -+
8213 -+ bfq_mark_bfqq_wait_request(bfqq);
8214 -+
8215 -+ /*
8216 -+ * We don't want to idle for seeks, but we do want to allow
8217 -+ * fair distribution of slice time for a process doing back-to-back
8218 -+ * seeks. So allow a little bit of time for it to submit a new rq.
8219 -+ *
8220 -+ * To prevent processes with (partly) seeky workloads from
8221 -+ * being too ill-treated, grant them a small fraction of the
8222 -+ * assigned budget before reducing the waiting time to
8223 -+ * BFQ_MIN_TT. This happened to help reduce latency.
8224 -+ */
8225 -+ sl = bfqd->bfq_slice_idle;
8226 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
8227 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
8228 -+ bfqq->raising_coeff == 1)
8229 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
8230 -+ else if (bfqq->raising_coeff > 1)
8231 -+ sl = sl * 3;
8232 -+ bfqd->last_idling_start = ktime_get();
8233 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
8234 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
8235 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
8236 -+}
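The idle-slice length armed above follows a small rule: start from bfq_slice_idle, cut it to BFQ_MIN_TT for a seeky, non-raised queue that has already consumed a noticeable share of its budget, and triple it for a weight-raised queue. A minimal user-space restatement of that decision, with the parameters standing in for the fields and macros tested above (a sketch, not the in-kernel code):

/* Sketch of the slice choice in bfq_arm_slice_timer(); the arguments
 * stand in for bfqd->bfq_slice_idle, BFQ_MIN_TT and the per-queue
 * state checked above. */
static unsigned long pick_idle_slice(unsigned long slice_idle,
				     unsigned long min_tt,
				     int seeky, int consumed_enough_budget,
				     int raising_coeff)
{
	unsigned long sl = slice_idle;

	if (seeky && consumed_enough_budget && raising_coeff == 1)
		sl = sl < min_tt ? sl : min_tt;	/* min(sl, BFQ_MIN_TT) */
	else if (raising_coeff > 1)
		sl *= 3;			/* wait longer for raised queues */
	return sl;
}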
8237 -+
8238 -+/*
8239 -+ * Set the maximum time for the active queue to consume its
8240 -+ * budget. This prevents seeky processes from lowering the disk
8241 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
8242 -+ */
8243 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
8244 -+{
8245 -+ struct bfq_queue *bfqq = bfqd->active_queue;
8246 -+ unsigned int timeout_coeff;
8247 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
8248 -+ timeout_coeff = 1;
8249 -+ else
8250 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
8251 -+
8252 -+ bfqd->last_budget_start = ktime_get();
8253 -+
8254 -+ bfq_clear_bfqq_budget_new(bfqq);
8255 -+ bfqq->budget_timeout = jiffies +
8256 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
8257 -+
8258 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
8259 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
8260 -+ timeout_coeff));
8261 -+}
8262 -+
8263 -+/*
8264 -+ * Move request from internal lists to the request queue dispatch list.
8265 -+ */
8266 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
8267 -+{
8268 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
8269 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8270 -+
8271 -+ bfq_remove_request(rq);
8272 -+ bfqq->dispatched++;
8273 -+ elv_dispatch_sort(q, rq);
8274 -+
8275 -+ if (bfq_bfqq_sync(bfqq))
8276 -+ bfqd->sync_flight++;
8277 -+}
8278 -+
8279 -+/*
8280 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
8281 -+ */
8282 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
8283 -+{
8284 -+ struct request *rq = NULL;
8285 -+
8286 -+ if (bfq_bfqq_fifo_expire(bfqq))
8287 -+ return NULL;
8288 -+
8289 -+ bfq_mark_bfqq_fifo_expire(bfqq);
8290 -+
8291 -+ if (list_empty(&bfqq->fifo))
8292 -+ return NULL;
8293 -+
8294 -+ rq = rq_entry_fifo(bfqq->fifo.next);
8295 -+
8296 -+ if (time_before(jiffies, rq_fifo_time(rq)))
8297 -+ return NULL;
8298 -+
8299 -+ return rq;
8300 -+}
8301 -+
8302 -+/*
8303 -+ * Must be called with the queue_lock held.
8304 -+ */
8305 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
8306 -+{
8307 -+ int process_refs, io_refs;
8308 -+
8309 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
8310 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
8311 -+ BUG_ON(process_refs < 0);
8312 -+ return process_refs;
8313 -+}
8314 -+
8315 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
8316 -+{
8317 -+ int process_refs, new_process_refs;
8318 -+ struct bfq_queue *__bfqq;
8319 -+
8320 -+ /*
8321 -+ * If there are no process references on the new_bfqq, then it is
8322 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
8323 -+ * may have dropped their last reference (not just their last process
8324 -+ * reference).
8325 -+ */
8326 -+ if (!bfqq_process_refs(new_bfqq))
8327 -+ return;
8328 -+
8329 -+ /* Avoid a circular list and skip interim queue merges. */
8330 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
8331 -+ if (__bfqq == bfqq)
8332 -+ return;
8333 -+ new_bfqq = __bfqq;
8334 -+ }
8335 -+
8336 -+ process_refs = bfqq_process_refs(bfqq);
8337 -+ new_process_refs = bfqq_process_refs(new_bfqq);
8338 -+ /*
8339 -+ * If the process for the bfqq has gone away, there is no
8340 -+ * sense in merging the queues.
8341 -+ */
8342 -+ if (process_refs == 0 || new_process_refs == 0)
8343 -+ return;
8344 -+
8345 -+ /*
8346 -+ * Merge in the direction of the lesser amount of work.
8347 -+ */
8348 -+ if (new_process_refs >= process_refs) {
8349 -+ bfqq->new_bfqq = new_bfqq;
8350 -+ atomic_add(process_refs, &new_bfqq->ref);
8351 -+ } else {
8352 -+ new_bfqq->new_bfqq = bfqq;
8353 -+ atomic_add(new_process_refs, &bfqq->ref);
8354 -+ }
8355 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
8356 -+ new_bfqq->pid);
8357 -+}
8358 -+
8359 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
8360 -+{
8361 -+ struct bfq_entity *entity = &bfqq->entity;
8362 -+ return entity->budget - entity->service;
8363 -+}
8364 -+
8365 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8366 -+{
8367 -+ BUG_ON(bfqq != bfqd->active_queue);
8368 -+
8369 -+ __bfq_bfqd_reset_active(bfqd);
8370 -+
8371 -+ /*
8372 -+ * If this bfqq is shared between multiple processes, check
8373 -+ * to make sure that those processes are still issuing I/Os
8374 -+ * within the mean seek distance. If not, it may be time to
8375 -+ * break the queues apart again.
8376 -+ */
8377 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
8378 -+ bfq_mark_bfqq_split_coop(bfqq);
8379 -+
8380 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8381 -+ /*
8382 -+ * overloading budget_timeout field to store when
8383 -+ * the queue remains with no backlog, used by
8384 -+ * the weight-raising mechanism
8385 -+ */
8386 -+ bfqq->budget_timeout = jiffies;
8387 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8388 -+ } else {
8389 -+ bfq_activate_bfqq(bfqd, bfqq);
8390 -+ /*
8391 -+ * Resort priority tree of potential close cooperators.
8392 -+ */
8393 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
8394 -+ }
8395 -+}
8396 -+
8397 -+/**
8398 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
8399 -+ * @bfqd: device data.
8400 -+ * @bfqq: queue to update.
8401 -+ * @reason: reason for expiration.
8402 -+ *
8403 -+ * Handle the feedback on @bfqq budget. See the body for detailed
8404 -+ * comments.
8405 -+ */
8406 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
8407 -+ struct bfq_queue *bfqq,
8408 -+ enum bfqq_expiration reason)
8409 -+{
8410 -+ struct request *next_rq;
8411 -+ unsigned long budget, min_budget;
8412 -+
8413 -+ budget = bfqq->max_budget;
8414 -+ min_budget = bfq_min_budget(bfqd);
8415 -+
8416 -+ BUG_ON(bfqq != bfqd->active_queue);
8417 -+
8418 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
8419 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
8420 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
8421 -+ budget, bfq_min_budget(bfqd));
8422 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
8423 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
8424 -+
8425 -+ if (bfq_bfqq_sync(bfqq)) {
8426 -+ switch (reason) {
8427 -+ /*
8428 -+ * Caveat: in all the following cases we trade latency
8429 -+ * for throughput.
8430 -+ */
8431 -+ case BFQ_BFQQ_TOO_IDLE:
8432 -+ /*
8433 -+ * This is the only case where we may reduce
8434 -+ * the budget: if there is no request of the
8435 -+ * process still waiting for completion, then
8436 -+ * we assume (tentatively) that the timer has
8437 -+ * expired because the batch of requests of
8438 -+ * the process could have been served with a
8439 -+ * smaller budget. Hence, betting that
8440 -+ * the process will behave in the same way when it
8441 -+ * becomes backlogged again, we reduce its
8442 -+ * next budget. As long as we guess right,
8443 -+ * this budget cut reduces the latency
8444 -+ * experienced by the process.
8445 -+ *
8446 -+ * However, if there are still outstanding
8447 -+ * requests, then the process may have not yet
8448 -+ * issued its next request just because it is
8449 -+ * still waiting for the completion of some of
8450 -+ * the still outstanding ones. So in this
8451 -+ * subcase we do not reduce its budget, on the
8452 -+ * contrary we increase it to possibly boost
8453 -+ * the throughput, as discussed in the
8454 -+ * comments to the BUDGET_TIMEOUT case.
8455 -+ */
8456 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
8457 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
8458 -+ else {
8459 -+ if (budget > 5 * min_budget)
8460 -+ budget -= 4 * min_budget;
8461 -+ else
8462 -+ budget = min_budget;
8463 -+ }
8464 -+ break;
8465 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
8466 -+ /*
8467 -+ * We double the budget here because: 1) it
8468 -+ * gives the chance to boost the throughput if
8469 -+ * this is not a seeky process (which may have
8470 -+ * bumped into this timeout because of, e.g.,
8471 -+ * ZBR), 2) together with charge_full_budget
8472 -+ * it helps give seeky processes higher
8473 -+ * timestamps, and hence be served less
8474 -+ * frequently.
8475 -+ */
8476 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
8477 -+ break;
8478 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
8479 -+ /*
8480 -+ * The process still has backlog, and did not
8481 -+ * let either the budget timeout or the disk
8482 -+ * idling timeout expire. Hence it is not
8483 -+ * seeky, has a short thinktime and may be
8484 -+ * happy with a higher budget too. So
8485 -+ * definitely increase the budget of this good
8486 -+ * candidate to boost the disk throughput.
8487 -+ */
8488 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
8489 -+ break;
8490 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
8491 -+ /*
8492 -+ * Leave the budget unchanged.
8493 -+ */
8494 -+ default:
8495 -+ return;
8496 -+ }
8497 -+ } else /* async queue */
8498 -+ /* async queues always get the maximum possible budget
8499 -+ * (their ability to dispatch is limited by
8500 -+ * @bfqd->bfq_max_budget_async_rq).
8501 -+ */
8502 -+ budget = bfqd->bfq_max_budget;
8503 -+
8504 -+ bfqq->max_budget = budget;
8505 -+
8506 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
8507 -+ bfqq->max_budget > bfqd->bfq_max_budget)
8508 -+ bfqq->max_budget = bfqd->bfq_max_budget;
8509 -+
8510 -+ /*
8511 -+ * Make sure that we have enough budget for the next request.
8512 -+ * Since the finish time of the bfqq must be kept in sync with
8513 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
8514 -+ * update.
8515 -+ */
8516 -+ next_rq = bfqq->next_rq;
8517 -+ if (next_rq != NULL)
8518 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
8519 -+ bfq_serv_to_charge(next_rq, bfqq));
8520 -+ else
8521 -+ bfqq->entity.budget = bfqq->max_budget;
8522 -+
8523 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
8524 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
8525 -+ bfqq->entity.budget);
8526 -+}
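For sync queues, the long comments above boil down to a handful of multiplicative rules keyed on the expiration reason. A condensed restatement in user-space C (a sketch that mirrors, but is not, the code above; the enum names are shortened for brevity):

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

/* Budget feedback for sync queues, as implemented in
 * __bfq_bfqq_recalc_budget() above. */
static unsigned long next_budget(unsigned long budget, unsigned long max_budget,
				 unsigned long min_budget, enum reason r,
				 int still_dispatched)
{
	switch (r) {
	case TOO_IDLE:
		if (still_dispatched)	/* outstanding requests: be generous */
			return budget * 2 > max_budget ? max_budget : budget * 2;
		/* otherwise shrink, betting the process needs less */
		return budget > 5 * min_budget ? budget - 4 * min_budget : min_budget;
	case BUDGET_TIMEOUT:
		return budget * 2 > max_budget ? max_budget : budget * 2;
	case BUDGET_EXHAUSTED:
		return budget * 4 > max_budget ? max_budget : budget * 4;
	case NO_MORE_REQUESTS:
	default:
		return budget;		/* leave it unchanged */
	}
}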
8527 -+
8528 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
8529 -+{
8530 -+ unsigned long max_budget;
8531 -+
8532 -+ /*
8533 -+ * The max_budget calculated when autotuning is equal to the
8534 -+ * number of sectors transferred in timeout_sync at the
8535 -+ * estimated peak rate.
8536 -+ */
8537 -+ max_budget = (unsigned long)(peak_rate * 1000 *
8538 -+ timeout >> BFQ_RATE_SHIFT);
8539 -+
8540 -+ return max_budget;
8541 -+}
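The autotuned budget above is plain fixed-point arithmetic: peak_rate is kept in sectors per microsecond scaled left by BFQ_RATE_SHIFT, timeout is in milliseconds, so multiplying by 1000 and shifting right again yields sectors. A worked example; BFQ_RATE_SHIFT is defined elsewhere in the patch, and the value 16 below, like the sample rate and timeout, is assumed purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define RATE_SHIFT 16	/* assumed stand-in for BFQ_RATE_SHIFT */

int main(void)
{
	/* Suppose the estimated peak rate is 0.2 sectors/usec (roughly
	 * 100 MB/s with 512-byte sectors) and the sync budget timeout
	 * is 125 ms. */
	uint64_t peak_rate = (uint64_t)(0.2 * (1 << RATE_SHIFT));
	uint64_t timeout_ms = 125;
	uint64_t max_budget = peak_rate * 1000 * timeout_ms >> RATE_SHIFT;

	printf("autotuned max budget: %llu sectors\n",
	       (unsigned long long)max_budget);	/* ~25000 sectors, ~12 MB */
	return 0;
}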
8542 -+
8543 -+/*
8544 -+ * In addition to updating the peak rate, checks whether the process
8545 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
8546 -+ * to the budget timeout, to reduce the amount of service provided to
8547 -+ * seeky processes, and hence reduce their chances to lower the
8548 -+ * throughput. See the code for more details.
8549 -+ */
8550 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8551 -+ int compensate, enum bfqq_expiration reason)
8552 -+{
8553 -+ u64 bw, usecs, expected, timeout;
8554 -+ ktime_t delta;
8555 -+ int update = 0;
8556 -+
8557 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
8558 -+ return 0;
8559 -+
8560 -+ if (compensate)
8561 -+ delta = bfqd->last_idling_start;
8562 -+ else
8563 -+ delta = ktime_get();
8564 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
8565 -+ usecs = ktime_to_us(delta);
8566 -+
8567 -+ /* Don't trust short/unrealistic values. */
8568 -+ if (usecs < 100 || usecs >= LONG_MAX)
8569 -+ return 0;
8570 -+
8571 -+ /*
8572 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
8573 -+ * value to store the peak rate, in sectors per usec in fixed
8574 -+ * point math. We do so to have enough precision in the estimate
8575 -+ * and to avoid overflows.
8576 -+ */
8577 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
8578 -+ do_div(bw, (unsigned long)usecs);
8579 -+
8580 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
8581 -+
8582 -+ /*
8583 -+ * Use only long (> 20ms) intervals to filter out spikes for
8584 -+ * the peak rate estimation.
8585 -+ */
8586 -+ if (usecs > 20000) {
8587 -+ if (bw > bfqd->peak_rate ||
8588 -+ (!BFQQ_SEEKY(bfqq) &&
8589 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
8590 -+ bfq_log(bfqd, "measured bw =%llu", bw);
8591 -+ /*
8592 -+ * To smooth oscillations use a low-pass filter with
8593 -+ * alpha=7/8, i.e.,
8594 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
8595 -+ */
8596 -+ do_div(bw, 8);
8597 -+ if (bw == 0)
8598 -+ return 0;
8599 -+ bfqd->peak_rate *= 7;
8600 -+ do_div(bfqd->peak_rate, 8);
8601 -+ bfqd->peak_rate += bw;
8602 -+ update = 1;
8603 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
8604 -+ }
8605 -+
8606 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
8607 -+
8608 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
8609 -+ bfqd->peak_rate_samples++;
8610 -+
8611 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
8612 -+ update && bfqd->bfq_user_max_budget == 0) {
8613 -+ bfqd->bfq_max_budget =
8614 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
8615 -+ bfq_log(bfqd, "new max_budget=%lu",
8616 -+ bfqd->bfq_max_budget);
8617 -+ }
8618 -+ }
8619 -+
8620 -+ /*
8621 -+ * If the process has been served for a too short time
8622 -+ * interval to let its possible sequential accesses prevail over
8623 -+ * the initial seek time needed to move the disk head on the
8624 -+ * first sector it requested, then give the process a chance
8625 -+ * and for the moment return false.
8626 -+ */
8627 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
8628 -+ return 0;
8629 -+
8630 -+ /*
8631 -+ * A process is considered ``slow'' (i.e., seeky, so that we
8632 -+ * cannot treat it fairly in the service domain, as it would
8633 -+ * slow down too much the other processes) if, when a slice
8634 -+ * ends for whatever reason, it has received service at a
8635 -+ * rate that would not be high enough to complete the budget
8636 -+ * before the budget timeout expiration.
8637 -+ */
8638 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
8639 -+
8640 -+ /*
8641 -+ * Caveat: processes doing IO in the slower disk zones will
8642 -+ * tend to be slow(er) even if not seeky. And the estimated
8643 -+ * peak rate will actually be an average over the disk
8644 -+ * surface. Hence, to not be too harsh with unlucky processes,
8645 -+ * we keep a budget/3 margin of safety before declaring a
8646 -+ * process slow.
8647 -+ */
8648 -+ return expected > (4 * bfqq->entity.budget) / 3;
8649 -+}
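Two pieces of arithmetic above are easy to lose in the surrounding control flow: the alpha = 7/8 low-pass filter applied to the peak-rate estimate, and the "slow process" test that compares the transfer expected at the estimated rate within the budget timeout against the assigned budget, with a 1/3 safety margin. A compact sketch of both, using plain 64-bit integers instead of the kernel's do_div() helper:

#include <stdint.h>

/* Low-pass filter used for the peak-rate estimate above:
 * new_rate = (7/8) * old_rate + (1/8) * measured_bw */
static uint64_t filter_peak_rate(uint64_t old_rate, uint64_t measured_bw)
{
	return old_rate * 7 / 8 + measured_bw / 8;
}

/* "Slow process" test: the queue is deemed slow if, at the measured
 * bandwidth, it could not have consumed its budget within the timeout,
 * keeping a budget/3 margin for bad luck (slower disk zones, etc.). */
static int process_is_slow(uint64_t bw_fixed_point, uint64_t timeout_ms,
			   uint64_t budget, unsigned int rate_shift)
{
	uint64_t expected = bw_fixed_point * 1000 * timeout_ms >> rate_shift;

	return expected > 4 * budget / 3;
}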
8650 -+
8651 -+/**
8652 -+ * bfq_bfqq_expire - expire a queue.
8653 -+ * @bfqd: device owning the queue.
8654 -+ * @bfqq: the queue to expire.
8655 -+ * @compensate: if true, compensate for the time spent idling.
8656 -+ * @reason: the reason causing the expiration.
8657 -+ *
8658 -+ *
8659 -+ * If the process associated to the queue is slow (i.e., seeky), or in
8660 -+ * case of budget timeout, or, finally, if it is async, we
8661 -+ * artificially charge it an entire budget (independently of the
8662 -+ * actual service it received). As a consequence, the queue will get
8663 -+ * higher timestamps than the correct ones upon reactivation, and
8664 -+ * hence it will be rescheduled as if it had received more service
8665 -+ * than what it actually received. In the end, this class of processes
8666 -+ * will receive less service in proportion to how slowly they consume
8667 -+ * their budgets (and hence how seriously they tend to lower the
8668 -+ * throughput).
8669 -+ *
8670 -+ * In contrast, when a queue expires because it has been idling for
8671 -+ * too long or because it exhausted its budget, we do not touch the
8672 -+ * amount of service it has received. Hence when the queue will be
8673 -+ * reactivated and its timestamps updated, the latter will be in sync
8674 -+ * with the actual service received by the queue until expiration.
8675 -+ *
8676 -+ * Charging a full budget to the first type of queues and the exact
8677 -+ * service to the others has the effect of using the WF2Q+ policy to
8678 -+ * schedule the former on a timeslice basis, without violating the
8679 -+ * service domain guarantees of the latter.
8680 -+ */
8681 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
8682 -+ struct bfq_queue *bfqq,
8683 -+ int compensate,
8684 -+ enum bfqq_expiration reason)
8685 -+{
8686 -+ int slow;
8687 -+ BUG_ON(bfqq != bfqd->active_queue);
8688 -+
8689 -+ /* Update disk peak rate for autotuning and check whether the
8690 -+ * process is slow (see bfq_update_peak_rate).
8691 -+ */
8692 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
8693 -+
8694 -+ /*
8695 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
8696 -+ * and async queues, to favor sequential sync workloads.
8697 -+ *
8698 -+ * Processes doing IO in the slower disk zones will tend to be
8699 -+ * slow(er) even if not seeky. Hence, since the estimated peak
8700 -+ * rate is actually an average over the disk surface, these
8701 -+ * processes may timeout just for bad luck. To avoid punishing
8702 -+ * them we do not charge a full budget to a process that
8703 -+ * succeeded in consuming at least 2/3 of its budget.
8704 -+ */
8705 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
8706 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
8707 -+ bfq_bfqq_charge_full_budget(bfqq);
8708 -+
8709 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
8710 -+ bfqq->last_rais_start_finish = jiffies;
8711 -+
8712 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
8713 -+ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
8714 -+ bfqq->soft_rt_next_start =
8715 -+ jiffies +
8716 -+ HZ * bfqq->entity.service /
8717 -+ bfqd->bfq_raising_max_softrt_rate;
8718 -+ else
8719 -+ bfqq->soft_rt_next_start = -1; /* infinity */
8720 -+ }
8721 -+ bfq_log_bfqq(bfqd, bfqq,
8722 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
8723 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
8724 -+
8725 -+ /* Increase, decrease or leave budget unchanged according to reason */
8726 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
8727 -+ __bfq_bfqq_expire(bfqd, bfqq);
8728 -+}
8729 -+
8730 -+/*
8731 -+ * Budget timeout is not implemented through a dedicated timer, but
8732 -+ * just checked on request arrivals and completions, as well as on
8733 -+ * idle timer expirations.
8734 -+ */
8735 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
8736 -+{
8737 -+ if (bfq_bfqq_budget_new(bfqq))
8738 -+ return 0;
8739 -+
8740 -+ if (time_before(jiffies, bfqq->budget_timeout))
8741 -+ return 0;
8742 -+
8743 -+ return 1;
8744 -+}
8745 -+
8746 -+/*
8747 -+ * If we expire a queue that is waiting for the arrival of a new
8748 -+ * request, we may prevent the fictitious timestamp backshifting that
8749 -+ * allows the guarantees of the queue to be preserved (see [1] for
8750 -+ * this tricky aspect). Hence we return true only if this condition
8751 -+ * does not hold, or if the queue is slow enough to deserve only to be
8752 -+ * kicked off for preserving a high throughput.
8753 -+ */
8754 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
8755 -+{
8756 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
8757 -+ "may_budget_timeout: wr %d left %d timeout %d",
8758 -+ bfq_bfqq_wait_request(bfqq),
8759 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
8760 -+ bfq_bfqq_budget_timeout(bfqq));
8761 -+
8762 -+ return (!bfq_bfqq_wait_request(bfqq) ||
8763 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
8764 -+ &&
8765 -+ bfq_bfqq_budget_timeout(bfqq);
8766 -+}
8767 -+
8768 -+/*
8769 -+ * If the active queue is empty, but it is sync and either of the following
8770 -+ * conditions holds, then: 1) the queue must remain active and cannot be
8771 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
8772 -+ * of a new request for the queue. The conditions are:
8773 -+ * - the device is rotational and not performing NCQ, and the queue has its
8774 -+ * idle window set (in this case, waiting for a new request for the queue
8775 -+ * is likely to boost the disk throughput);
8776 -+ * - the queue is weight-raised (waiting for the request is necessary for
8777 -+ * providing the queue with fairness and latency guarantees).
8778 -+ */
8779 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
8780 -+ int budg_timeout)
8781 -+{
8782 -+ struct bfq_data *bfqd = bfqq->bfqd;
8783 -+
8784 -+ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
8785 -+ bfqd->bfq_slice_idle != 0 &&
8786 -+ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
8787 -+ !blk_queue_nonrot(bfqd->queue))
8788 -+ || bfqq->raising_coeff > 1) &&
8789 -+ (bfqd->rq_in_driver == 0 ||
8790 -+ budg_timeout ||
8791 -+ bfqq->raising_coeff > 1) &&
8792 -+ !bfq_close_cooperator(bfqd, bfqq) &&
8793 -+ (!bfq_bfqq_coop(bfqq) ||
8794 -+ !bfq_bfqq_some_coop_idle(bfqq)) &&
8795 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
8796 -+}
8797 -+
8798 -+/*
8799 -+ * Select a queue for service. If we have a current active queue,
8800 -+ * check whether to continue servicing it, or retrieve and set a new one.
8801 -+ */
8802 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
8803 -+{
8804 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
8805 -+ struct request *next_rq;
8806 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
8807 -+ int budg_timeout;
8808 -+
8809 -+ bfqq = bfqd->active_queue;
8810 -+ if (bfqq == NULL)
8811 -+ goto new_queue;
8812 -+
8813 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
8814 -+
8815 -+ /*
8816 -+ * If another queue has a request waiting within our mean seek
8817 -+ * distance, let it run. The expire code will check for close
8818 -+ * cooperators and put the close queue at the front of the
8819 -+ * service tree. If possible, merge the expiring queue with the
8820 -+ * new bfqq.
8821 -+ */
8822 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
8823 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
8824 -+ bfq_setup_merge(bfqq, new_bfqq);
8825 -+
8826 -+ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
8827 -+ if (budg_timeout &&
8828 -+ !bfq_bfqq_must_idle(bfqq, budg_timeout))
8829 -+ goto expire;
8830 -+
8831 -+ next_rq = bfqq->next_rq;
8832 -+ /*
8833 -+ * If bfqq has requests queued and it has enough budget left to
8834 -+ * serve them, keep the queue, otherwise expire it.
8835 -+ */
8836 -+ if (next_rq != NULL) {
8837 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
8838 -+ bfq_bfqq_budget_left(bfqq)) {
8839 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
8840 -+ goto expire;
8841 -+ } else {
8842 -+ /*
8843 -+ * The idle timer may be pending because we may not
8844 -+ * disable disk idling even when a new request arrives
8845 -+ */
8846 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
8847 -+ /*
8848 -+ * If we get here: 1) at least a new request
8849 -+ * has arrived but we have not disabled the
8850 -+ * timer because the request was too small,
8851 -+ * 2) then the block layer has unplugged the
8852 -+ * device, causing the dispatch to be invoked.
8853 -+ *
8854 -+ * Since the device is unplugged, now the
8855 -+ * requests are probably large enough to
8856 -+ * provide a reasonable throughput.
8857 -+ * So we disable idling.
8858 -+ */
8859 -+ bfq_clear_bfqq_wait_request(bfqq);
8860 -+ del_timer(&bfqd->idle_slice_timer);
8861 -+ }
8862 -+ if (new_bfqq == NULL)
8863 -+ goto keep_queue;
8864 -+ else
8865 -+ goto expire;
8866 -+ }
8867 -+ }
8868 -+
8869 -+ /*
8870 -+ * No requests pending. If there is no cooperator, and the active
8871 -+ * queue still has requests in flight or is idling for a new request,
8872 -+ * then keep it.
8873 -+ */
8874 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
8875 -+ (bfqq->dispatched != 0 &&
8876 -+ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
8877 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
8878 -+ bfqq = NULL;
8879 -+ goto keep_queue;
8880 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
8881 -+ /*
8882 -+ * Expiring the queue because there is a close cooperator,
8883 -+ * cancel timer.
8884 -+ */
8885 -+ bfq_clear_bfqq_wait_request(bfqq);
8886 -+ del_timer(&bfqd->idle_slice_timer);
8887 -+ }
8888 -+
8889 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
8890 -+expire:
8891 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
8892 -+new_queue:
8893 -+ bfqq = bfq_set_active_queue(bfqd, new_bfqq);
8894 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
8895 -+ bfqq != NULL ? bfqq->pid : 0);
8896 -+keep_queue:
8897 -+ return bfqq;
8898 -+}
8899 -+
8900 -+static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8901 -+{
8902 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
8903 -+ struct bfq_entity *entity = &bfqq->entity;
8904 -+
8905 -+ bfq_log_bfqq(bfqd, bfqq,
8906 -+ "raising period dur %u/%u msec, "
8907 -+ "old raising coeff %u, w %d(%d)",
8908 -+ jiffies_to_msecs(jiffies -
8909 -+ bfqq->last_rais_start_finish),
8910 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
8911 -+ bfqq->raising_coeff,
8912 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
8913 -+
8914 -+ BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
8915 -+ entity->orig_weight * bfqq->raising_coeff);
8916 -+ if (entity->ioprio_changed)
8917 -+ bfq_log_bfqq(bfqd, bfqq,
8918 -+ "WARN: pending prio change");
8919 -+ /*
8920 -+ * If too much time has elapsed from the beginning
8921 -+ * of this weight-raising period and the process is not soft
8922 -+ * real-time, stop it.
8923 -+ */
8924 -+ if (jiffies - bfqq->last_rais_start_finish >
8925 -+ bfqq->raising_cur_max_time) {
8926 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
8927 -+ bfqq->soft_rt_next_start < jiffies;
8928 -+
8929 -+ bfqq->last_rais_start_finish = jiffies;
8930 -+ if (soft_rt)
8931 -+ bfqq->raising_cur_max_time =
8932 -+ bfqd->bfq_raising_rt_max_time;
8933 -+ else {
8934 -+ bfq_log_bfqq(bfqd, bfqq,
8935 -+ "wrais ending at %llu msec,"
8936 -+ "rais_max_time %u",
8937 -+ bfqq->last_rais_start_finish,
8938 -+ jiffies_to_msecs(bfqq->
8939 -+ raising_cur_max_time));
8940 -+ bfq_bfqq_end_raising(bfqq);
8941 -+ __bfq_entity_update_weight_prio(
8942 -+ bfq_entity_service_tree(entity),
8943 -+ entity);
8944 -+ }
8945 -+ }
8946 -+ }
8947 -+}
8948 -+
8949 -+/*
8950 -+ * Dispatch one request from bfqq, moving it to the request queue
8951 -+ * dispatch list.
8952 -+ */
8953 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
8954 -+ struct bfq_queue *bfqq)
8955 -+{
8956 -+ int dispatched = 0;
8957 -+ struct request *rq;
8958 -+ unsigned long service_to_charge;
8959 -+
8960 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
8961 -+
8962 -+ /* Follow expired path, else get first next available. */
8963 -+ rq = bfq_check_fifo(bfqq);
8964 -+ if (rq == NULL)
8965 -+ rq = bfqq->next_rq;
8966 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
8967 -+
8968 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
8969 -+ /*
8970 -+ * This may happen if the next rq is chosen
8971 -+ * in fifo order instead of sector order.
8972 -+ * The budget is properly dimensioned
8973 -+ * to be always sufficient to serve the next request
8974 -+ * only if it is chosen in sector order. The reason is
8975 -+ * that it would be quite inefficient and of little use
8976 -+ * to always make sure that the budget is large enough
8977 -+ * to serve even the possible next rq in fifo order.
8978 -+ * In fact, requests are seldom served in fifo order.
8979 -+ *
8980 -+ * Expire the queue for budget exhaustion, and
8981 -+ * make sure that the next act_budget is enough
8982 -+ * to serve the next request, even if it comes
8983 -+ * from the fifo expired path.
8984 -+ */
8985 -+ bfqq->next_rq = rq;
8986 -+ /*
8987 -+ * Since this dispatch is failed, make sure that
8988 -+ * a new one will be performed
8989 -+ */
8990 -+ if (!bfqd->rq_in_driver)
8991 -+ bfq_schedule_dispatch(bfqd);
8992 -+ goto expire;
8993 -+ }
8994 -+
8995 -+ /* Finally, insert request into driver dispatch list. */
8996 -+ bfq_bfqq_served(bfqq, service_to_charge);
8997 -+ bfq_dispatch_insert(bfqd->queue, rq);
8998 -+
8999 -+ update_raising_data(bfqd, bfqq);
9000 -+
9001 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
9002 -+ "budg left %lu",
9003 -+ blk_rq_sectors(rq),
9004 -+ (long long unsigned)blk_rq_pos(rq),
9005 -+ bfq_bfqq_budget_left(bfqq));
9006 -+
9007 -+ dispatched++;
9008 -+
9009 -+ if (bfqd->active_bic == NULL) {
9010 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9011 -+ bfqd->active_bic = RQ_BIC(rq);
9012 -+ }
9013 -+
9014 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9015 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9016 -+ bfq_class_idle(bfqq)))
9017 -+ goto expire;
9018 -+
9019 -+ return dispatched;
9020 -+
9021 -+expire:
9022 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9023 -+ return dispatched;
9024 -+}
9025 -+
9026 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9027 -+{
9028 -+ int dispatched = 0;
9029 -+
9030 -+ while (bfqq->next_rq != NULL) {
9031 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9032 -+ dispatched++;
9033 -+ }
9034 -+
9035 -+ BUG_ON(!list_empty(&bfqq->fifo));
9036 -+ return dispatched;
9037 -+}
9038 -+
9039 -+/*
9040 -+ * Drain our current requests. Used for barriers and when switching
9041 -+ * io schedulers on-the-fly.
9042 -+ */
9043 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
9044 -+{
9045 -+ struct bfq_queue *bfqq, *n;
9046 -+ struct bfq_service_tree *st;
9047 -+ int dispatched = 0;
9048 -+
9049 -+ bfqq = bfqd->active_queue;
9050 -+ if (bfqq != NULL)
9051 -+ __bfq_bfqq_expire(bfqd, bfqq);
9052 -+
9053 -+ /*
9054 -+ * Loop through classes, and be careful to leave the scheduler
9055 -+ * in a consistent state, as feedback mechanisms and vtime
9056 -+ * updates cannot be disabled during the process.
9057 -+ */
9058 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9059 -+ st = bfq_entity_service_tree(&bfqq->entity);
9060 -+
9061 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9062 -+ bfqq->max_budget = bfq_max_budget(bfqd);
9063 -+
9064 -+ bfq_forget_idle(st);
9065 -+ }
9066 -+
9067 -+ BUG_ON(bfqd->busy_queues != 0);
9068 -+
9069 -+ return dispatched;
9070 -+}
9071 -+
9072 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
9073 -+{
9074 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9075 -+ struct bfq_queue *bfqq;
9076 -+ int max_dispatch;
9077 -+
9078 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9079 -+ if (bfqd->busy_queues == 0)
9080 -+ return 0;
9081 -+
9082 -+ if (unlikely(force))
9083 -+ return bfq_forced_dispatch(bfqd);
9084 -+
9085 -+ if ((bfqq = bfq_select_queue(bfqd)) == NULL)
9086 -+ return 0;
9087 -+
9088 -+ max_dispatch = bfqd->bfq_quantum;
9089 -+ if (bfq_class_idle(bfqq))
9090 -+ max_dispatch = 1;
9091 -+
9092 -+ if (!bfq_bfqq_sync(bfqq))
9093 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
9094 -+
9095 -+ if (bfqq->dispatched >= max_dispatch) {
9096 -+ if (bfqd->busy_queues > 1)
9097 -+ return 0;
9098 -+ if (bfqq->dispatched >= 4 * max_dispatch)
9099 -+ return 0;
9100 -+ }
9101 -+
9102 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
9103 -+ return 0;
9104 -+
9105 -+ bfq_clear_bfqq_wait_request(bfqq);
9106 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9107 -+
9108 -+ if (!bfq_dispatch_request(bfqd, bfqq))
9109 -+ return 0;
9110 -+
9111 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
9112 -+ "(max_disp %d)", bfqq->pid, max_dispatch);
9113 -+
9114 -+ return 1;
9115 -+}
9116 -+
9117 -+/*
9118 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
9119 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
9120 -+ *
9121 -+ * Queue lock must be held here.
9122 -+ */
9123 -+static void bfq_put_queue(struct bfq_queue *bfqq)
9124 -+{
9125 -+ struct bfq_data *bfqd = bfqq->bfqd;
9126 -+
9127 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
9128 -+
9129 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
9130 -+ atomic_read(&bfqq->ref));
9131 -+ if (!atomic_dec_and_test(&bfqq->ref))
9132 -+ return;
9133 -+
9134 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
9135 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
9136 -+ BUG_ON(bfqq->entity.tree != NULL);
9137 -+ BUG_ON(bfq_bfqq_busy(bfqq));
9138 -+ BUG_ON(bfqd->active_queue == bfqq);
9139 -+
9140 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
9141 -+
9142 -+ kmem_cache_free(bfq_pool, bfqq);
9143 -+}
9144 -+
9145 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
9146 -+{
9147 -+ struct bfq_queue *__bfqq, *next;
9148 -+
9149 -+ /*
9150 -+ * If this queue was scheduled to merge with another queue, be
9151 -+ * sure to drop the reference taken on that queue (and others in
9152 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
9153 -+ */
9154 -+ __bfqq = bfqq->new_bfqq;
9155 -+ while (__bfqq) {
9156 -+ if (__bfqq == bfqq) {
9157 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
9158 -+ break;
9159 -+ }
9160 -+ next = __bfqq->new_bfqq;
9161 -+ bfq_put_queue(__bfqq);
9162 -+ __bfqq = next;
9163 -+ }
9164 -+}
9165 -+
9166 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9167 -+{
9168 -+ if (bfqq == bfqd->active_queue) {
9169 -+ __bfq_bfqq_expire(bfqd, bfqq);
9170 -+ bfq_schedule_dispatch(bfqd);
9171 -+ }
9172 -+
9173 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
9174 -+ atomic_read(&bfqq->ref));
9175 -+
9176 -+ bfq_put_cooperator(bfqq);
9177 -+
9178 -+ bfq_put_queue(bfqq);
9179 -+}
9180 -+
9181 -+static void bfq_init_icq(struct io_cq *icq)
9182 -+{
9183 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
9184 -+
9185 -+ bic->ttime.last_end_request = jiffies;
9186 -+}
9187 -+
9188 -+static void bfq_exit_icq(struct io_cq *icq)
9189 -+{
9190 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
9191 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
9192 -+
9193 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
9194 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
9195 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
9196 -+ }
9197 -+
9198 -+ if (bic->bfqq[BLK_RW_SYNC]) {
9199 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
9200 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
9201 -+ }
9202 -+}
9203 -+
9204 -+/*
9205 -+ * Update the entity prio values; note that the new values will not
9206 -+ * be used until the next (re)activation.
9207 -+ */
9208 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
9209 -+{
9210 -+ struct task_struct *tsk = current;
9211 -+ int ioprio_class;
9212 -+
9213 -+ if (!bfq_bfqq_prio_changed(bfqq))
9214 -+ return;
9215 -+
9216 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9217 -+ switch (ioprio_class) {
9218 -+ default:
9219 -+ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
9220 -+ case IOPRIO_CLASS_NONE:
9221 -+ /*
9222 -+ * No prio set, inherit CPU scheduling settings.
9223 -+ */
9224 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
9225 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
9226 -+ break;
9227 -+ case IOPRIO_CLASS_RT:
9228 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9229 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
9230 -+ break;
9231 -+ case IOPRIO_CLASS_BE:
9232 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9233 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
9234 -+ break;
9235 -+ case IOPRIO_CLASS_IDLE:
9236 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
9237 -+ bfqq->entity.new_ioprio = 7;
9238 -+ bfq_clear_bfqq_idle_window(bfqq);
9239 -+ break;
9240 -+ }
9241 -+
9242 -+ bfqq->entity.ioprio_changed = 1;
9243 -+
9244 -+ /*
9245 -+ * Keep track of original prio settings in case we have to temporarily
9246 -+ * elevate the priority of this queue.
9247 -+ */
9248 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
9249 -+ bfq_clear_bfqq_prio_changed(bfqq);
9250 -+}
9251 -+
9252 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
9253 -+{
9254 -+ struct bfq_data *bfqd;
9255 -+ struct bfq_queue *bfqq, *new_bfqq;
9256 -+ struct bfq_group *bfqg;
9257 -+ unsigned long uninitialized_var(flags);
9258 -+ int ioprio = bic->icq.ioc->ioprio;
9259 -+
9260 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
9261 -+ /*
9262 -+ * This condition may trigger on a newly created bic; be sure to drop the
9263 -+ * lock before returning.
9264 -+ */
9265 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
9266 -+ goto out;
9267 -+
9268 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
9269 -+ if (bfqq != NULL) {
9270 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
9271 -+ sched_data);
9272 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
9273 -+ GFP_ATOMIC);
9274 -+ if (new_bfqq != NULL) {
9275 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
9276 -+ bfq_log_bfqq(bfqd, bfqq,
9277 -+ "changed_ioprio: bfqq %p %d",
9278 -+ bfqq, atomic_read(&bfqq->ref));
9279 -+ bfq_put_queue(bfqq);
9280 -+ }
9281 -+ }
9282 -+
9283 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
9284 -+ if (bfqq != NULL)
9285 -+ bfq_mark_bfqq_prio_changed(bfqq);
9286 -+
9287 -+ bic->ioprio = ioprio;
9288 -+
9289 -+out:
9290 -+ bfq_put_bfqd_unlock(bfqd, &flags);
9291 -+}
9292 -+
9293 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9294 -+ pid_t pid, int is_sync)
9295 -+{
9296 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
9297 -+ INIT_LIST_HEAD(&bfqq->fifo);
9298 -+
9299 -+ atomic_set(&bfqq->ref, 0);
9300 -+ bfqq->bfqd = bfqd;
9301 -+
9302 -+ bfq_mark_bfqq_prio_changed(bfqq);
9303 -+
9304 -+ if (is_sync) {
9305 -+ if (!bfq_class_idle(bfqq))
9306 -+ bfq_mark_bfqq_idle_window(bfqq);
9307 -+ bfq_mark_bfqq_sync(bfqq);
9308 -+ }
9309 -+
9310 -+ /* Tentative initial value to trade off between thr and lat */
9311 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
9312 -+ bfqq->pid = pid;
9313 -+
9314 -+ bfqq->raising_coeff = 1;
9315 -+ bfqq->last_rais_start_finish = 0;
9316 -+ bfqq->soft_rt_next_start = -1;
9317 -+}
9318 -+
9319 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
9320 -+ struct bfq_group *bfqg,
9321 -+ int is_sync,
9322 -+ struct bfq_io_cq *bic,
9323 -+ gfp_t gfp_mask)
9324 -+{
9325 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
9326 -+
9327 -+retry:
9328 -+ /* bic always exists here */
9329 -+ bfqq = bic_to_bfqq(bic, is_sync);
9330 -+
9331 -+ /*
9332 -+ * Always try a new alloc if we fall back to the OOM bfqq
9333 -+ * originally, since it should just be a temporary situation.
9334 -+ */
9335 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9336 -+ bfqq = NULL;
9337 -+ if (new_bfqq != NULL) {
9338 -+ bfqq = new_bfqq;
9339 -+ new_bfqq = NULL;
9340 -+ } else if (gfp_mask & __GFP_WAIT) {
9341 -+ spin_unlock_irq(bfqd->queue->queue_lock);
9342 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
9343 -+ gfp_mask | __GFP_ZERO,
9344 -+ bfqd->queue->node);
9345 -+ spin_lock_irq(bfqd->queue->queue_lock);
9346 -+ if (new_bfqq != NULL)
9347 -+ goto retry;
9348 -+ } else {
9349 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
9350 -+ gfp_mask | __GFP_ZERO,
9351 -+ bfqd->queue->node);
9352 -+ }
9353 -+
9354 -+ if (bfqq != NULL) {
9355 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
9356 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
9357 -+ } else {
9358 -+ bfqq = &bfqd->oom_bfqq;
9359 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
9360 -+ }
9361 -+
9362 -+ bfq_init_prio_data(bfqq, bic);
9363 -+ bfq_init_entity(&bfqq->entity, bfqg);
9364 -+ }
9365 -+
9366 -+ if (new_bfqq != NULL)
9367 -+ kmem_cache_free(bfq_pool, new_bfqq);
9368 -+
9369 -+ return bfqq;
9370 -+}
9371 -+
9372 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
9373 -+ struct bfq_group *bfqg,
9374 -+ int ioprio_class, int ioprio)
9375 -+{
9376 -+ switch (ioprio_class) {
9377 -+ case IOPRIO_CLASS_RT:
9378 -+ return &bfqg->async_bfqq[0][ioprio];
9379 -+ case IOPRIO_CLASS_NONE:
9380 -+ ioprio = IOPRIO_NORM;
9381 -+ /* fall through */
9382 -+ case IOPRIO_CLASS_BE:
9383 -+ return &bfqg->async_bfqq[1][ioprio];
9384 -+ case IOPRIO_CLASS_IDLE:
9385 -+ return &bfqg->async_idle_bfqq;
9386 -+ default:
9387 -+ BUG();
9388 -+ }
9389 -+}
9390 -+
9391 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
9392 -+ struct bfq_group *bfqg, int is_sync,
9393 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
9394 -+{
9395 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9396 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9397 -+ struct bfq_queue **async_bfqq = NULL;
9398 -+ struct bfq_queue *bfqq = NULL;
9399 -+
9400 -+ if (!is_sync) {
9401 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
9402 -+ ioprio);
9403 -+ bfqq = *async_bfqq;
9404 -+ }
9405 -+
9406 -+ if (bfqq == NULL)
9407 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9408 -+
9409 -+ /*
9410 -+ * Pin the queue now that it's allocated; scheduler exit will prune it.
9411 -+ */
9412 -+ if (!is_sync && *async_bfqq == NULL) {
9413 -+ atomic_inc(&bfqq->ref);
9414 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
9415 -+ bfqq, atomic_read(&bfqq->ref));
9416 -+ *async_bfqq = bfqq;
9417 -+ }
9418 -+
9419 -+ atomic_inc(&bfqq->ref);
9420 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
9421 -+ atomic_read(&bfqq->ref));
9422 -+ return bfqq;
9423 -+}
9424 -+
9425 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
9426 -+ struct bfq_io_cq *bic)
9427 -+{
9428 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
9429 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
9430 -+
9431 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
9432 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
9433 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
9434 -+}
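The think-time statistics above use the same fixed-point idiom as the other BFQ averages: the sample count and the total are kept scaled by 256 and each observation is folded in with weight 1/8, so the mean only becomes meaningful after a few samples (which is why callers gate on bfq_sample_valid()). A standalone trace of the update, for reference only:

#include <stdio.h>

struct ttime_sketch {
	unsigned long samples;	/* scaled counter, converges toward 256 */
	unsigned long total;	/* exponentially weighted total, scaled by 256 */
	unsigned long mean;
};

/* Same arithmetic as bfq_update_io_thinktime(), outside the kernel. */
static void update_thinktime(struct ttime_sketch *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * ttime) / 8;
	t->mean    = (t->total + 128) / t->samples;	/* +128 rounds at the 256 scale */
}

int main(void)
{
	struct ttime_sketch t = { 0, 0, 0 };
	unsigned long observed[] = { 4, 4, 12, 4, 4 };	/* think times, in jiffies */

	/* Early means are biased until the scaled sample count warms up. */
	for (unsigned int i = 0; i < sizeof(observed) / sizeof(observed[0]); i++) {
		update_thinktime(&t, observed[i]);
		printf("ttime=%lu -> mean=%lu\n", observed[i], t.mean);
	}
	return 0;
}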
9435 -+
9436 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
9437 -+ struct bfq_queue *bfqq,
9438 -+ struct request *rq)
9439 -+{
9440 -+ sector_t sdist;
9441 -+ u64 total;
9442 -+
9443 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
9444 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
9445 -+ else
9446 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
9447 -+
9448 -+ /*
9449 -+ * Don't allow the seek distance to get too large from the
9450 -+ * odd fragment, pagein, etc.
9451 -+ */
9452 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
9453 -+ sdist = 0;
9454 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
9455 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
9456 -+ else
9457 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
9458 -+
9459 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
9460 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
9461 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
9462 -+ do_div(total, bfqq->seek_samples);
9463 -+ if (bfq_bfqq_coop(bfqq)) {
9464 -+ /*
9465 -+ * If the mean seektime increases for a (non-seeky) shared
9466 -+ * queue, some cooperator is likely to be idling too much.
9467 -+ * On the contrary, if it decreases, some cooperator has
9468 -+ * probably woken up.
9469 -+ *
9470 -+ */
9471 -+ if ((sector_t)total < bfqq->seek_mean)
9472 -+ bfq_mark_bfqq_some_coop_idle(bfqq);
9473 -+ else if ((sector_t)total > bfqq->seek_mean)
9474 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
9475 -+ }
9476 -+ bfqq->seek_mean = (sector_t)total;
9477 -+
9478 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
9479 -+ (u64)bfqq->seek_mean);
9480 -+}
9481 -+
9482 -+/*
9483 -+ * Disable idle window if the process thinks too long or seeks so much that
9484 -+ * it doesn't matter.
9485 -+ */
9486 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
9487 -+ struct bfq_queue *bfqq,
9488 -+ struct bfq_io_cq *bic)
9489 -+{
9490 -+ int enable_idle;
9491 -+
9492 -+ /* Don't idle for async or idle io prio class. */
9493 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
9494 -+ return;
9495 -+
9496 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
9497 -+
9498 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
9499 -+ bfqd->bfq_slice_idle == 0 ||
9500 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
9501 -+ bfqq->raising_coeff == 1))
9502 -+ enable_idle = 0;
9503 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
9504 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
9505 -+ bfqq->raising_coeff == 1)
9506 -+ enable_idle = 0;
9507 -+ else
9508 -+ enable_idle = 1;
9509 -+ }
9510 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
9511 -+ enable_idle);
9512 -+
9513 -+ if (enable_idle)
9514 -+ bfq_mark_bfqq_idle_window(bfqq);
9515 -+ else
9516 -+ bfq_clear_bfqq_idle_window(bfqq);
9517 -+}
9518 -+
9519 -+/*
9520 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
9521 -+ * something we should do about it.
9522 -+ */
9523 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9524 -+ struct request *rq)
9525 -+{
9526 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
9527 -+
9528 -+ if (rq->cmd_flags & REQ_META)
9529 -+ bfqq->meta_pending++;
9530 -+
9531 -+ bfq_update_io_thinktime(bfqd, bic);
9532 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
9533 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
9534 -+ !BFQQ_SEEKY(bfqq))
9535 -+ bfq_update_idle_window(bfqd, bfqq, bic);
9536 -+
9537 -+ bfq_log_bfqq(bfqd, bfqq,
9538 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
9539 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
9540 -+ (long long unsigned)bfqq->seek_mean);
9541 -+
9542 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
9543 -+
9544 -+ if (bfqq == bfqd->active_queue) {
9545 -+ /*
9546 -+ * If there is just this request queued and the request
9547 -+ * is small, just exit.
9548 -+ * In this way, if the disk is being idled to wait for a new
9549 -+ * request from the active queue, we avoid unplugging the
9550 -+ * device now.
9551 -+ *
9552 -+ * By doing so, we avoid committing the disk to serving
9553 -+ * just a small request. On the contrary, we wait for
9554 -+ * the block layer to decide when to unplug the device:
9555 -+ * hopefully, new requests will be merged to this
9556 -+ * one quickly, then the device will be unplugged
9557 -+ * and larger requests will be dispatched.
9558 -+ */
9559 -+ if (bfqq->queued[rq_is_sync(rq)] == 1 &&
9560 -+ blk_rq_sectors(rq) < 32) {
9561 -+ return;
9562 -+ }
9563 -+ if (bfq_bfqq_wait_request(bfqq)) {
9564 -+ /*
9565 -+ * If we are waiting for a request for this queue, let
9566 -+ * it rip immediately and flag that we must not expire
9567 -+ * this queue just now.
9568 -+ */
9569 -+ bfq_clear_bfqq_wait_request(bfqq);
9570 -+ del_timer(&bfqd->idle_slice_timer);
9571 -+ /*
9572 -+ * Here we can safely expire the queue, in
9573 -+ * case of budget timeout, without wasting
9574 -+ * guarantees
9575 -+ */
9576 -+ if (bfq_bfqq_budget_timeout(bfqq))
9577 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
9578 -+ BFQ_BFQQ_BUDGET_TIMEOUT);
9579 -+ __blk_run_queue(bfqd->queue);
9580 -+ }
9581 -+ }
9582 -+}
9583 -+
9584 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
9585 -+{
9586 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9587 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9588 -+
9589 -+ assert_spin_locked(bfqd->queue->queue_lock);
9590 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
9591 -+
9592 -+ bfq_add_rq_rb(rq);
9593 -+
9594 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
9595 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
9596 -+
9597 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
9598 -+}
9599 -+
9600 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
9601 -+{
9602 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
9603 -+ bfqd->rq_in_driver);
9604 -+
9605 -+ if (bfqd->hw_tag == 1)
9606 -+ return;
9607 -+
9608 -+ /*
9609 -+ * This sample is valid if the number of outstanding requests
9610 -+ * is large enough to allow a queueing behavior. Note that the
9611 -+ * sum is not exact, as it's not taking into account deactivated
9612 -+ * requests.
9613 -+ */
9614 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
9615 -+ return;
9616 -+
9617 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
9618 -+ return;
9619 -+
9620 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
9621 -+ bfqd->max_rq_in_driver = 0;
9622 -+ bfqd->hw_tag_samples = 0;
9623 -+}
9624 -+
9625 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
9626 -+{
9627 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9628 -+ struct bfq_data *bfqd = bfqq->bfqd;
9629 -+ const int sync = rq_is_sync(rq);
9630 -+
9631 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
9632 -+ blk_rq_sectors(rq), sync);
9633 -+
9634 -+ bfq_update_hw_tag(bfqd);
9635 -+
9636 -+ WARN_ON(!bfqd->rq_in_driver);
9637 -+ WARN_ON(!bfqq->dispatched);
9638 -+ bfqd->rq_in_driver--;
9639 -+ bfqq->dispatched--;
9640 -+
9641 -+ if (bfq_bfqq_sync(bfqq))
9642 -+ bfqd->sync_flight--;
9643 -+
9644 -+ if (sync)
9645 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
9646 -+
9647 -+ /*
9648 -+ * If this is the active queue, check if it needs to be expired,
9649 -+ * or if we want to idle in case it has no pending requests.
9650 -+ */
9651 -+ if (bfqd->active_queue == bfqq) {
9652 -+ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
9653 -+ if (bfq_bfqq_budget_new(bfqq))
9654 -+ bfq_set_budget_timeout(bfqd);
9655 -+
9656 -+ /* Idling is disabled also for cooperation issues:
9657 -+ * 1) there is a close cooperator for the queue, or
9658 -+ * 2) the queue is shared and some cooperator is likely
9659 -+ * to be idle (in this case, by not arming the idle timer,
9660 -+ * we try to slow down the queue, to prevent the zones
9661 -+ * of the disk accessed by the active cooperators from becoming
9662 -+ * too distant from the zone that will be accessed by the
9663 -+ * currently idle cooperators)
9664 -+ */
9665 -+ if (bfq_bfqq_must_idle(bfqq, budg_timeout))
9666 -+ bfq_arm_slice_timer(bfqd);
9667 -+ else if (budg_timeout)
9668 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9669 -+ }
9670 -+
9671 -+ if (!bfqd->rq_in_driver)
9672 -+ bfq_schedule_dispatch(bfqd);
9673 -+}
9674 -+
9675 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
9676 -+{
9677 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
9678 -+ bfq_clear_bfqq_must_alloc(bfqq);
9679 -+ return ELV_MQUEUE_MUST;
9680 -+ }
9681 -+
9682 -+ return ELV_MQUEUE_MAY;
9683 -+}
9684 -+
9685 -+static int bfq_may_queue(struct request_queue *q, int rw)
9686 -+{
9687 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9688 -+ struct task_struct *tsk = current;
9689 -+ struct bfq_io_cq *bic;
9690 -+ struct bfq_queue *bfqq;
9691 -+
9692 -+ /*
9693 -+ * Don't force setup of a queue from here, as a call to may_queue
9694 -+ * does not necessarily imply that a request actually will be queued.
9695 -+ * So just lookup a possibly existing queue, or return 'may queue'
9696 -+ * if that fails.
9697 -+ */
9698 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
9699 -+ if (bic == NULL)
9700 -+ return ELV_MQUEUE_MAY;
9701 -+
9702 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
9703 -+ if (bfqq != NULL) {
9704 -+ bfq_init_prio_data(bfqq, bic);
9705 -+
9706 -+ return __bfq_may_queue(bfqq);
9707 -+ }
9708 -+
9709 -+ return ELV_MQUEUE_MAY;
9710 -+}
9711 -+
9712 -+/*
9713 -+ * Queue lock held here.
9714 -+ */
9715 -+static void bfq_put_request(struct request *rq)
9716 -+{
9717 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9718 -+
9719 -+ if (bfqq != NULL) {
9720 -+ const int rw = rq_data_dir(rq);
9721 -+
9722 -+ BUG_ON(!bfqq->allocated[rw]);
9723 -+ bfqq->allocated[rw]--;
9724 -+
9725 -+ rq->elv.priv[0] = NULL;
9726 -+ rq->elv.priv[1] = NULL;
9727 -+
9728 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
9729 -+ bfqq, atomic_read(&bfqq->ref));
9730 -+ bfq_put_queue(bfqq);
9731 -+ }
9732 -+}
9733 -+
9734 -+static struct bfq_queue *
9735 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
9736 -+ struct bfq_queue *bfqq)
9737 -+{
9738 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
9739 -+ (long unsigned)bfqq->new_bfqq->pid);
9740 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
9741 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
9742 -+ bfq_put_queue(bfqq);
9743 -+ return bic_to_bfqq(bic, 1);
9744 -+}
9745 -+
9746 -+/*
9747 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
9748 -+ * was the last process referring to said bfqq.
9749 -+ */
9750 -+static struct bfq_queue *
9751 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
9752 -+{
9753 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
9754 -+ if (bfqq_process_refs(bfqq) == 1) {
9755 -+ bfqq->pid = current->pid;
9756 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
9757 -+ bfq_clear_bfqq_coop(bfqq);
9758 -+ bfq_clear_bfqq_split_coop(bfqq);
9759 -+ return bfqq;
9760 -+ }
9761 -+
9762 -+ bic_set_bfqq(bic, NULL, 1);
9763 -+
9764 -+ bfq_put_cooperator(bfqq);
9765 -+
9766 -+ bfq_put_queue(bfqq);
9767 -+ return NULL;
9768 -+}
9769 -+
9770 -+/*
9771 -+ * Allocate bfq data structures associated with this request.
9772 -+ */
9773 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
9774 -+ struct bio *bio, gfp_t gfp_mask)
9775 -+{
9776 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9777 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
9778 -+ const int rw = rq_data_dir(rq);
9779 -+ const int is_sync = rq_is_sync(rq);
9780 -+ struct bfq_queue *bfqq;
9781 -+ struct bfq_group *bfqg;
9782 -+ unsigned long flags;
9783 -+
9784 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
9785 -+
9786 -+ bfq_changed_ioprio(bic);
9787 -+
9788 -+ spin_lock_irqsave(q->queue_lock, flags);
9789 -+
9790 -+ if (bic == NULL)
9791 -+ goto queue_fail;
9792 -+
9793 -+ bfqg = bfq_bic_update_cgroup(bic);
9794 -+
9795 -+new_queue:
9796 -+ bfqq = bic_to_bfqq(bic, is_sync);
9797 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9798 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9799 -+ bic_set_bfqq(bic, bfqq, is_sync);
9800 -+ } else {
9801 -+ /*
9802 -+ * If the queue was seeky for too long, break it apart.
9803 -+ */
9804 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
9805 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
9806 -+ bfqq = bfq_split_bfqq(bic, bfqq);
9807 -+ if (!bfqq)
9808 -+ goto new_queue;
9809 -+ }
9810 -+
9811 -+ /*
9812 -+ * Check to see if this queue is scheduled to merge with
9813 -+ * another closely cooperating queue. The merging of queues
9814 -+ * happens here as it must be done in process context.
9815 -+ * The reference on new_bfqq was taken in merge_bfqqs.
9816 -+ */
9817 -+ if (bfqq->new_bfqq != NULL)
9818 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
9819 -+ }
9820 -+
9821 -+ bfqq->allocated[rw]++;
9822 -+ atomic_inc(&bfqq->ref);
9823 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
9824 -+ atomic_read(&bfqq->ref));
9825 -+
9826 -+ rq->elv.priv[0] = bic;
9827 -+ rq->elv.priv[1] = bfqq;
9828 -+
9829 -+ spin_unlock_irqrestore(q->queue_lock, flags);
9830 -+
9831 -+ return 0;
9832 -+
9833 -+queue_fail:
9834 -+ bfq_schedule_dispatch(bfqd);
9835 -+ spin_unlock_irqrestore(q->queue_lock, flags);
9836 -+
9837 -+ return 1;
9838 -+}
9839 -+
9840 -+static void bfq_kick_queue(struct work_struct *work)
9841 -+{
9842 -+ struct bfq_data *bfqd =
9843 -+ container_of(work, struct bfq_data, unplug_work);
9844 -+ struct request_queue *q = bfqd->queue;
9845 -+
9846 -+ spin_lock_irq(q->queue_lock);
9847 -+ __blk_run_queue(q);
9848 -+ spin_unlock_irq(q->queue_lock);
9849 -+}
9850 -+
9851 -+/*
9852 -+ * Handler of the expiration of the timer running if the active_queue
9853 -+ * is idling inside its time slice.
9854 -+ */
9855 -+static void bfq_idle_slice_timer(unsigned long data)
9856 -+{
9857 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
9858 -+ struct bfq_queue *bfqq;
9859 -+ unsigned long flags;
9860 -+ enum bfqq_expiration reason;
9861 -+
9862 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
9863 -+
9864 -+ bfqq = bfqd->active_queue;
9865 -+ /*
9866 -+ * Theoretical race here: active_queue can be NULL or different
9867 -+ * from the queue that was idling if the timer handler spins on
9868 -+ * the queue_lock and a new request arrives for the current
9869 -+ * queue and there is a full dispatch cycle that changes the
9870 -+ * active_queue. This can hardly happen, but in the worst case
9871 -+ * we just expire a queue too early.
9872 -+ */
9873 -+ if (bfqq != NULL) {
9874 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
9875 -+ if (bfq_bfqq_budget_timeout(bfqq))
9876 -+ /*
9877 -+ * Also here the queue can be safely expired
9878 -+ * for budget timeout without wasting
9879 -+ * guarantees
9880 -+ */
9881 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9882 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
9883 -+ /*
9884 -+ * The queue may not be empty upon timer expiration,
9885 -+ * because we may not disable the timer when the first
9886 -+ * request of the active queue arrives during
9887 -+ * disk idling
9888 -+ */
9889 -+ reason = BFQ_BFQQ_TOO_IDLE;
9890 -+ else
9891 -+ goto schedule_dispatch;
9892 -+
9893 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
9894 -+ }
9895 -+
9896 -+schedule_dispatch:
9897 -+ bfq_schedule_dispatch(bfqd);
9898 -+
9899 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
9900 -+}
9901 -+
9902 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
9903 -+{
9904 -+ del_timer_sync(&bfqd->idle_slice_timer);
9905 -+ cancel_work_sync(&bfqd->unplug_work);
9906 -+}
9907 -+
9908 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
9909 -+ struct bfq_queue **bfqq_ptr)
9910 -+{
9911 -+ struct bfq_group *root_group = bfqd->root_group;
9912 -+ struct bfq_queue *bfqq = *bfqq_ptr;
9913 -+
9914 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
9915 -+ if (bfqq != NULL) {
9916 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
9917 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
9918 -+ bfqq, atomic_read(&bfqq->ref));
9919 -+ bfq_put_queue(bfqq);
9920 -+ *bfqq_ptr = NULL;
9921 -+ }
9922 -+}
9923 -+
9924 -+/*
9925 -+ * Release all the bfqg references to its async queues. If we are
9926 -+ * deallocating the group these queues may still contain requests, so
9927 -+ * we reparent them to the root cgroup (i.e., the only one that will
9928 -+ * exist for sure until all the requests on a device are gone).
9929 -+ */
9930 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
9931 -+{
9932 -+ int i, j;
9933 -+
9934 -+ for (i = 0; i < 2; i++)
9935 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
9936 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
9937 -+
9938 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
9939 -+}
9940 -+
9941 -+static void bfq_exit_queue(struct elevator_queue *e)
9942 -+{
9943 -+ struct bfq_data *bfqd = e->elevator_data;
9944 -+ struct request_queue *q = bfqd->queue;
9945 -+ struct bfq_queue *bfqq, *n;
9946 -+
9947 -+ bfq_shutdown_timer_wq(bfqd);
9948 -+
9949 -+ spin_lock_irq(q->queue_lock);
9950 -+
9951 -+ BUG_ON(bfqd->active_queue != NULL);
9952 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
9953 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
9954 -+
9955 -+ bfq_disconnect_groups(bfqd);
9956 -+ spin_unlock_irq(q->queue_lock);
9957 -+
9958 -+ bfq_shutdown_timer_wq(bfqd);
9959 -+
9960 -+ synchronize_rcu();
9961 -+
9962 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9963 -+
9964 -+ bfq_free_root_group(bfqd);
9965 -+ kfree(bfqd);
9966 -+}
9967 -+
9968 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
9969 -+{
9970 -+ struct bfq_group *bfqg;
9971 -+ struct bfq_data *bfqd;
9972 -+ struct elevator_queue *eq;
9973 -+
9974 -+ eq = elevator_alloc(q, e);
9975 -+ if (eq == NULL)
9976 -+ return -ENOMEM;
9977 -+
9978 -+ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
9979 -+ if (bfqd == NULL) {
9980 -+ kobject_put(&eq->kobj);
9981 -+ return -ENOMEM;
9982 -+ }
9983 -+ eq->elevator_data = bfqd;
9984 -+
9985 -+ /*
9986 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
9987 -+ * Grab a permanent reference to it, so that the normal code flow
9988 -+ * will not attempt to free it.
9989 -+ */
9990 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
9991 -+ atomic_inc(&bfqd->oom_bfqq.ref);
9992 -+
9993 -+ bfqd->queue = q;
9994 -+
9995 -+ spin_lock_irq(q->queue_lock);
9996 -+ q->elevator = eq;
9997 -+ spin_unlock_irq(q->queue_lock);
9998 -+
9999 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
10000 -+ if (bfqg == NULL) {
10001 -+ kfree(bfqd);
10002 -+ kobject_put(&eq->kobj);
10003 -+ return -ENOMEM;
10004 -+ }
10005 -+
10006 -+ bfqd->root_group = bfqg;
10007 -+
10008 -+ init_timer(&bfqd->idle_slice_timer);
10009 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10010 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10011 -+
10012 -+ bfqd->rq_pos_tree = RB_ROOT;
10013 -+
10014 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10015 -+
10016 -+ INIT_LIST_HEAD(&bfqd->active_list);
10017 -+ INIT_LIST_HEAD(&bfqd->idle_list);
10018 -+
10019 -+ bfqd->hw_tag = -1;
10020 -+
10021 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
10022 -+
10023 -+ bfqd->bfq_quantum = bfq_quantum;
10024 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10025 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10026 -+ bfqd->bfq_back_max = bfq_back_max;
10027 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
10028 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
10029 -+ bfqd->bfq_class_idle_last_service = 0;
10030 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10031 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10032 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10033 -+
10034 -+ bfqd->low_latency = true;
10035 -+
10036 -+ bfqd->bfq_raising_coeff = 20;
10037 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10038 -+ bfqd->bfq_raising_max_time = 0;
10039 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10040 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10041 -+ bfqd->bfq_raising_max_softrt_rate = 7000;
10042 -+
10043 -+ /* Initially estimate the device's peak rate as the reference rate */
10044 -+ if (blk_queue_nonrot(bfqd->queue)) {
10045 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
10046 -+ bfqd->peak_rate = R_nonrot;
10047 -+ } else {
10048 -+ bfqd->RT_prod = R_rot * T_rot;
10049 -+ bfqd->peak_rate = R_rot;
10050 -+ }
10051 -+
10052 -+ return 0;
10053 -+}
10054 -+
10055 -+static void bfq_slab_kill(void)
10056 -+{
10057 -+ if (bfq_pool != NULL)
10058 -+ kmem_cache_destroy(bfq_pool);
10059 -+}
10060 -+
10061 -+static int __init bfq_slab_setup(void)
10062 -+{
10063 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10064 -+ if (bfq_pool == NULL)
10065 -+ return -ENOMEM;
10066 -+ return 0;
10067 -+}
10068 -+
10069 -+static ssize_t bfq_var_show(unsigned int var, char *page)
10070 -+{
10071 -+ return sprintf(page, "%d\n", var);
10072 -+}
10073 -+
10074 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10075 -+{
10076 -+ unsigned long new_val;
10077 -+ int ret = strict_strtoul(page, 10, &new_val);
10078 -+
10079 -+ if (ret == 0)
10080 -+ *var = new_val;
10081 -+
10082 -+ return count;
10083 -+}
10084 -+
10085 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10086 -+{
10087 -+ struct bfq_data *bfqd = e->elevator_data;
10088 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10089 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10090 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10091 -+}
10092 -+
10093 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10094 -+{
10095 -+ struct bfq_queue *bfqq;
10096 -+ struct bfq_data *bfqd = e->elevator_data;
10097 -+ ssize_t num_char = 0;
10098 -+
10099 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
10100 -+ bfqd->queued);
10101 -+
10102 -+ spin_lock_irq(bfqd->queue->queue_lock);
10103 -+
10104 -+ num_char += sprintf(page + num_char, "Active:\n");
10105 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
10106 -+ num_char += sprintf(page + num_char,
10107 -+ "pid%d: weight %hu, nr_queued %d %d,"
10108 -+ " dur %d/%u\n",
10109 -+ bfqq->pid,
10110 -+ bfqq->entity.weight,
10111 -+ bfqq->queued[0],
10112 -+ bfqq->queued[1],
10113 -+ jiffies_to_msecs(jiffies -
10114 -+ bfqq->last_rais_start_finish),
10115 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
10116 -+ }
10117 -+
10118 -+ num_char += sprintf(page + num_char, "Idle:\n");
10119 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
10120 -+ num_char += sprintf(page + num_char,
10121 -+ "pid%d: weight %hu, dur %d/%u\n",
10122 -+ bfqq->pid,
10123 -+ bfqq->entity.weight,
10124 -+ jiffies_to_msecs(jiffies -
10125 -+ bfqq->last_rais_start_finish),
10126 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
10127 -+ }
10128 -+
10129 -+ spin_unlock_irq(bfqd->queue->queue_lock);
10130 -+
10131 -+ return num_char;
10132 -+}
10133 -+
10134 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
10135 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
10136 -+{ \
10137 -+ struct bfq_data *bfqd = e->elevator_data; \
10138 -+ unsigned int __data = __VAR; \
10139 -+ if (__CONV) \
10140 -+ __data = jiffies_to_msecs(__data); \
10141 -+ return bfq_var_show(__data, (page)); \
10142 -+}
10143 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
10144 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
10145 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
10146 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
10147 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
10148 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
10149 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
10150 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
10151 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
10152 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
10153 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
10154 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
10155 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
10156 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
10157 -+ 1);
10158 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
10159 -+ bfqd->bfq_raising_min_inter_arr_async,
10160 -+ 1);
10161 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
10162 -+ bfqd->bfq_raising_max_softrt_rate, 0);
10163 -+#undef SHOW_FUNCTION
10164 -+
10165 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
10166 -+static ssize_t \
10167 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
10168 -+{ \
10169 -+ struct bfq_data *bfqd = e->elevator_data; \
10170 -+ unsigned long uninitialized_var(__data); \
10171 -+ int ret = bfq_var_store(&__data, (page), count); \
10172 -+ if (__data < (MIN)) \
10173 -+ __data = (MIN); \
10174 -+ else if (__data > (MAX)) \
10175 -+ __data = (MAX); \
10176 -+ if (__CONV) \
10177 -+ *(__PTR) = msecs_to_jiffies(__data); \
10178 -+ else \
10179 -+ *(__PTR) = __data; \
10180 -+ return ret; \
10181 -+}
10182 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
10183 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
10184 -+ INT_MAX, 1);
10185 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
10186 -+ INT_MAX, 1);
10187 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
10188 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
10189 -+ INT_MAX, 0);
10190 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
10191 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
10192 -+ 1, INT_MAX, 0);
10193 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
10194 -+ INT_MAX, 1);
10195 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
10196 -+ INT_MAX, 0);
10197 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
10198 -+ INT_MAX, 1);
10199 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
10200 -+ INT_MAX, 1);
10201 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
10202 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
10203 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
10204 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
10205 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
10206 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
10207 -+#undef STORE_FUNCTION
10208 -+
10209 -+/* do nothing for the moment */
10210 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
10211 -+ const char *page, size_t count)
10212 -+{
10213 -+ return count;
10214 -+}
10215 -+
10216 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
10217 -+{
10218 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
10219 -+
10220 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
10221 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
10222 -+ else
10223 -+ return bfq_default_max_budget;
10224 -+}
10225 -+
10226 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
10227 -+ const char *page, size_t count)
10228 -+{
10229 -+ struct bfq_data *bfqd = e->elevator_data;
10230 -+ unsigned long uninitialized_var(__data);
10231 -+ int ret = bfq_var_store(&__data, (page), count);
10232 -+
10233 -+ if (__data == 0)
10234 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10235 -+ else {
10236 -+ if (__data > INT_MAX)
10237 -+ __data = INT_MAX;
10238 -+ bfqd->bfq_max_budget = __data;
10239 -+ }
10240 -+
10241 -+ bfqd->bfq_user_max_budget = __data;
10242 -+
10243 -+ return ret;
10244 -+}
10245 -+
10246 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
10247 -+ const char *page, size_t count)
10248 -+{
10249 -+ struct bfq_data *bfqd = e->elevator_data;
10250 -+ unsigned long uninitialized_var(__data);
10251 -+ int ret = bfq_var_store(&__data, (page), count);
10252 -+
10253 -+ if (__data < 1)
10254 -+ __data = 1;
10255 -+ else if (__data > INT_MAX)
10256 -+ __data = INT_MAX;
10257 -+
10258 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
10259 -+ if (bfqd->bfq_user_max_budget == 0)
10260 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10261 -+
10262 -+ return ret;
10263 -+}
10264 -+
10265 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
10266 -+ const char *page, size_t count)
10267 -+{
10268 -+ struct bfq_data *bfqd = e->elevator_data;
10269 -+ unsigned long uninitialized_var(__data);
10270 -+ int ret = bfq_var_store(&__data, (page), count);
10271 -+
10272 -+ if (__data > 1)
10273 -+ __data = 1;
10274 -+ if (__data == 0 && bfqd->low_latency != 0)
10275 -+ bfq_end_raising(bfqd);
10276 -+ bfqd->low_latency = __data;
10277 -+
10278 -+ return ret;
10279 -+}
10280 -+
10281 -+#define BFQ_ATTR(name) \
10282 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
10283 -+
10284 -+static struct elv_fs_entry bfq_attrs[] = {
10285 -+ BFQ_ATTR(quantum),
10286 -+ BFQ_ATTR(fifo_expire_sync),
10287 -+ BFQ_ATTR(fifo_expire_async),
10288 -+ BFQ_ATTR(back_seek_max),
10289 -+ BFQ_ATTR(back_seek_penalty),
10290 -+ BFQ_ATTR(slice_idle),
10291 -+ BFQ_ATTR(max_budget),
10292 -+ BFQ_ATTR(max_budget_async_rq),
10293 -+ BFQ_ATTR(timeout_sync),
10294 -+ BFQ_ATTR(timeout_async),
10295 -+ BFQ_ATTR(low_latency),
10296 -+ BFQ_ATTR(raising_coeff),
10297 -+ BFQ_ATTR(raising_max_time),
10298 -+ BFQ_ATTR(raising_rt_max_time),
10299 -+ BFQ_ATTR(raising_min_idle_time),
10300 -+ BFQ_ATTR(raising_min_inter_arr_async),
10301 -+ BFQ_ATTR(raising_max_softrt_rate),
10302 -+ BFQ_ATTR(weights),
10303 -+ __ATTR_NULL
10304 -+};
10305 -+
10306 -+static struct elevator_type iosched_bfq = {
10307 -+ .ops = {
10308 -+ .elevator_merge_fn = bfq_merge,
10309 -+ .elevator_merged_fn = bfq_merged_request,
10310 -+ .elevator_merge_req_fn = bfq_merged_requests,
10311 -+ .elevator_allow_merge_fn = bfq_allow_merge,
10312 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
10313 -+ .elevator_add_req_fn = bfq_insert_request,
10314 -+ .elevator_activate_req_fn = bfq_activate_request,
10315 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
10316 -+ .elevator_completed_req_fn = bfq_completed_request,
10317 -+ .elevator_former_req_fn = elv_rb_former_request,
10318 -+ .elevator_latter_req_fn = elv_rb_latter_request,
10319 -+ .elevator_init_icq_fn = bfq_init_icq,
10320 -+ .elevator_exit_icq_fn = bfq_exit_icq,
10321 -+ .elevator_set_req_fn = bfq_set_request,
10322 -+ .elevator_put_req_fn = bfq_put_request,
10323 -+ .elevator_may_queue_fn = bfq_may_queue,
10324 -+ .elevator_init_fn = bfq_init_queue,
10325 -+ .elevator_exit_fn = bfq_exit_queue,
10326 -+ },
10327 -+ .icq_size = sizeof(struct bfq_io_cq),
10328 -+ .icq_align = __alignof__(struct bfq_io_cq),
10329 -+ .elevator_attrs = bfq_attrs,
10330 -+ .elevator_name = "bfq",
10331 -+ .elevator_owner = THIS_MODULE,
10332 -+};
10333 -+
10334 -+static int __init bfq_init(void)
10335 -+{
10336 -+ /*
10337 -+ * Can be 0 on HZ < 1000 setups.
10338 -+ */
10339 -+ if (bfq_slice_idle == 0)
10340 -+ bfq_slice_idle = 1;
10341 -+
10342 -+ if (bfq_timeout_async == 0)
10343 -+ bfq_timeout_async = 1;
10344 -+
10345 -+ if (bfq_slab_setup())
10346 -+ return -ENOMEM;
10347 -+
10348 -+ elv_register(&iosched_bfq);
10349 -+
10350 -+ return 0;
10351 -+}
10352 -+
10353 -+static void __exit bfq_exit(void)
10354 -+{
10355 -+ elv_unregister(&iosched_bfq);
10356 -+ bfq_slab_kill();
10357 -+}
10358 -+
10359 -+module_init(bfq_init);
10360 -+module_exit(bfq_exit);
10361 -+
10362 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
10363 -+MODULE_LICENSE("GPL");
10364 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
10365 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
10366 -new file mode 100644
10367 -index 0000000..03f8061
10368 ---- /dev/null
10369 -+++ b/block/bfq-sched.c
10370 -@@ -0,0 +1,1072 @@
10371 -+/*
10372 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
10373 -+ *
10374 -+ * Based on ideas and code from CFQ:
10375 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
10376 -+ *
10377 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
10378 -+ * Paolo Valente <paolo.valente@×××××××.it>
10379 -+ *
10380 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
10381 -+ */
10382 -+
10383 -+#ifdef CONFIG_CGROUP_BFQIO
10384 -+#define for_each_entity(entity) \
10385 -+ for (; entity != NULL; entity = entity->parent)
10386 -+
10387 -+#define for_each_entity_safe(entity, parent) \
10388 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
10389 -+
10390 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
10391 -+ int extract,
10392 -+ struct bfq_data *bfqd);
10393 -+
10394 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
10395 -+{
10396 -+ struct bfq_entity *bfqg_entity;
10397 -+ struct bfq_group *bfqg;
10398 -+ struct bfq_sched_data *group_sd;
10399 -+
10400 -+ BUG_ON(next_active == NULL);
10401 -+
10402 -+ group_sd = next_active->sched_data;
10403 -+
10404 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
10405 -+ /*
10406 -+ * bfq_group's my_entity field is not NULL only if the group
10407 -+ * is not the root group. We must not touch the root entity
10408 -+ * as it must never become an active entity.
10409 -+ */
10410 -+ bfqg_entity = bfqg->my_entity;
10411 -+ if (bfqg_entity != NULL)
10412 -+ bfqg_entity->budget = next_active->budget;
10413 -+}
10414 -+
10415 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
10416 -+{
10417 -+ struct bfq_entity *next_active;
10418 -+
10419 -+ if (sd->active_entity != NULL)
10420 -+ /* will update/requeue at the end of service */
10421 -+ return 0;
10422 -+
10423 -+ /*
10424 -+ * NOTE: this can be improved in many ways, such as returning
10425 -+ * 1 (and thus propagating upwards the update) only when the
10426 -+ * budget changes, or caching the bfqq that will be scheduled
10427 -+ * next from this subtree. For now we worry more about
10428 -+ * correctness than about performance...
10429 -+ */
10430 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
10431 -+ sd->next_active = next_active;
10432 -+
10433 -+ if (next_active != NULL)
10434 -+ bfq_update_budget(next_active);
10435 -+
10436 -+ return 1;
10437 -+}
10438 -+
10439 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
10440 -+ struct bfq_entity *entity)
10441 -+{
10442 -+ BUG_ON(sd->next_active != entity);
10443 -+}
10444 -+#else
10445 -+#define for_each_entity(entity) \
10446 -+ for (; entity != NULL; entity = NULL)
10447 -+
10448 -+#define for_each_entity_safe(entity, parent) \
10449 -+ for (parent = NULL; entity != NULL; entity = parent)
10450 -+
10451 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
10452 -+{
10453 -+ return 0;
10454 -+}
10455 -+
10456 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
10457 -+ struct bfq_entity *entity)
10458 -+{
10459 -+}
10460 -+
10461 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
10462 -+{
10463 -+}
10464 -+#endif
10465 -+
10466 -+/*
10467 -+ * Shift for timestamp calculations. This actually limits the maximum
10468 -+ * service allowed in one timestamp delta (small shift values increase it),
10469 -+ * the maximum total weight that can be used for the queues in the system
10470 -+ * (big shift values increase it), and the period of virtual time wraparounds.
10471 -+ */
10472 -+#define WFQ_SERVICE_SHIFT 22
10473 -+
10474 -+/**
10475 -+ * bfq_gt - compare two timestamps.
10476 -+ * @a: first ts.
10477 -+ * @b: second ts.
10478 -+ *
10479 -+ * Return @a > @b, dealing with wrapping correctly.
10480 -+ */
10481 -+static inline int bfq_gt(u64 a, u64 b)
10482 -+{
10483 -+ return (s64)(a - b) > 0;
10484 -+}
10485 -+
10486 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
10487 -+{
10488 -+ struct bfq_queue *bfqq = NULL;
10489 -+
10490 -+ BUG_ON(entity == NULL);
10491 -+
10492 -+ if (entity->my_sched_data == NULL)
10493 -+ bfqq = container_of(entity, struct bfq_queue, entity);
10494 -+
10495 -+ return bfqq;
10496 -+}
10497 -+
10498 -+
10499 -+/**
10500 -+ * bfq_delta - map service into the virtual time domain.
10501 -+ * @service: amount of service.
10502 -+ * @weight: scale factor (weight of an entity or weight sum).
10503 -+ */
10504 -+static inline u64 bfq_delta(unsigned long service,
10505 -+ unsigned long weight)
10506 -+{
10507 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
10508 -+
10509 -+ do_div(d, weight);
10510 -+ return d;
10511 -+}
10512 -+
10513 -+/**
10514 -+ * bfq_calc_finish - assign the finish time to an entity.
10515 -+ * @entity: the entity to act upon.
10516 -+ * @service: the service to be charged to the entity.
10517 -+ */
10518 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
10519 -+ unsigned long service)
10520 -+{
10521 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10522 -+
10523 -+ BUG_ON(entity->weight == 0);
10524 -+
10525 -+ entity->finish = entity->start +
10526 -+ bfq_delta(service, entity->weight);
10527 -+
10528 -+ if (bfqq != NULL) {
10529 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
10530 -+ "calc_finish: serv %lu, w %d",
10531 -+ service, entity->weight);
10532 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
10533 -+ "calc_finish: start %llu, finish %llu, delta %llu",
10534 -+ entity->start, entity->finish,
10535 -+ bfq_delta(service, entity->weight));
10536 -+ }
10537 -+}
10538 -+
10539 -+/**
10540 -+ * bfq_entity_of - get an entity from a node.
10541 -+ * @node: the node field of the entity.
10542 -+ *
10543 -+ * Convert a node pointer to the relative entity. This is used only
10544 -+ * to simplify the logic of some functions and not as the generic
10545 -+ * conversion mechanism because, e.g., in the tree walking functions,
10546 -+ * the check for a %NULL value would be redundant.
10547 -+ */
10548 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
10549 -+{
10550 -+ struct bfq_entity *entity = NULL;
10551 -+
10552 -+ if (node != NULL)
10553 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
10554 -+
10555 -+ return entity;
10556 -+}
10557 -+
10558 -+/**
10559 -+ * bfq_extract - remove an entity from a tree.
10560 -+ * @root: the tree root.
10561 -+ * @entity: the entity to remove.
10562 -+ */
10563 -+static inline void bfq_extract(struct rb_root *root,
10564 -+ struct bfq_entity *entity)
10565 -+{
10566 -+ BUG_ON(entity->tree != root);
10567 -+
10568 -+ entity->tree = NULL;
10569 -+ rb_erase(&entity->rb_node, root);
10570 -+}
10571 -+
10572 -+/**
10573 -+ * bfq_idle_extract - extract an entity from the idle tree.
10574 -+ * @st: the service tree of the owning @entity.
10575 -+ * @entity: the entity being removed.
10576 -+ */
10577 -+static void bfq_idle_extract(struct bfq_service_tree *st,
10578 -+ struct bfq_entity *entity)
10579 -+{
10580 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10581 -+ struct rb_node *next;
10582 -+
10583 -+ BUG_ON(entity->tree != &st->idle);
10584 -+
10585 -+ if (entity == st->first_idle) {
10586 -+ next = rb_next(&entity->rb_node);
10587 -+ st->first_idle = bfq_entity_of(next);
10588 -+ }
10589 -+
10590 -+ if (entity == st->last_idle) {
10591 -+ next = rb_prev(&entity->rb_node);
10592 -+ st->last_idle = bfq_entity_of(next);
10593 -+ }
10594 -+
10595 -+ bfq_extract(&st->idle, entity);
10596 -+
10597 -+ if (bfqq != NULL)
10598 -+ list_del(&bfqq->bfqq_list);
10599 -+}
10600 -+
10601 -+/**
10602 -+ * bfq_insert - generic tree insertion.
10603 -+ * @root: tree root.
10604 -+ * @entity: entity to insert.
10605 -+ *
10606 -+ * This is used for the idle and the active tree, since they are both
10607 -+ * ordered by finish time.
10608 -+ */
10609 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
10610 -+{
10611 -+ struct bfq_entity *entry;
10612 -+ struct rb_node **node = &root->rb_node;
10613 -+ struct rb_node *parent = NULL;
10614 -+
10615 -+ BUG_ON(entity->tree != NULL);
10616 -+
10617 -+ while (*node != NULL) {
10618 -+ parent = *node;
10619 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
10620 -+
10621 -+ if (bfq_gt(entry->finish, entity->finish))
10622 -+ node = &parent->rb_left;
10623 -+ else
10624 -+ node = &parent->rb_right;
10625 -+ }
10626 -+
10627 -+ rb_link_node(&entity->rb_node, parent, node);
10628 -+ rb_insert_color(&entity->rb_node, root);
10629 -+
10630 -+ entity->tree = root;
10631 -+}
10632 -+
10633 -+/**
10634 -+ * bfq_update_min - update the min_start field of an entity.
10635 -+ * @entity: the entity to update.
10636 -+ * @node: one of its children.
10637 -+ *
10638 -+ * This function is called when @entity may store an invalid value for
10639 -+ * min_start due to updates to the active tree. The function assumes
10640 -+ * that the subtree rooted at @node (which may be its left or its right
10641 -+ * child) has a valid min_start value.
10642 -+ */
10643 -+static inline void bfq_update_min(struct bfq_entity *entity,
10644 -+ struct rb_node *node)
10645 -+{
10646 -+ struct bfq_entity *child;
10647 -+
10648 -+ if (node != NULL) {
10649 -+ child = rb_entry(node, struct bfq_entity, rb_node);
10650 -+ if (bfq_gt(entity->min_start, child->min_start))
10651 -+ entity->min_start = child->min_start;
10652 -+ }
10653 -+}
10654 -+
10655 -+/**
10656 -+ * bfq_update_active_node - recalculate min_start.
10657 -+ * @node: the node to update.
10658 -+ *
10659 -+ * @node may have changed position or one of its children may have moved,
10660 -+ * this function updates its min_start value. The left and right subtrees
10661 -+ * are assumed to hold a correct min_start value.
10662 -+ */
10663 -+static inline void bfq_update_active_node(struct rb_node *node)
10664 -+{
10665 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
10666 -+
10667 -+ entity->min_start = entity->start;
10668 -+ bfq_update_min(entity, node->rb_right);
10669 -+ bfq_update_min(entity, node->rb_left);
10670 -+}
10671 -+
10672 -+/**
10673 -+ * bfq_update_active_tree - update min_start for the whole active tree.
10674 -+ * @node: the starting node.
10675 -+ *
10676 -+ * @node must be the deepest modified node after an update. This function
10677 -+ * updates its min_start using the values held by its children, assuming
10678 -+ * that they did not change, and then updates all the nodes that may have
10679 -+ * changed in the path to the root. The only nodes that may have changed
10680 -+ * are the ones in the path or their siblings.
10681 -+ */
10682 -+static void bfq_update_active_tree(struct rb_node *node)
10683 -+{
10684 -+ struct rb_node *parent;
10685 -+
10686 -+up:
10687 -+ bfq_update_active_node(node);
10688 -+
10689 -+ parent = rb_parent(node);
10690 -+ if (parent == NULL)
10691 -+ return;
10692 -+
10693 -+ if (node == parent->rb_left && parent->rb_right != NULL)
10694 -+ bfq_update_active_node(parent->rb_right);
10695 -+ else if (parent->rb_left != NULL)
10696 -+ bfq_update_active_node(parent->rb_left);
10697 -+
10698 -+ node = parent;
10699 -+ goto up;
10700 -+}
10701 -+
10702 -+/**
10703 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
10704 -+ * @st: the service tree of the entity.
10705 -+ * @entity: the entity being inserted.
10706 -+ *
10707 -+ * The active tree is ordered by finish time, but an extra key is kept
10708 -+ * per each node, containing the minimum value for the start times of
10709 -+ * its children (and the node itself), so it's possible to search for
10710 -+ * the eligible node with the lowest finish time in logarithmic time.
10711 -+ */
10712 -+static void bfq_active_insert(struct bfq_service_tree *st,
10713 -+ struct bfq_entity *entity)
10714 -+{
10715 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10716 -+ struct rb_node *node = &entity->rb_node;
10717 -+
10718 -+ bfq_insert(&st->active, entity);
10719 -+
10720 -+ if (node->rb_left != NULL)
10721 -+ node = node->rb_left;
10722 -+ else if (node->rb_right != NULL)
10723 -+ node = node->rb_right;
10724 -+
10725 -+ bfq_update_active_tree(node);
10726 -+
10727 -+ if (bfqq != NULL)
10728 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
10729 -+}
10730 -+
10731 -+/**
10732 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
10733 -+ * @ioprio: the ioprio value to convert.
10734 -+ */
10735 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
10736 -+{
10737 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
10738 -+ return IOPRIO_BE_NR - ioprio;
10739 -+}
10740 -+
10741 -+/**
10742 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
10743 -+ * @weight: the weight value to convert.
10744 -+ *
10745 -+ * To preserve as much as possible the old only-ioprio user interface,
10746 -+ * 0 is used as an escape ioprio value for weights (numerically) equal or
10747 -+ * larger than IOPRIO_BE_NR
10748 -+ */
10749 -+static unsigned short bfq_weight_to_ioprio(int weight)
10750 -+{
10751 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
10752 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
10753 -+}
10754 -+
10755 -+static inline void bfq_get_entity(struct bfq_entity *entity)
10756 -+{
10757 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10758 -+ struct bfq_sched_data *sd;
10759 -+
10760 -+ if (bfqq != NULL) {
10761 -+ sd = entity->sched_data;
10762 -+ atomic_inc(&bfqq->ref);
10763 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
10764 -+ bfqq, atomic_read(&bfqq->ref));
10765 -+ }
10766 -+}
10767 -+
10768 -+/**
10769 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
10770 -+ * @node: the node being removed.
10771 -+ *
10772 -+ * Do the first step of an extraction in an rb tree, looking for the
10773 -+ * node that will replace @node, and returning the deepest node that
10774 -+ * the following modifications to the tree can touch. If @node is the
10775 -+ * last node in the tree return %NULL.
10776 -+ */
10777 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
10778 -+{
10779 -+ struct rb_node *deepest;
10780 -+
10781 -+ if (node->rb_right == NULL && node->rb_left == NULL)
10782 -+ deepest = rb_parent(node);
10783 -+ else if (node->rb_right == NULL)
10784 -+ deepest = node->rb_left;
10785 -+ else if (node->rb_left == NULL)
10786 -+ deepest = node->rb_right;
10787 -+ else {
10788 -+ deepest = rb_next(node);
10789 -+ if (deepest->rb_right != NULL)
10790 -+ deepest = deepest->rb_right;
10791 -+ else if (rb_parent(deepest) != node)
10792 -+ deepest = rb_parent(deepest);
10793 -+ }
10794 -+
10795 -+ return deepest;
10796 -+}
10797 -+
10798 -+/**
10799 -+ * bfq_active_extract - remove an entity from the active tree.
10800 -+ * @st: the service_tree containing the tree.
10801 -+ * @entity: the entity being removed.
10802 -+ */
10803 -+static void bfq_active_extract(struct bfq_service_tree *st,
10804 -+ struct bfq_entity *entity)
10805 -+{
10806 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10807 -+ struct rb_node *node;
10808 -+
10809 -+ node = bfq_find_deepest(&entity->rb_node);
10810 -+ bfq_extract(&st->active, entity);
10811 -+
10812 -+ if (node != NULL)
10813 -+ bfq_update_active_tree(node);
10814 -+
10815 -+ if (bfqq != NULL)
10816 -+ list_del(&bfqq->bfqq_list);
10817 -+}
10818 -+
10819 -+/**
10820 -+ * bfq_idle_insert - insert an entity into the idle tree.
10821 -+ * @st: the service tree containing the tree.
10822 -+ * @entity: the entity to insert.
10823 -+ */
10824 -+static void bfq_idle_insert(struct bfq_service_tree *st,
10825 -+ struct bfq_entity *entity)
10826 -+{
10827 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10828 -+ struct bfq_entity *first_idle = st->first_idle;
10829 -+ struct bfq_entity *last_idle = st->last_idle;
10830 -+
10831 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
10832 -+ st->first_idle = entity;
10833 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
10834 -+ st->last_idle = entity;
10835 -+
10836 -+ bfq_insert(&st->idle, entity);
10837 -+
10838 -+ if (bfqq != NULL)
10839 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
10840 -+}
10841 -+
10842 -+/**
10843 -+ * bfq_forget_entity - remove an entity from the wfq trees.
10844 -+ * @st: the service tree.
10845 -+ * @entity: the entity being removed.
10846 -+ *
10847 -+ * Update the device status and forget everything about @entity, putting
10848 -+ * the device reference to it, if it is a queue. Entities belonging to
10849 -+ * groups are not refcounted.
10850 -+ */
10851 -+static void bfq_forget_entity(struct bfq_service_tree *st,
10852 -+ struct bfq_entity *entity)
10853 -+{
10854 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10855 -+ struct bfq_sched_data *sd;
10856 -+
10857 -+ BUG_ON(!entity->on_st);
10858 -+
10859 -+ entity->on_st = 0;
10860 -+ st->wsum -= entity->weight;
10861 -+ if (bfqq != NULL) {
10862 -+ sd = entity->sched_data;
10863 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
10864 -+ bfqq, atomic_read(&bfqq->ref));
10865 -+ bfq_put_queue(bfqq);
10866 -+ }
10867 -+}
10868 -+
10869 -+/**
10870 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
10871 -+ * @st: service tree for the entity.
10872 -+ * @entity: the entity being released.
10873 -+ */
10874 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
10875 -+ struct bfq_entity *entity)
10876 -+{
10877 -+ bfq_idle_extract(st, entity);
10878 -+ bfq_forget_entity(st, entity);
10879 -+}
10880 -+
10881 -+/**
10882 -+ * bfq_forget_idle - update the idle tree if necessary.
10883 -+ * @st: the service tree to act upon.
10884 -+ *
10885 -+ * To preserve the global O(log N) complexity we only remove one entry here;
10886 -+ * as the idle tree will not grow indefinitely this can be done safely.
10887 -+ */
10888 -+static void bfq_forget_idle(struct bfq_service_tree *st)
10889 -+{
10890 -+ struct bfq_entity *first_idle = st->first_idle;
10891 -+ struct bfq_entity *last_idle = st->last_idle;
10892 -+
10893 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
10894 -+ !bfq_gt(last_idle->finish, st->vtime)) {
10895 -+ /*
10896 -+ * Forget the whole idle tree, increasing the vtime past
10897 -+ * the last finish time of idle entities.
10898 -+ */
10899 -+ st->vtime = last_idle->finish;
10900 -+ }
10901 -+
10902 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
10903 -+ bfq_put_idle_entity(st, first_idle);
10904 -+}
10905 -+
10906 -+static struct bfq_service_tree *
10907 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
10908 -+ struct bfq_entity *entity)
10909 -+{
10910 -+ struct bfq_service_tree *new_st = old_st;
10911 -+
10912 -+ if (entity->ioprio_changed) {
10913 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10914 -+
10915 -+ BUG_ON(old_st->wsum < entity->weight);
10916 -+ old_st->wsum -= entity->weight;
10917 -+
10918 -+ if (entity->new_weight != entity->orig_weight) {
10919 -+ entity->orig_weight = entity->new_weight;
10920 -+ entity->ioprio =
10921 -+ bfq_weight_to_ioprio(entity->orig_weight);
10922 -+ } else if (entity->new_ioprio != entity->ioprio) {
10923 -+ entity->ioprio = entity->new_ioprio;
10924 -+ entity->orig_weight =
10925 -+ bfq_ioprio_to_weight(entity->ioprio);
10926 -+ } else
10927 -+ entity->new_weight = entity->orig_weight =
10928 -+ bfq_ioprio_to_weight(entity->ioprio);
10929 -+
10930 -+ entity->ioprio_class = entity->new_ioprio_class;
10931 -+ entity->ioprio_changed = 0;
10932 -+
10933 -+ /*
10934 -+ * NOTE: here we may be changing the weight too early,
10935 -+ * this will cause unfairness. The correct approach
10936 -+ * would have required additional complexity to defer
10937 -+ * weight changes to the proper time instants (i.e.,
10938 -+ * when entity->finish <= old_st->vtime).
10939 -+ */
10940 -+ new_st = bfq_entity_service_tree(entity);
10941 -+ entity->weight = entity->orig_weight *
10942 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
10943 -+ new_st->wsum += entity->weight;
10944 -+
10945 -+ if (new_st != old_st)
10946 -+ entity->start = new_st->vtime;
10947 -+ }
10948 -+
10949 -+ return new_st;
10950 -+}
10951 -+
10952 -+/**
10953 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
10954 -+ * @bfqq: the queue being served.
10955 -+ * @served: bytes to transfer.
10956 -+ *
10957 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
10958 -+ * are synchronized every time a new bfqq is selected for service. For now,
10959 -+ * we keep it to better check consistency.
10960 -+ */
10961 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
10962 -+{
10963 -+ struct bfq_entity *entity = &bfqq->entity;
10964 -+ struct bfq_service_tree *st;
10965 -+
10966 -+ for_each_entity(entity) {
10967 -+ st = bfq_entity_service_tree(entity);
10968 -+
10969 -+ entity->service += served;
10970 -+ BUG_ON(entity->service > entity->budget);
10971 -+ BUG_ON(st->wsum == 0);
10972 -+
10973 -+ st->vtime += bfq_delta(served, st->wsum);
10974 -+ bfq_forget_idle(st);
10975 -+ }
10976 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
10977 -+}
10978 -+
10979 -+/**
10980 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
10981 -+ * @bfqq: the queue that needs a service update.
10982 -+ *
10983 -+ * When it's not possible to be fair in the service domain, because
10984 -+ * a queue is not consuming its budget fast enough (the meaning of
10985 -+ * fast depends on the timeout parameter), we charge it a full
10986 -+ * budget. In this way we should obtain a sort of time-domain
10987 -+ * fairness among all the seeky/slow queues.
10988 -+ */
10989 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
10990 -+{
10991 -+ struct bfq_entity *entity = &bfqq->entity;
10992 -+
10993 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
10994 -+
10995 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
10996 -+}
10997 -+
10998 -+/**
10999 -+ * __bfq_activate_entity - activate an entity.
11000 -+ * @entity: the entity being activated.
11001 -+ *
11002 -+ * Called whenever an entity is activated, i.e., it is not active and one
11003 -+ * of its children receives a new request, or has to be reactivated due to
11004 -+ * budget exhaustion. It uses the current budget of the entity (and the
11005 -+ * service received if @entity is active) of the queue to calculate its
11006 -+ * timestamps.
11007 -+ */
11008 -+static void __bfq_activate_entity(struct bfq_entity *entity)
11009 -+{
11010 -+ struct bfq_sched_data *sd = entity->sched_data;
11011 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11012 -+
11013 -+ if (entity == sd->active_entity) {
11014 -+ BUG_ON(entity->tree != NULL);
11015 -+ /*
11016 -+ * If we are requeueing the current entity we have
11017 -+ * to take care of not charging to it service it has
11018 -+ * not received.
11019 -+ */
11020 -+ bfq_calc_finish(entity, entity->service);
11021 -+ entity->start = entity->finish;
11022 -+ sd->active_entity = NULL;
11023 -+ } else if (entity->tree == &st->active) {
11024 -+ /*
11025 -+ * Requeueing an entity due to a change of some
11026 -+ * next_active entity below it. We reuse the old
11027 -+ * start time.
11028 -+ */
11029 -+ bfq_active_extract(st, entity);
11030 -+ } else if (entity->tree == &st->idle) {
11031 -+ /*
11032 -+ * Must be on the idle tree, bfq_idle_extract() will
11033 -+ * check for that.
11034 -+ */
11035 -+ bfq_idle_extract(st, entity);
11036 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
11037 -+ st->vtime : entity->finish;
11038 -+ } else {
11039 -+ /*
11040 -+ * The finish time of the entity may be invalid, and
11041 -+ * it is in the past for sure, otherwise the queue
11042 -+ * would have been on the idle tree.
11043 -+ */
11044 -+ entity->start = st->vtime;
11045 -+ st->wsum += entity->weight;
11046 -+ bfq_get_entity(entity);
11047 -+
11048 -+ BUG_ON(entity->on_st);
11049 -+ entity->on_st = 1;
11050 -+ }
11051 -+
11052 -+ st = __bfq_entity_update_weight_prio(st, entity);
11053 -+ bfq_calc_finish(entity, entity->budget);
11054 -+ bfq_active_insert(st, entity);
11055 -+}
11056 -+
11057 -+/**
11058 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11059 -+ * @entity: the entity to activate.
11060 -+ *
11061 -+ * Activate @entity and all the entities on the path from it to the root.
11062 -+ */
11063 -+static void bfq_activate_entity(struct bfq_entity *entity)
11064 -+{
11065 -+ struct bfq_sched_data *sd;
11066 -+
11067 -+ for_each_entity(entity) {
11068 -+ __bfq_activate_entity(entity);
11069 -+
11070 -+ sd = entity->sched_data;
11071 -+ if (!bfq_update_next_active(sd))
11072 -+ /*
11073 -+ * No need to propagate the activation to the
11074 -+ * upper entities, as they will be updated when
11075 -+ * the active entity is rescheduled.
11076 -+ */
11077 -+ break;
11078 -+ }
11079 -+}
11080 -+
11081 -+/**
11082 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11083 -+ * @entity: the entity to deactivate.
11084 -+ * @requeue: if false, the entity will not be put into the idle tree.
11085 -+ *
11086 -+ * Deactivate an entity, independently from its previous state. If the
11087 -+ * entity was not on a service tree just return, otherwise if it is on
11088 -+ * any scheduler tree, extract it from that tree, and if necessary
11089 -+ * and if the caller did not specify @requeue, put it on the idle tree.
11090 -+ *
11091 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
11092 -+ * if the entity was under service or if it was the next_active for
11093 -+ * its sched_data; return %0 otherwise.
11094 -+ */
11095 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11096 -+{
11097 -+ struct bfq_sched_data *sd = entity->sched_data;
11098 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11099 -+ int was_active = entity == sd->active_entity;
11100 -+ int ret = 0;
11101 -+
11102 -+ if (!entity->on_st)
11103 -+ return 0;
11104 -+
11105 -+ BUG_ON(was_active && entity->tree != NULL);
11106 -+
11107 -+ if (was_active) {
11108 -+ bfq_calc_finish(entity, entity->service);
11109 -+ sd->active_entity = NULL;
11110 -+ } else if (entity->tree == &st->active)
11111 -+ bfq_active_extract(st, entity);
11112 -+ else if (entity->tree == &st->idle)
11113 -+ bfq_idle_extract(st, entity);
11114 -+ else if (entity->tree != NULL)
11115 -+ BUG();
11116 -+
11117 -+ if (was_active || sd->next_active == entity)
11118 -+ ret = bfq_update_next_active(sd);
11119 -+
11120 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
11121 -+ bfq_forget_entity(st, entity);
11122 -+ else
11123 -+ bfq_idle_insert(st, entity);
11124 -+
11125 -+ BUG_ON(sd->active_entity == entity);
11126 -+ BUG_ON(sd->next_active == entity);
11127 -+
11128 -+ return ret;
11129 -+}
11130 -+
11131 -+/**
11132 -+ * bfq_deactivate_entity - deactivate an entity.
11133 -+ * @entity: the entity to deactivate.
11134 -+ * @requeue: true if the entity can be put on the idle tree
11135 -+ */
11136 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11137 -+{
11138 -+ struct bfq_sched_data *sd;
11139 -+ struct bfq_entity *parent;
11140 -+
11141 -+ for_each_entity_safe(entity, parent) {
11142 -+ sd = entity->sched_data;
11143 -+
11144 -+ if (!__bfq_deactivate_entity(entity, requeue))
11145 -+ /*
11146 -+ * The parent entity is still backlogged, and
11147 -+ * we don't need to update it as it is still
11148 -+ * under service.
11149 -+ */
11150 -+ break;
11151 -+
11152 -+ if (sd->next_active != NULL)
11153 -+ /*
11154 -+ * The parent entity is still backlogged and
11155 -+ * the budgets on the path towards the root
11156 -+ * need to be updated.
11157 -+ */
11158 -+ goto update;
11159 -+
11160 -+ /*
11161 -+ * If we get here, the parent is no longer backlogged and
11162 -+ * we want to propagate the dequeue upwards.
11163 -+ */
11164 -+ requeue = 1;
11165 -+ }
11166 -+
11167 -+ return;
11168 -+
11169 -+update:
11170 -+ entity = parent;
11171 -+ for_each_entity(entity) {
11172 -+ __bfq_activate_entity(entity);
11173 -+
11174 -+ sd = entity->sched_data;
11175 -+ if (!bfq_update_next_active(sd))
11176 -+ break;
11177 -+ }
11178 -+}
11179 -+
11180 -+/**
11181 -+ * bfq_update_vtime - update vtime if necessary.
11182 -+ * @st: the service tree to act upon.
11183 -+ *
11184 -+ * If necessary update the service tree vtime to have at least one
11185 -+ * eligible entity, skipping to its start time. Assumes that the
11186 -+ * active tree of the device is not empty.
11187 -+ *
11188 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
11189 -+ * we may end up with reactivated tasks getting timestamps after a
11190 -+ * vtime skip done because we needed a ->first_active entity on some
11191 -+ * intermediate node.
11192 -+ */
11193 -+static void bfq_update_vtime(struct bfq_service_tree *st)
11194 -+{
11195 -+ struct bfq_entity *entry;
11196 -+ struct rb_node *node = st->active.rb_node;
11197 -+
11198 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
11199 -+ if (bfq_gt(entry->min_start, st->vtime)) {
11200 -+ st->vtime = entry->min_start;
11201 -+ bfq_forget_idle(st);
11202 -+ }
11203 -+}
11204 -+
11205 -+/**
11206 -+ * bfq_first_active - find the eligible entity with the smallest finish time
11207 -+ * @st: the service tree to select from.
11208 -+ *
11209 -+ * This function searches the first schedulable entity, starting from the
11210 -+ * root of the tree and going on the left every time on this side there is
11211 -+ * a subtree with at least one eligible (start >= vtime) entity. The path
11212 -+ * on the right is followed only if a) the left subtree contains no eligible
11213 -+ * entities and b) no eligible entity has been found yet.
11214 -+ */
11215 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
11216 -+{
11217 -+ struct bfq_entity *entry, *first = NULL;
11218 -+ struct rb_node *node = st->active.rb_node;
11219 -+
11220 -+ while (node != NULL) {
11221 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
11222 -+left:
11223 -+ if (!bfq_gt(entry->start, st->vtime))
11224 -+ first = entry;
11225 -+
11226 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
11227 -+
11228 -+ if (node->rb_left != NULL) {
11229 -+ entry = rb_entry(node->rb_left,
11230 -+ struct bfq_entity, rb_node);
11231 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
11232 -+ node = node->rb_left;
11233 -+ goto left;
11234 -+ }
11235 -+ }
11236 -+ if (first != NULL)
11237 -+ break;
11238 -+ node = node->rb_right;
11239 -+ }
11240 -+
11241 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
11242 -+ return first;
11243 -+}
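The search above relies on the augmented-tree property documented later in bfq.h: every node caches the minimum start time of its subtree (min_start), so a whole subtree can be skipped when its min_start already lies beyond the virtual time. The userspace sketch below reproduces only that pruning idea on a plain binary tree assumed to be ordered like the kernel rbtree; the node layout and the first_eligible() name are invented for illustration.

/*
 * Minimal sketch of the eligible-entity search: prefer the left
 * subtree whenever its cached min_start shows it holds an eligible
 * entity, otherwise keep the current node if eligible, else go right.
 */
#include <stdio.h>

struct node {
        unsigned long long start;      /* own start timestamp */
        unsigned long long min_start;  /* min start in this subtree */
        struct node *left, *right;
};

static struct node *first_eligible(struct node *n, unsigned long long vtime)
{
        struct node *first = NULL;

        while (n != NULL) {
                if (n->start <= vtime)          /* this node is eligible */
                        first = n;
                if (n->left && n->left->min_start <= vtime) {
                        n = n->left;            /* eligible node exists on the left */
                        continue;
                }
                if (first != NULL)
                        break;                  /* nothing better on the right */
                n = n->right;
        }
        return first;
}

int main(void)
{
        struct node c = { 5, 5, NULL, NULL };
        struct node a = { 2, 2, NULL, NULL };
        struct node b = { 9, 2, &a, &c };       /* root; min_start covers subtree */
        struct node *e = first_eligible(&b, 6);

        printf("eligible start = %llu\n", e ? e->start : 0);
        return 0;
}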
11244 -+
11245 -+/**
11246 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
11247 -+ * @st: the service tree.
11248 -+ *
11249 -+ * Update the virtual time in @st and return the first eligible entity
11250 -+ * it contains.
11251 -+ */
11252 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
11253 -+ bool force)
11254 -+{
11255 -+ struct bfq_entity *entity, *new_next_active = NULL;
11256 -+
11257 -+ if (RB_EMPTY_ROOT(&st->active))
11258 -+ return NULL;
11259 -+
11260 -+ bfq_update_vtime(st);
11261 -+ entity = bfq_first_active_entity(st);
11262 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
11263 -+
11264 -+ /*
11265 -+ * If the chosen entity does not match with the sched_data's
11266 -+ * next_active and we are forcedly serving the IDLE priority
11267 -+ * class tree, bubble up budget update.
11268 -+ */
11269 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
11270 -+ new_next_active = entity;
11271 -+ for_each_entity(new_next_active)
11272 -+ bfq_update_budget(new_next_active);
11273 -+ }
11274 -+
11275 -+ return entity;
11276 -+}
11277 -+
11278 -+/**
11279 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
11280 -+ * @sd: the sched_data.
11281 -+ * @extract: if true the returned entity will be also extracted from @sd.
11282 -+ *
11283 -+ * NOTE: since we cache the next_active entity at each level of the
11284 -+ * hierarchy, the complexity of the lookup can be decreased with
11285 -+ * absolutely no effort just returning the cached next_active value;
11286 -+ * we prefer to do full lookups to test the consistency of the data
11287 -+ * structures.
11288 -+ */
11289 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11290 -+ int extract,
11291 -+ struct bfq_data *bfqd)
11292 -+{
11293 -+ struct bfq_service_tree *st = sd->service_tree;
11294 -+ struct bfq_entity *entity;
11295 -+ int i = 0;
11296 -+
11297 -+ BUG_ON(sd->active_entity != NULL);
11298 -+
11299 -+ if (bfqd != NULL &&
11300 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
11301 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true);
11302 -+ if (entity != NULL) {
11303 -+ i = BFQ_IOPRIO_CLASSES - 1;
11304 -+ bfqd->bfq_class_idle_last_service = jiffies;
11305 -+ sd->next_active = entity;
11306 -+ }
11307 -+ }
11308 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
11309 -+ entity = __bfq_lookup_next_entity(st + i, false);
11310 -+ if (entity != NULL) {
11311 -+ if (extract) {
11312 -+ bfq_check_next_active(sd, entity);
11313 -+ bfq_active_extract(st + i, entity);
11314 -+ sd->active_entity = entity;
11315 -+ sd->next_active = NULL;
11316 -+ }
11317 -+ break;
11318 -+ }
11319 -+ }
11320 -+
11321 -+ return entity;
11322 -+}
11323 -+
11324 -+/*
11325 -+ * Get next queue for service.
11326 -+ */
11327 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
11328 -+{
11329 -+ struct bfq_entity *entity = NULL;
11330 -+ struct bfq_sched_data *sd;
11331 -+ struct bfq_queue *bfqq;
11332 -+
11333 -+ BUG_ON(bfqd->active_queue != NULL);
11334 -+
11335 -+ if (bfqd->busy_queues == 0)
11336 -+ return NULL;
11337 -+
11338 -+ sd = &bfqd->root_group->sched_data;
11339 -+ for (; sd != NULL; sd = entity->my_sched_data) {
11340 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
11341 -+ BUG_ON(entity == NULL);
11342 -+ entity->service = 0;
11343 -+ }
11344 -+
11345 -+ bfqq = bfq_entity_to_bfqq(entity);
11346 -+ BUG_ON(bfqq == NULL);
11347 -+
11348 -+ return bfqq;
11349 -+}
11350 -+
11351 -+/*
11352 -+ * Forced extraction of the given queue.
11353 -+ */
11354 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
11355 -+ struct bfq_queue *bfqq)
11356 -+{
11357 -+ struct bfq_entity *entity;
11358 -+ struct bfq_sched_data *sd;
11359 -+
11360 -+ BUG_ON(bfqd->active_queue != NULL);
11361 -+
11362 -+ entity = &bfqq->entity;
11363 -+ /*
11364 -+ * Bubble up extraction/update from the leaf to the root.
11365 -+ */
11366 -+ for_each_entity(entity) {
11367 -+ sd = entity->sched_data;
11368 -+ bfq_update_budget(entity);
11369 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
11370 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
11371 -+ sd->active_entity = entity;
11372 -+ sd->next_active = NULL;
11373 -+ entity->service = 0;
11374 -+ }
11375 -+
11376 -+ return;
11377 -+}
11378 -+
11379 -+static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
11380 -+{
11381 -+ if (bfqd->active_bic != NULL) {
11382 -+ put_io_context(bfqd->active_bic->icq.ioc);
11383 -+ bfqd->active_bic = NULL;
11384 -+ }
11385 -+
11386 -+ bfqd->active_queue = NULL;
11387 -+ del_timer(&bfqd->idle_slice_timer);
11388 -+}
11389 -+
11390 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11391 -+ int requeue)
11392 -+{
11393 -+ struct bfq_entity *entity = &bfqq->entity;
11394 -+
11395 -+ if (bfqq == bfqd->active_queue)
11396 -+ __bfq_bfqd_reset_active(bfqd);
11397 -+
11398 -+ bfq_deactivate_entity(entity, requeue);
11399 -+}
11400 -+
11401 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11402 -+{
11403 -+ struct bfq_entity *entity = &bfqq->entity;
11404 -+
11405 -+ bfq_activate_entity(entity);
11406 -+}
11407 -+
11408 -+/*
11409 -+ * Called when the bfqq no longer has requests pending, remove it from
11410 -+ * the service tree.
11411 -+ */
11412 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11413 -+ int requeue)
11414 -+{
11415 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
11416 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
11417 -+
11418 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
11419 -+
11420 -+ bfq_clear_bfqq_busy(bfqq);
11421 -+
11422 -+ BUG_ON(bfqd->busy_queues == 0);
11423 -+ bfqd->busy_queues--;
11424 -+
11425 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
11426 -+}
11427 -+
11428 -+/*
11429 -+ * Called when an inactive queue receives a new request.
11430 -+ */
11431 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11432 -+{
11433 -+ BUG_ON(bfq_bfqq_busy(bfqq));
11434 -+ BUG_ON(bfqq == bfqd->active_queue);
11435 -+
11436 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
11437 -+
11438 -+ bfq_activate_bfqq(bfqd, bfqq);
11439 -+
11440 -+ bfq_mark_bfqq_busy(bfqq);
11441 -+ bfqd->busy_queues++;
11442 -+}
11443 -diff --git a/block/bfq.h b/block/bfq.h
11444 -new file mode 100644
11445 -index 0000000..48ecde9
11446 ---- /dev/null
11447 -+++ b/block/bfq.h
11448 -@@ -0,0 +1,603 @@
11449 -+/*
11450 -+ * BFQ-v6r2 for 3.10.0: data structures and common functions prototypes.
11451 -+ *
11452 -+ * Based on ideas and code from CFQ:
11453 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11454 -+ *
11455 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11456 -+ * Paolo Valente <paolo.valente@×××××××.it>
11457 -+ *
11458 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11459 -+ */
11460 -+
11461 -+#ifndef _BFQ_H
11462 -+#define _BFQ_H
11463 -+
11464 -+#include <linux/blktrace_api.h>
11465 -+#include <linux/hrtimer.h>
11466 -+#include <linux/ioprio.h>
11467 -+#include <linux/rbtree.h>
11468 -+
11469 -+#define BFQ_IOPRIO_CLASSES 3
11470 -+#define BFQ_CL_IDLE_TIMEOUT HZ/5
11471 -+
11472 -+#define BFQ_MIN_WEIGHT 1
11473 -+#define BFQ_MAX_WEIGHT 1000
11474 -+
11475 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
11476 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
11477 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
11478 -+
11479 -+struct bfq_entity;
11480 -+
11481 -+/**
11482 -+ * struct bfq_service_tree - per ioprio_class service tree.
11483 -+ * @active: tree for active entities (i.e., those backlogged).
11484 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
11485 -+ * @first_idle: idle entity with minimum F_i.
11486 -+ * @last_idle: idle entity with maximum F_i.
11487 -+ * @vtime: scheduler virtual time.
11488 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
11489 -+ *
11490 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
11491 -+ * ioprio_class has its own independent scheduler, and so its own
11492 -+ * bfq_service_tree. All the fields are protected by the queue lock
11493 -+ * of the containing bfqd.
11494 -+ */
11495 -+struct bfq_service_tree {
11496 -+ struct rb_root active;
11497 -+ struct rb_root idle;
11498 -+
11499 -+ struct bfq_entity *first_idle;
11500 -+ struct bfq_entity *last_idle;
11501 -+
11502 -+ u64 vtime;
11503 -+ unsigned long wsum;
11504 -+};
11505 -+
11506 -+/**
11507 -+ * struct bfq_sched_data - multi-class scheduler.
11508 -+ * @active_entity: entity under service.
11509 -+ * @next_active: head-of-the-line entity in the scheduler.
11510 -+ * @service_tree: array of service trees, one per ioprio_class.
11511 -+ *
11512 -+ * bfq_sched_data is the basic scheduler queue. It supports three
11513 -+ * ioprio_classes, and can be used either as a toplevel queue or as
11514 -+ * an intermediate queue on a hierarchical setup.
11515 -+ * @next_active points to the active entity of the sched_data service
11516 -+ * trees that will be scheduled next.
11517 -+ *
11518 -+ * The supported ioprio_classes are the same as in CFQ, in descending
11519 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
11520 -+ * Requests from higher priority queues are served before all the
11521 -+ * requests from lower priority queues; among queues of the same
11522 -+ * class, requests are served according to B-WF2Q+.
11523 -+ * All the fields are protected by the queue lock of the containing bfqd.
11524 -+ */
11525 -+struct bfq_sched_data {
11526 -+ struct bfq_entity *active_entity;
11527 -+ struct bfq_entity *next_active;
11528 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
11529 -+};
11530 -+
11531 -+/**
11532 -+ * struct bfq_entity - schedulable entity.
11533 -+ * @rb_node: service_tree member.
11534 -+ * @on_st: flag, true if the entity is on a tree (either the active or
11535 -+ * the idle one of its service_tree).
11536 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
11537 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
11538 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
11539 -+ * @min_start: minimum start time of the (active) subtree rooted at
11540 -+ * this entity; used for O(log N) lookups into active trees.
11541 -+ * @service: service received during the last round of service.
11542 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
11543 -+ * @weight: weight of the queue
11544 -+ * @parent: parent entity, for hierarchical scheduling.
11545 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
11546 -+ * associated scheduler queue, %NULL on leaf nodes.
11547 -+ * @sched_data: the scheduler queue this entity belongs to.
11548 -+ * @ioprio: the ioprio in use.
11549 -+ * @new_weight: when a weight change is requested, the new weight value.
11550 -+ * @orig_weight: original weight, used to implement weight boosting
11551 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
11552 -+ * @ioprio_class: the ioprio_class in use.
11553 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
11554 -+ * ioprio_class value.
11555 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
11556 -+ * ioprio_class change.
11557 -+ *
11558 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
11559 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
11560 -+ * entity belongs to the sched_data of the parent group in the cgroup
11561 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
11562 -+ * in @my_sched_data.
11563 -+ *
11564 -+ * Each entity stores independently its priority values; this would
11565 -+ * allow different weights on different devices, but this
11566 -+ * functionality is not exported to userspace by now. Priorities and
11567 -+ * weights are updated lazily, first storing the new values into the
11568 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
11569 -+ * there is a transition in the entity state that allows the priority
11570 -+ * update to take place the effective and the requested priority
11571 -+ * values are synchronized.
11572 -+ *
11573 -+ * Unless cgroups are used, the weight value is calculated from the
11574 -+ * ioprio to export the same interface as CFQ. When dealing with
11575 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
11576 -+ * time consuming their budget and have true sequential behavior, and
11577 -+ * when there are no external factors breaking anticipation) the
11578 -+ * relative weights at each level of the cgroups hierarchy should be
11579 -+ * guaranteed. All the fields are protected by the queue lock of the
11580 -+ * containing bfqd.
11581 -+ */
11582 -+struct bfq_entity {
11583 -+ struct rb_node rb_node;
11584 -+
11585 -+ int on_st;
11586 -+
11587 -+ u64 finish;
11588 -+ u64 start;
11589 -+
11590 -+ struct rb_root *tree;
11591 -+
11592 -+ u64 min_start;
11593 -+
11594 -+ unsigned long service, budget;
11595 -+ unsigned short weight, new_weight;
11596 -+ unsigned short orig_weight;
11597 -+
11598 -+ struct bfq_entity *parent;
11599 -+
11600 -+ struct bfq_sched_data *my_sched_data;
11601 -+ struct bfq_sched_data *sched_data;
11602 -+
11603 -+ unsigned short ioprio, new_ioprio;
11604 -+ unsigned short ioprio_class, new_ioprio_class;
11605 -+
11606 -+ int ioprio_changed;
11607 -+};
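As a concrete reading of the comment above, the sketch below computes the finish timestamp from the quoted rule F_i = S_i + budget/weight and applies a pending new_weight lazily, as described. It is a toy userspace model: the real scheduler uses fixed-point scaling for the division and performs the update in its (re)activation paths, and the toy_* names are not kernel symbols.

/*
 * Toy version of the B-WF2Q+ timestamp rule and of the lazy "new_*"
 * weight update described in the struct bfq_entity documentation.
 */
#include <stdio.h>

struct toy_entity {
        unsigned long long start, finish;      /* S_i, F_i */
        unsigned long budget;
        unsigned short weight, new_weight;
        int ioprio_changed;                    /* set when new_* holds a pending change */
};

static void toy_update_weight(struct toy_entity *e)
{
        /* Applied lazily, e.g. on (re)activation, as the comment describes. */
        if (e->ioprio_changed) {
                e->weight = e->new_weight;
                e->ioprio_changed = 0;
        }
}

static void toy_compute_finish(struct toy_entity *e)
{
        toy_update_weight(e);
        e->finish = e->start + e->budget / e->weight;   /* F_i = S_i + budget/weight */
}

int main(void)
{
        struct toy_entity e = {
                .start = 1000, .budget = 8192,
                .weight = 100, .new_weight = 200, .ioprio_changed = 1,
        };

        toy_compute_finish(&e);
        printf("S=%llu F=%llu (weight %u)\n", e.start, e.finish, (unsigned)e.weight);
        return 0;
}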
11608 -+
11609 -+struct bfq_group;
11610 -+
11611 -+/**
11612 -+ * struct bfq_queue - leaf schedulable entity.
11613 -+ * @ref: reference counter.
11614 -+ * @bfqd: parent bfq_data.
11615 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
11616 -+ * one or more other queues.
11617 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
11618 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
11619 -+ * @sort_list: sorted list of pending requests.
11620 -+ * @next_rq: if fifo isn't expired, next request to serve.
11621 -+ * @queued: nr of requests queued in @sort_list.
11622 -+ * @allocated: currently allocated requests.
11623 -+ * @meta_pending: pending metadata requests.
11624 -+ * @fifo: fifo list of requests in sort_list.
11625 -+ * @entity: entity representing this queue in the scheduler.
11626 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
11627 -+ * @budget_timeout: budget expiration (in jiffies).
11628 -+ * @dispatched: number of requests on the dispatch list or inside driver.
11629 -+ * @org_ioprio: saved ioprio during boosted periods.
11630 -+ * @flags: status flags.
11631 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
11632 -+ * @seek_samples: number of seeks sampled
11633 -+ * @seek_total: sum of the distances of the seeks sampled
11634 -+ * @seek_mean: mean seek distance
11635 -+ * @last_request_pos: position of the last request enqueued
11636 -+ * @pid: pid of the process owning the queue, used for logging purposes.
11637 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
11638 -+ * @raising_cur_max_time: current max raising time for this queue
11639 -+ *
11640 -+ * A bfq_queue is a leaf request queue; it can be associated to an io_context
11641 -+ * A bfq_queue is a leaf request queue; it can be associated with one io_context
11642 -+ * cgroup, to be sure that it does not disappear while a bfqq still
11643 -+ * references it (mostly to avoid races between request issuing and task
11644 -+ * migration followed by cgroup distruction).
11645 -+ * migration followed by cgroup destruction).
11646 -+ */
11647 -+struct bfq_queue {
11648 -+ atomic_t ref;
11649 -+ struct bfq_data *bfqd;
11650 -+
11651 -+ /* fields for cooperating queues handling */
11652 -+ struct bfq_queue *new_bfqq;
11653 -+ struct rb_node pos_node;
11654 -+ struct rb_root *pos_root;
11655 -+
11656 -+ struct rb_root sort_list;
11657 -+ struct request *next_rq;
11658 -+ int queued[2];
11659 -+ int allocated[2];
11660 -+ int meta_pending;
11661 -+ struct list_head fifo;
11662 -+
11663 -+ struct bfq_entity entity;
11664 -+
11665 -+ unsigned long max_budget;
11666 -+ unsigned long budget_timeout;
11667 -+
11668 -+ int dispatched;
11669 -+
11670 -+ unsigned short org_ioprio;
11671 -+
11672 -+ unsigned int flags;
11673 -+
11674 -+ struct list_head bfqq_list;
11675 -+
11676 -+ unsigned int seek_samples;
11677 -+ u64 seek_total;
11678 -+ sector_t seek_mean;
11679 -+ sector_t last_request_pos;
11680 -+
11681 -+ pid_t pid;
11682 -+
11683 -+ /* weight-raising fields */
11684 -+ unsigned int raising_cur_max_time;
11685 -+ u64 last_rais_start_finish, soft_rt_next_start;
11686 -+ unsigned int raising_coeff;
11687 -+};
11688 -+
11689 -+/**
11690 -+ * struct bfq_ttime - per process thinktime stats.
11691 -+ * @ttime_total: total process thinktime
11692 -+ * @ttime_samples: number of thinktime samples
11693 -+ * @ttime_mean: average process thinktime
11694 -+ */
11695 -+struct bfq_ttime {
11696 -+ unsigned long last_end_request;
11697 -+
11698 -+ unsigned long ttime_total;
11699 -+ unsigned long ttime_samples;
11700 -+ unsigned long ttime_mean;
11701 -+};
11702 -+
11703 -+/**
11704 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
11705 -+ * @icq: associated io_cq structure
11706 -+ * @bfqq: array of two process queues, the sync and the async
11707 -+ * @ttime: associated @bfq_ttime struct
11708 -+ */
11709 -+struct bfq_io_cq {
11710 -+ struct io_cq icq; /* must be the first member */
11711 -+ struct bfq_queue *bfqq[2];
11712 -+ struct bfq_ttime ttime;
11713 -+ int ioprio;
11714 -+};
11715 -+
11716 -+/**
11717 -+ * struct bfq_data - per device data structure.
11718 -+ * @queue: request queue for the managed device.
11719 -+ * @root_group: root bfq_group for the device.
11720 -+ * @rq_pos_tree: rbtree sorted by next_request position,
11721 -+ * used when determining if two or more queues
11722 -+ * have interleaving requests (see bfq_close_cooperator).
11723 -+ * @busy_queues: number of bfq_queues containing requests (including the
11724 -+ * queue under service, even if it is idling).
11725 -+ * @queued: number of queued requests.
11726 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
11727 -+ * @sync_flight: number of sync requests in the driver.
11728 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
11729 -+ * completed requests.
11730 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
11731 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
11732 -+ * @budgets_assigned: number of budgets assigned.
11733 -+ * @idle_slice_timer: timer set when idling for the next sequential request
11734 -+ * from the queue under service.
11735 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
11736 -+ * @active_queue: bfq_queue under service.
11737 -+ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
11738 -+ * @last_position: on-disk position of the last served request.
11739 -+ * @last_budget_start: beginning of the last budget.
11740 -+ * @last_idling_start: beginning of the last idle slice.
11741 -+ * @peak_rate: peak transfer rate observed for a budget.
11742 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
11743 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
11744 -+ * @group_list: list of all the bfq_groups active on the device.
11745 -+ * @active_list: list of all the bfq_queues active on the device.
11746 -+ * @idle_list: list of all the bfq_queues idle on the device.
11747 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
11748 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
11749 -+ * requests are served in fifo order.
11750 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
11751 -+ * @bfq_back_max: maximum allowed backward seek.
11752 -+ * @bfq_slice_idle: maximum idling time.
11753 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
11754 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
11755 -+ * async queues.
11756 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
11757 -+ * prevent seeky queues from imposing long latencies on well-
11758 -+ * behaved ones (this also implies that seeky queues cannot
11759 -+ * receive guarantees in the service domain; after a timeout
11760 -+ * they are charged for the whole allocated budget, to try
11761 -+ * to preserve a behavior reasonably fair among them, but
11762 -+ * without service-domain guarantees).
11763 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
11764 -+ * queue is multiplied
11765 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
11766 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
11767 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
11768 -+ * may be reactivated for a queue (in jiffies)
11769 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
11770 -+ * after which weight-raising may be
11771 -+ * reactivated for an already busy queue
11772 -+ * (in jiffies)
11773 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
11774 -+ * in sectors per second
11775 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
11776 -+ * duration of the weight raising automatically
11777 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
11778 -+ *
11779 -+ * All the fields are protected by the @queue lock.
11780 -+ */
11781 -+struct bfq_data {
11782 -+ struct request_queue *queue;
11783 -+
11784 -+ struct bfq_group *root_group;
11785 -+
11786 -+ struct rb_root rq_pos_tree;
11787 -+
11788 -+ int busy_queues;
11789 -+ int queued;
11790 -+ int rq_in_driver;
11791 -+ int sync_flight;
11792 -+
11793 -+ int max_rq_in_driver;
11794 -+ int hw_tag_samples;
11795 -+ int hw_tag;
11796 -+
11797 -+ int budgets_assigned;
11798 -+
11799 -+ struct timer_list idle_slice_timer;
11800 -+ struct work_struct unplug_work;
11801 -+
11802 -+ struct bfq_queue *active_queue;
11803 -+ struct bfq_io_cq *active_bic;
11804 -+
11805 -+ sector_t last_position;
11806 -+
11807 -+ ktime_t last_budget_start;
11808 -+ ktime_t last_idling_start;
11809 -+ int peak_rate_samples;
11810 -+ u64 peak_rate;
11811 -+ unsigned long bfq_max_budget;
11812 -+
11813 -+ struct hlist_head group_list;
11814 -+ struct list_head active_list;
11815 -+ struct list_head idle_list;
11816 -+
11817 -+ unsigned int bfq_quantum;
11818 -+ unsigned int bfq_fifo_expire[2];
11819 -+ unsigned int bfq_back_penalty;
11820 -+ unsigned int bfq_back_max;
11821 -+ unsigned int bfq_slice_idle;
11822 -+ u64 bfq_class_idle_last_service;
11823 -+
11824 -+ unsigned int bfq_user_max_budget;
11825 -+ unsigned int bfq_max_budget_async_rq;
11826 -+ unsigned int bfq_timeout[2];
11827 -+
11828 -+ bool low_latency;
11829 -+
11830 -+ /* parameters of the low_latency heuristics */
11831 -+ unsigned int bfq_raising_coeff;
11832 -+ unsigned int bfq_raising_max_time;
11833 -+ unsigned int bfq_raising_rt_max_time;
11834 -+ unsigned int bfq_raising_min_idle_time;
11835 -+ unsigned int bfq_raising_min_inter_arr_async;
11836 -+ unsigned int bfq_raising_max_softrt_rate;
11837 -+ u64 RT_prod;
11838 -+
11839 -+ struct bfq_queue oom_bfqq;
11840 -+};
11841 -+
11842 -+enum bfqq_state_flags {
11843 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
11844 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
11845 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
11846 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
11847 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
11848 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
11849 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
11850 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
11851 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
11852 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
11853 -+ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
11854 -+};
11855 -+
11856 -+#define BFQ_BFQQ_FNS(name) \
11857 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
11858 -+{ \
11859 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
11860 -+} \
11861 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
11862 -+{ \
11863 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
11864 -+} \
11865 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
11866 -+{ \
11867 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
11868 -+}
11869 -+
11870 -+BFQ_BFQQ_FNS(busy);
11871 -+BFQ_BFQQ_FNS(wait_request);
11872 -+BFQ_BFQQ_FNS(must_alloc);
11873 -+BFQ_BFQQ_FNS(fifo_expire);
11874 -+BFQ_BFQQ_FNS(idle_window);
11875 -+BFQ_BFQQ_FNS(prio_changed);
11876 -+BFQ_BFQQ_FNS(sync);
11877 -+BFQ_BFQQ_FNS(budget_new);
11878 -+BFQ_BFQQ_FNS(coop);
11879 -+BFQ_BFQQ_FNS(split_coop);
11880 -+BFQ_BFQQ_FNS(some_coop_idle);
11881 -+#undef BFQ_BFQQ_FNS
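Each BFQ_BFQQ_FNS(name) invocation above expands into a mark/clear/test triple, e.g. bfq_mark_bfqq_busy(), bfq_clear_bfqq_busy() and bfq_bfqq_busy(), which the busy-queue bookkeeping earlier in this patch relies on. The standalone sketch below applies the same macro pattern to a toy struct so it can be compiled and run outside the kernel; the TOY_* names are invented.

/*
 * Same flag-helper pattern on a toy struct, plus a call sequence
 * mirroring bfq_add_bfqq_busy()/bfq_del_bfqq_busy().
 */
#include <stdio.h>

struct toy_queue { unsigned int flags; };

enum toy_state_flags { TOY_FLAG_busy = 0, TOY_FLAG_sync };

#define TOY_FNS(name)                                           \
static inline void toy_mark_##name(struct toy_queue *q)        \
{                                                               \
        q->flags |= (1U << TOY_FLAG_##name);                    \
}                                                               \
static inline void toy_clear_##name(struct toy_queue *q)       \
{                                                               \
        q->flags &= ~(1U << TOY_FLAG_##name);                   \
}                                                               \
static inline int toy_##name(const struct toy_queue *q)        \
{                                                               \
        return (q->flags & (1U << TOY_FLAG_##name)) != 0;       \
}

TOY_FNS(busy)
TOY_FNS(sync)
#undef TOY_FNS

int main(void)
{
        struct toy_queue q = { 0 };

        toy_mark_busy(&q);                      /* queue gets its first request */
        printf("busy=%d sync=%d\n", toy_busy(&q), toy_sync(&q));
        toy_clear_busy(&q);                     /* queue drained */
        printf("busy=%d\n", toy_busy(&q));
        return 0;
}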
11882 -+
11883 -+/* Logging facilities. */
11884 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
11885 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
11886 -+
11887 -+#define bfq_log(bfqd, fmt, args...) \
11888 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
11889 -+
11890 -+/* Expiration reasons. */
11891 -+enum bfqq_expiration {
11892 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
11893 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
11894 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
11895 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
11896 -+};
11897 -+
11898 -+#ifdef CONFIG_CGROUP_BFQIO
11899 -+/**
11900 -+ * struct bfq_group - per (device, cgroup) data structure.
11901 -+ * @entity: schedulable entity to insert into the parent group sched_data.
11902 -+ * @sched_data: own sched_data, to contain child entities (they may be
11903 -+ * both bfq_queues and bfq_groups).
11904 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
11905 -+ * list of the containing cgroup's bfqio_cgroup.
11906 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
11907 -+ * of the groups active on the same device; used for cleanup.
11908 -+ * @bfqd: the bfq_data for the device this group acts upon.
11909 -+ * @async_bfqq: array of async queues for all the tasks belonging to
11910 -+ * the group, one queue per ioprio value per ioprio_class,
11911 -+ * except for the idle class that has only one queue.
11912 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
11913 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
11914 -+ * to avoid too many special cases during group creation/migration.
11915 -+ *
11916 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
11917 -+ * there is a set of bfq_groups, each one collecting the lower-level
11918 -+ * entities belonging to the group that are acting on the same device.
11919 -+ *
11920 -+ * Locking works as follows:
11921 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
11922 -+ * via RCU from its readers.
11923 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
11924 -+ * from the readers.
11925 -+ * o All the other fields are protected by the @bfqd queue lock.
11926 -+ */
11927 -+struct bfq_group {
11928 -+ struct bfq_entity entity;
11929 -+ struct bfq_sched_data sched_data;
11930 -+
11931 -+ struct hlist_node group_node;
11932 -+ struct hlist_node bfqd_node;
11933 -+
11934 -+ void *bfqd;
11935 -+
11936 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
11937 -+ struct bfq_queue *async_idle_bfqq;
11938 -+
11939 -+ struct bfq_entity *my_entity;
11940 -+};
11941 -+
11942 -+/**
11943 -+ * struct bfqio_cgroup - bfq cgroup data structure.
11944 -+ * @css: subsystem state for bfq in the containing cgroup.
11945 -+ * @weight: cgroup weight.
11946 -+ * @ioprio: cgroup ioprio.
11947 -+ * @ioprio_class: cgroup ioprio_class.
11948 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
11949 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
11950 -+ *
11951 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
11952 -+ * @ioprio and @ioprio_class are protected by @lock.
11953 -+ */
11954 -+struct bfqio_cgroup {
11955 -+ struct cgroup_subsys_state css;
11956 -+
11957 -+ unsigned short weight, ioprio, ioprio_class;
11958 -+
11959 -+ spinlock_t lock;
11960 -+ struct hlist_head group_data;
11961 -+};
11962 -+#else
11963 -+struct bfq_group {
11964 -+ struct bfq_sched_data sched_data;
11965 -+
11966 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
11967 -+ struct bfq_queue *async_idle_bfqq;
11968 -+};
11969 -+#endif
11970 -+
11971 -+static inline struct bfq_service_tree *
11972 -+bfq_entity_service_tree(struct bfq_entity *entity)
11973 -+{
11974 -+ struct bfq_sched_data *sched_data = entity->sched_data;
11975 -+ unsigned int idx = entity->ioprio_class - 1;
11976 -+
11977 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
11978 -+ BUG_ON(sched_data == NULL);
11979 -+
11980 -+ return sched_data->service_tree + idx;
11981 -+}
11982 -+
11983 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
11984 -+ int is_sync)
11985 -+{
11986 -+ return bic->bfqq[!!is_sync];
11987 -+}
11988 -+
11989 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
11990 -+ struct bfq_queue *bfqq, int is_sync)
11991 -+{
11992 -+ bic->bfqq[!!is_sync] = bfqq;
11993 -+}
11994 -+
11995 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
11996 -+{
11997 -+ return bic->icq.q->elevator->elevator_data;
11998 -+}
11999 -+
12000 -+/**
12001 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
12002 -+ * @ptr: a pointer to a bfqd.
12003 -+ * @flags: storage for the flags to be saved.
12004 -+ *
12005 -+ * This function allows bfqg->bfqd to be protected by the
12006 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
12007 -+ * under RCU, so the storage for bfqd is assured to be safe as long
12008 -+ * as the RCU read side critical section does not end. After the
12009 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12010 -+ * sure that no other writer accessed it. If we raced with a writer,
12011 -+ * the function returns NULL, with the queue unlocked, otherwise it
12012 -+ * returns the dereferenced pointer, with the queue locked.
12013 -+ */
12014 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12015 -+ unsigned long *flags)
12016 -+{
12017 -+ struct bfq_data *bfqd;
12018 -+
12019 -+ rcu_read_lock();
12020 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12021 -+
12022 -+ if (bfqd != NULL) {
12023 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12024 -+ if (*ptr == bfqd)
12025 -+ goto out;
12026 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12027 -+ }
12028 -+
12029 -+ bfqd = NULL;
12030 -+out:
12031 -+ rcu_read_unlock();
12032 -+ return bfqd;
12033 -+}
12034 -+
12035 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12036 -+ unsigned long *flags)
12037 -+{
12038 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12039 -+}
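bfq_get_bfqd_locked() above follows a dereference-lock-recheck pattern: read the pointer under RCU, take the lock it protects, then confirm the pointer still refers to the same object before trusting it. The userspace sketch below mimics only that control flow, using C11 atomics and a pthread mutex instead of RCU and the queue lock; it does not model the grace period that keeps the object alive in the kernel version, and struct dev and get_dev_locked() are invented names.

/*
 * build: cc -pthread sketch.c
 * Dereference the published pointer, lock the object, recheck that it
 * is still the published one; back off if a writer switched it.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct dev {
        pthread_mutex_t lock;
        int value;
};

static _Atomic(struct dev *) current_dev;

static struct dev *get_dev_locked(void)
{
        struct dev *d = atomic_load(&current_dev);   /* "rcu_dereference" step */

        if (d == NULL)
                return NULL;
        pthread_mutex_lock(&d->lock);
        if (atomic_load(&current_dev) == d)          /* recheck under the lock */
                return d;                            /* still current: return it locked */
        pthread_mutex_unlock(&d->lock);              /* raced with a writer */
        return NULL;
}

int main(void)
{
        struct dev d = { PTHREAD_MUTEX_INITIALIZER, 42 };
        struct dev *got;

        atomic_store(&current_dev, &d);
        got = get_dev_locked();
        if (got) {
                printf("locked dev with value %d\n", got->value);
                pthread_mutex_unlock(&got->lock);
        }
        return 0;
}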
12040 -+
12041 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12042 -+static void bfq_put_queue(struct bfq_queue *bfqq);
12043 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12044 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12045 -+ struct bfq_group *bfqg, int is_sync,
12046 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
12047 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12048 -+ struct bfq_group *bfqg);
12049 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12050 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12051 -+#endif
12052 ---
12053 -1.8.1.4
12054 -
12055
12056 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
12057 ===================================================================
12058 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2014-01-29 14:41:45 UTC (rev 2660)
12059 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2014-01-30 16:49:47 UTC (rev 2661)
12060 @@ -1,1049 +0,0 @@
12061 -From 9acaa783ecab69925d38c6aca7252ff565a093d0 Mon Sep 17 00:00:00 2001
12062 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
12063 -Date: Fri, 14 Jun 2013 13:46:47 +0200
12064 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for
12065 - 3.11.0
12066 -
12067 -A set of processes may happen to perform interleaved reads, i.e., requests
12068 -whose union would give rise to a sequential read pattern. There are two
12069 -typical cases: in the first case, processes read fixed-size chunks of
12070 -data at a fixed distance from each other, while in the second case processes
12071 -may read variable-size chunks at variable distances. The latter case occurs
12072 -for example with KVM, which splits the I/O generated by the guest into
12073 -multiple chunks, and lets these chunks be served by a pool of cooperating
12074 -processes, iteratively assigning the next chunk of I/O to the first
12075 -available process. CFQ uses actual queue merging for the first type of
12076 -processes, whereas it uses preemption to get a sequential read pattern out
12077 -of the read requests performed by the second type of processes. In the end
12078 -it uses two different mechanisms to achieve the same goal: boosting the
12079 -throughput with interleaved I/O.
12080 -
12081 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
12082 -sequential read pattern with both types of processes. The main idea is
12083 -checking newly arrived requests against the next request of the active queue
12084 -both in case of actual request insert and in case of request merge. By doing
12085 -so, both the types of processes can be handled by just merging their queues.
12086 -EQM is then simpler and more compact than the pair of mechanisms used in
12087 -CFQ.
12088 -
12089 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
12090 -properly restoring the weight-raising state of a queue when it gets back to
12091 -a non-merged state.
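As a toy illustration of the interleaved-read case described above (not taken from the patch), the snippet below builds two request streams at a fixed chunk size: each stream alone skips every other chunk, but their union covers consecutive offsets, which is the pattern EQM detects by comparing a newly arrived request against the next request of the active queue.

/*
 * Two workers reading alternating fixed-size chunks: neither stream is
 * sequential on its own, their union is.
 */
#include <stdio.h>

int main(void)
{
        const unsigned long chunk = 128;                 /* sectors per request */
        unsigned long a[4], b[4];

        for (int i = 0; i < 4; i++) {
                a[i] = (unsigned long)(2 * i) * chunk;       /* worker A: 0, 256, 512, ... */
                b[i] = (unsigned long)(2 * i + 1) * chunk;   /* worker B: 128, 384, ...   */
        }
        for (int i = 0; i < 4; i++)
                printf("union: %lu %lu\n", a[i], b[i]);      /* 0 128 256 384 ... */
        return 0;
}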
12092 -
12093 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
12094 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
12095 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
12096 ----
12097 - block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++----------------
12098 - block/bfq-sched.c | 28 ---
12099 - block/bfq.h | 16 ++
12100 - 3 files changed, 466 insertions(+), 231 deletions(-)
12101 -
12102 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
12103 -index 0ed2746..bbe79fb 100644
12104 ---- a/block/bfq-iosched.c
12105 -+++ b/block/bfq-iosched.c
12106 -@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
12107 - return dur;
12108 - }
12109 -
12110 -+static inline void
12111 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
12112 -+{
12113 -+ if (bic->saved_idle_window)
12114 -+ bfq_mark_bfqq_idle_window(bfqq);
12115 -+ else
12116 -+ bfq_clear_bfqq_idle_window(bfqq);
12117 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
12118 -+ /*
12119 -+ * Start a weight raising period with the duration given by
12120 -+ * the raising_time_left snapshot.
12121 -+ */
12122 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
12123 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
12124 -+ bfqq->last_rais_start_finish = jiffies;
12125 -+ }
12126 -+ /*
12127 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
12128 -+ * getting confused about the queue's need of a weight-raising
12129 -+ * period.
12130 -+ */
12131 -+ bic->raising_time_left = 0;
12132 -+}
12133 -+
12134 -+/*
12135 -+ * Must be called with the queue_lock held.
12136 -+ */
12137 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
12138 -+{
12139 -+ int process_refs, io_refs;
12140 -+
12141 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12142 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12143 -+ BUG_ON(process_refs < 0);
12144 -+ return process_refs;
12145 -+}
12146 -+
12147 - static void bfq_add_rq_rb(struct request *rq)
12148 - {
12149 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
12150 -@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq)
12151 - if (! bfqd->low_latency)
12152 - goto add_bfqq_busy;
12153 -
12154 -+ if (bfq_bfqq_just_split(bfqq))
12155 -+ goto set_ioprio_changed;
12156 -+
12157 - /*
12158 -- * If the queue is not being boosted and has been idle
12159 -- * for enough time, start a weight-raising period
12160 -+ * If the queue:
12161 -+ * - is not being boosted,
12162 -+ * - has been idle for enough time,
12163 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
12164 -+ * shared "for its nature" or it is not shared and its
12165 -+ * requests have not been redirected to a shared queue)
12166 -+ * start a weight-raising period.
12167 - */
12168 -- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
12169 -+ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
12170 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
12171 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
12172 - if (idle_for_long_time)
12173 - bfqq->raising_cur_max_time =
12174 -@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq)
12175 - raising_cur_max_time));
12176 - }
12177 - }
12178 -+set_ioprio_changed:
12179 - if (old_raising_coeff != bfqq->raising_coeff)
12180 - entity->ioprio_changed = 1;
12181 - add_bfqq_busy:
12182 -@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
12183 - spin_unlock_irq(bfqd->queue->queue_lock);
12184 - }
12185 -
12186 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12187 -- struct bio *bio)
12188 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
12189 - {
12190 -- struct bfq_data *bfqd = q->elevator->elevator_data;
12191 -- struct bfq_io_cq *bic;
12192 -- struct bfq_queue *bfqq;
12193 --
12194 -- /*
12195 -- * Disallow merge of a sync bio into an async request.
12196 -- */
12197 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12198 -- return 0;
12199 --
12200 -- /*
12201 -- * Lookup the bfqq that this bio will be queued with. Allow
12202 -- * merge only if rq is queued there.
12203 -- * Queue lock is held here.
12204 -- */
12205 -- bic = bfq_bic_lookup(bfqd, current->io_context);
12206 -- if (bic == NULL)
12207 -- return 0;
12208 --
12209 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12210 -- return bfqq == RQ_BFQQ(rq);
12211 --}
12212 --
12213 --static void __bfq_set_active_queue(struct bfq_data *bfqd,
12214 -- struct bfq_queue *bfqq)
12215 --{
12216 -- if (bfqq != NULL) {
12217 -- bfq_mark_bfqq_must_alloc(bfqq);
12218 -- bfq_mark_bfqq_budget_new(bfqq);
12219 -- bfq_clear_bfqq_fifo_expire(bfqq);
12220 --
12221 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12222 --
12223 -- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
12224 -- bfqq->entity.budget);
12225 -- }
12226 --
12227 -- bfqd->active_queue = bfqq;
12228 --}
12229 --
12230 --/*
12231 -- * Get and set a new active queue for service.
12232 -- */
12233 --static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
12234 -- struct bfq_queue *bfqq)
12235 --{
12236 -- if (!bfqq)
12237 -- bfqq = bfq_get_next_queue(bfqd);
12238 -+ if (request)
12239 -+ return blk_rq_pos(io_struct);
12240 - else
12241 -- bfq_get_next_queue_forced(bfqd, bfqq);
12242 --
12243 -- __bfq_set_active_queue(bfqd, bfqq);
12244 -- return bfqq;
12245 -+ return ((struct bio *)io_struct)->bi_sector;
12246 - }
12247 -
12248 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
12249 -- struct request *rq)
12250 -+static inline sector_t bfq_dist_from(sector_t pos1,
12251 -+ sector_t pos2)
12252 - {
12253 -- if (blk_rq_pos(rq) >= bfqd->last_position)
12254 -- return blk_rq_pos(rq) - bfqd->last_position;
12255 -+ if (pos1 >= pos2)
12256 -+ return pos1 - pos2;
12257 - else
12258 -- return bfqd->last_position - blk_rq_pos(rq);
12259 -+ return pos2 - pos1;
12260 - }
12261 -
12262 --/*
12263 -- * Return true if bfqq has no request pending and rq is close enough to
12264 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
12265 -- * bfqq->next_rq
12266 -- */
12267 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
12268 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
12269 -+ sector_t sector)
12270 - {
12271 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
12272 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
12273 -+ BFQQ_SEEK_THR;
12274 - }
12275 -
12276 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12277 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
12278 - {
12279 - struct rb_root *root = &bfqd->rq_pos_tree;
12280 - struct rb_node *parent, *node;
12281 - struct bfq_queue *__bfqq;
12282 -- sector_t sector = bfqd->last_position;
12283 -
12284 - if (RB_EMPTY_ROOT(root))
12285 - return NULL;
12286 -@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12287 - * position).
12288 - */
12289 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
12290 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12291 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12292 - return __bfqq;
12293 -
12294 - if (blk_rq_pos(__bfqq->next_rq) < sector)
12295 -@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12296 - return NULL;
12297 -
12298 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
12299 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12300 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12301 - return __bfqq;
12302 -
12303 - return NULL;
12304 -@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12305 - /*
12306 - * bfqd - obvious
12307 - * cur_bfqq - passed in so that we don't decide that the current queue
12308 -- * is closely cooperating with itself.
12309 -- *
12310 -- * We are assuming that cur_bfqq has dispatched at least one request,
12311 -- * and that bfqd->last_position reflects a position on the disk associated
12312 -- * with the I/O issued by cur_bfqq.
12313 -+ * is closely cooperating with itself
12314 -+ * sector - used as a reference point to search for a close queue
12315 - */
12316 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12317 -- struct bfq_queue *cur_bfqq)
12318 -+ struct bfq_queue *cur_bfqq,
12319 -+ sector_t sector)
12320 - {
12321 - struct bfq_queue *bfqq;
12322 -
12323 -@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12324 - * working closely on the same area of the disk. In that case,
12325 - * we can group them together and don't waste time idling.
12326 - */
12327 -- bfqq = bfqq_close(bfqd);
12328 -+ bfqq = bfqq_close(bfqd, sector);
12329 - if (bfqq == NULL || bfqq == cur_bfqq)
12330 - return NULL;
12331 -
12332 -@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12333 - return bfqq;
12334 - }
12335 -
12336 -+static struct bfq_queue *
12337 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12338 -+{
12339 -+ int process_refs, new_process_refs;
12340 -+ struct bfq_queue *__bfqq;
12341 -+
12342 -+ /*
12343 -+ * If there are no process references on the new_bfqq, then it is
12344 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12345 -+ * may have dropped their last reference (not just their last process
12346 -+ * reference).
12347 -+ */
12348 -+ if (!bfqq_process_refs(new_bfqq))
12349 -+ return NULL;
12350 -+
12351 -+ /* Avoid a circular list and skip interim queue merges. */
12352 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
12353 -+ if (__bfqq == bfqq)
12354 -+ return NULL;
12355 -+ new_bfqq = __bfqq;
12356 -+ }
12357 -+
12358 -+ process_refs = bfqq_process_refs(bfqq);
12359 -+ new_process_refs = bfqq_process_refs(new_bfqq);
12360 -+ /*
12361 -+ * If the process for the bfqq has gone away, there is no
12362 -+ * sense in merging the queues.
12363 -+ */
12364 -+ if (process_refs == 0 || new_process_refs == 0)
12365 -+ return NULL;
12366 -+
12367 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12368 -+ new_bfqq->pid);
12369 -+
12370 -+ /*
12371 -+ * Merging is just a redirection: the requests of the process owning
12372 -+ * one of the two queues are redirected to the other queue. The latter
12373 -+ * queue, in its turn, is set as shared if this is the first time that
12374 -+ * the requests of some process are redirected to it.
12375 -+ *
12376 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
12377 -+ * are in the context of the process owning bfqq, hence we have the
12378 -+ * io_cq of this process. So we can immediately configure this io_cq
12379 -+ * to redirect the requests of the process to new_bfqq.
12380 -+ *
12381 -+ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of
12382 -+ * new_bfqq is not available, because, if the active queue is shared,
12383 -+ * bfqd->active_bic may not point to the io_cq of the active queue.
12384 -+ * Redirecting the requests of the process owning bfqq to the currently
12385 -+ * active queue is in any case the best option, as we feed the active queue
12386 -+ * with new requests close to the last request served and, by doing so,
12387 -+ * hopefully increase the throughput.
12388 -+ */
12389 -+ bfqq->new_bfqq = new_bfqq;
12390 -+ atomic_add(process_refs, &new_bfqq->ref);
12391 -+ return new_bfqq;
12392 -+}
12393 -+
12394 -+/*
12395 -+ * Attempt to schedule a merge of bfqq with the currently active queue or
12396 -+ * with a close queue among the scheduled queues.
12397 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
12398 -+ * structure otherwise.
12399 -+ */
12400 -+static struct bfq_queue *
12401 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12402 -+ void *io_struct, bool request)
12403 -+{
12404 -+ struct bfq_queue *active_bfqq, *new_bfqq;
12405 -+
12406 -+ if (bfqq->new_bfqq)
12407 -+ return bfqq->new_bfqq;
12408 -+
12409 -+ if (!io_struct)
12410 -+ return NULL;
12411 -+
12412 -+ active_bfqq = bfqd->active_queue;
12413 -+
12414 -+ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic)
12415 -+ goto check_scheduled;
12416 -+
12417 -+ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq))
12418 -+ goto check_scheduled;
12419 -+
12420 -+ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq))
12421 -+ goto check_scheduled;
12422 -+
12423 -+ if (active_bfqq->entity.parent != bfqq->entity.parent)
12424 -+ goto check_scheduled;
12425 -+
12426 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
12427 -+ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq))
12428 -+ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq)))
12429 -+ return new_bfqq; /* Merge with the active queue */
12430 -+
12431 -+ /*
12432 -+ * Check whether there is a cooperator among currently scheduled
12433 -+ * queues. The only thing we need is that the bio/request is not
12434 -+ * NULL, as we need it to establish whether a cooperator exists.
12435 -+ */
12436 -+check_scheduled:
12437 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
12438 -+ bfq_io_struct_pos(io_struct, request));
12439 -+ if (new_bfqq)
12440 -+ return bfq_setup_merge(bfqq, new_bfqq);
12441 -+
12442 -+ return NULL;
12443 -+}
12444 -+
12445 -+static inline void
12446 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
12447 -+{
12448 -+ /*
12449 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
12450 -+ * have already been redirected to a shared queue; both idle window
12451 -+ * and weight raising state have already been saved. Do nothing.
12452 -+ */
12453 -+ if (bfqq->bic == NULL)
12454 -+ return;
12455 -+ if (bfqq->bic->raising_time_left)
12456 -+ /*
12457 -+ * This is the queue of a just-started process, and would
12458 -+ * deserve weight raising: we set raising_time_left to the full
12459 -+ * weight-raising duration to trigger weight-raising when and
12460 -+ * if the queue is split and the first request of the queue
12461 -+ * is enqueued.
12462 -+ */
12463 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
12464 -+ else if (bfqq->raising_coeff > 1) {
12465 -+ unsigned long wrais_duration =
12466 -+ jiffies - bfqq->last_rais_start_finish;
12467 -+ /*
12468 -+ * It may happen that a queue's weight raising period lasts
12469 -+ * longer than its raising_cur_max_time, as weight raising is
12470 -+ * handled only when a request is enqueued or dispatched (it
12471 -+ * does not use any timer). If the weight raising period is
12472 -+ * about to end, don't save it.
12473 -+ */
12474 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
12475 -+ bfqq->bic->raising_time_left = 0;
12476 -+ else
12477 -+ bfqq->bic->raising_time_left =
12478 -+ bfqq->raising_cur_max_time - wrais_duration;
12479 -+ /*
12480 -+ * The bfq_queue is becoming shared or the requests of the
12481 -+ * process owning the queue are being redirected to a shared
12482 -+ * queue. Stop the weight raising period of the queue, as in
12483 -+ * both cases it should not be owned by an interactive or soft
12484 -+ * real-time application.
12485 -+ */
12486 -+ bfq_bfqq_end_raising(bfqq);
12487 -+ } else
12488 -+ bfqq->bic->raising_time_left = 0;
12489 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
12490 -+}
12491 -+
12492 -+static inline void
12493 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
12494 -+{
12495 -+ /*
12496 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
12497 -+ * is about to begin using a shared bfq_queue.
12498 -+ */
12499 -+ if (bfqq->bic)
12500 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
12501 -+}
12502 -+
12503 -+static void
12504 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
12505 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12506 -+{
12507 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
12508 -+ (long unsigned)new_bfqq->pid);
12509 -+ /* Save weight raising and idle window of the merged queues */
12510 -+ bfq_bfqq_save_state(bfqq);
12511 -+ bfq_bfqq_save_state(new_bfqq);
12512 -+ /*
12513 -+ * Grab a reference to the bic, to prevent it from being destroyed
12514 -+ * before being possibly touched by a bfq_split_bfqq().
12515 -+ */
12516 -+ bfq_get_bic_reference(bfqq);
12517 -+ bfq_get_bic_reference(new_bfqq);
12518 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
12519 -+ bic_set_bfqq(bic, new_bfqq, 1);
12520 -+ bfq_mark_bfqq_coop(new_bfqq);
12521 -+ /*
12522 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
12523 -+ * new_bfqq->bic to NULL. bfqq either:
12524 -+ * - does not belong to any bic any more, and hence bfqq->bic must
12525 -+ * be set to NULL, or
12526 -+ * - is a queue whose owning bics have already been redirected to a
12527 -+ * different queue, hence the queue is destined to not belong to any
12528 -+ * bic soon and bfqq->bic is already NULL (therefore the next
12529 -+ * assignment causes no harm).
12530 -+ */
12531 -+ new_bfqq->bic = NULL;
12532 -+ bfqq->bic = NULL;
12533 -+ bfq_put_queue(bfqq);
12534 -+}
12535 -+
12536 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12537 -+ struct bio *bio)
12538 -+{
12539 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
12540 -+ struct bfq_io_cq *bic;
12541 -+ struct bfq_queue *bfqq, *new_bfqq;
12542 -+
12543 -+ /*
12544 -+ * Disallow merge of a sync bio into an async request.
12545 -+ */
12546 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12547 -+ return 0;
12548 -+
12549 -+ /*
12550 -+ * Lookup the bfqq that this bio will be queued with. Allow
12551 -+ * merge only if rq is queued there.
12552 -+ * Queue lock is held here.
12553 -+ */
12554 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
12555 -+ if (bic == NULL)
12556 -+ return 0;
12557 -+
12558 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12559 -+ /*
12560 -+ * We take advantage of this function to perform an early merge
12561 -+ * of the queues of possible cooperating processes.
12562 -+ */
12563 -+ if (bfqq != NULL &&
12564 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) {
12565 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
12566 -+ /*
12567 -+ * If we get here, the bio will be queued in the shared queue,
12568 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
12569 -+ * rq can be merged.
12570 -+ */
12571 -+ bfqq = new_bfqq;
12572 -+ }
12573 -+
12574 -+ return bfqq == RQ_BFQQ(rq);
12575 -+}
12576 -+
12577 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
12578 -+ struct bfq_queue *bfqq)
12579 -+{
12580 -+ if (bfqq != NULL) {
12581 -+ bfq_mark_bfqq_must_alloc(bfqq);
12582 -+ bfq_mark_bfqq_budget_new(bfqq);
12583 -+ bfq_clear_bfqq_fifo_expire(bfqq);
12584 -+
12585 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12586 -+
12587 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
12588 -+ bfqq->entity.budget);
12589 -+ }
12590 -+
12591 -+ bfqd->active_queue = bfqq;
12592 -+}
12593 -+
12594 -+/*
12595 -+ * Get and set a new active queue for service.
12596 -+ */
12597 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
12598 -+{
12599 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
12600 -+
12601 -+ __bfq_set_active_queue(bfqd, bfqq);
12602 -+ return bfqq;
12603 -+}
12604 -+
12605 - /*
12606 - * If enough samples have been computed, return the current max budget
12607 - * stored in bfqd, which is dynamically updated according to the
12608 -@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
12609 - return rq;
12610 - }
12611 -
12612 --/*
12613 -- * Must be called with the queue_lock held.
12614 -- */
12615 --static int bfqq_process_refs(struct bfq_queue *bfqq)
12616 --{
12617 -- int process_refs, io_refs;
12618 --
12619 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12620 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12621 -- BUG_ON(process_refs < 0);
12622 -- return process_refs;
12623 --}
12624 --
12625 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12626 --{
12627 -- int process_refs, new_process_refs;
12628 -- struct bfq_queue *__bfqq;
12629 --
12630 -- /*
12631 -- * If there are no process references on the new_bfqq, then it is
12632 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12633 -- * may have dropped their last reference (not just their last process
12634 -- * reference).
12635 -- */
12636 -- if (!bfqq_process_refs(new_bfqq))
12637 -- return;
12638 --
12639 -- /* Avoid a circular list and skip interim queue merges. */
12640 -- while ((__bfqq = new_bfqq->new_bfqq)) {
12641 -- if (__bfqq == bfqq)
12642 -- return;
12643 -- new_bfqq = __bfqq;
12644 -- }
12645 --
12646 -- process_refs = bfqq_process_refs(bfqq);
12647 -- new_process_refs = bfqq_process_refs(new_bfqq);
12648 -- /*
12649 -- * If the process for the bfqq has gone away, there is no
12650 -- * sense in merging the queues.
12651 -- */
12652 -- if (process_refs == 0 || new_process_refs == 0)
12653 -- return;
12654 --
12655 -- /*
12656 -- * Merge in the direction of the lesser amount of work.
12657 -- */
12658 -- if (new_process_refs >= process_refs) {
12659 -- bfqq->new_bfqq = new_bfqq;
12660 -- atomic_add(process_refs, &new_bfqq->ref);
12661 -- } else {
12662 -- new_bfqq->new_bfqq = bfqq;
12663 -- atomic_add(new_process_refs, &bfqq->ref);
12664 -- }
12665 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12666 -- new_bfqq->pid);
12667 --}
12668 --
12669 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
12670 - {
12671 - struct bfq_entity *entity = &bfqq->entity;
12672 -@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
12673 - * is likely to boost the disk throughput);
12674 - * - the queue is weight-raised (waiting for the request is necessary for
12675 - * providing the queue with fairness and latency guarantees).
12676 -+ *
12677 -+ * In any case, idling can be disabled for cooperation issues, if
12678 -+ * 1) there is a close cooperator for the queue, or
12679 -+ * 2) the queue is shared and some cooperator is likely to be idle (in this
12680 -+ * case, by not arming the idle timer, we try to slow down the queue, to
12681 -+ * prevent the zones of the disk accessed by the active cooperators to
12682 -+ * become too distant from the zone that will be accessed by the currently
12683 -+ * idle cooperators).
12684 - */
12685 - static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
12686 - int budg_timeout)
12687 -@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
12688 - (bfqd->rq_in_driver == 0 ||
12689 - budg_timeout ||
12690 - bfqq->raising_coeff > 1) &&
12691 -- !bfq_close_cooperator(bfqd, bfqq) &&
12692 -+ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) &&
12693 - (!bfq_bfqq_coop(bfqq) ||
12694 - !bfq_bfqq_some_coop_idle(bfqq)) &&
12695 - !bfq_queue_nonrot_noidle(bfqd, bfqq));
12696 -@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
12697 - */
12698 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12699 - {
12700 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
12701 -+ struct bfq_queue *bfqq;
12702 - struct request *next_rq;
12703 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
12704 - int budg_timeout;
12705 -@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12706 -
12707 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
12708 -
12709 -- /*
12710 -- * If another queue has a request waiting within our mean seek
12711 -- * distance, let it run. The expire code will check for close
12712 -- * cooperators and put the close queue at the front of the
12713 -- * service tree. If possible, merge the expiring queue with the
12714 -- * new bfqq.
12715 -- */
12716 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
12717 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
12718 -- bfq_setup_merge(bfqq, new_bfqq);
12719 --
12720 - budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
12721 - if (budg_timeout &&
12722 - !bfq_bfqq_must_idle(bfqq, budg_timeout))
12723 -@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12724 - bfq_clear_bfqq_wait_request(bfqq);
12725 - del_timer(&bfqd->idle_slice_timer);
12726 - }
12727 -- if (new_bfqq == NULL)
12728 -- goto keep_queue;
12729 -- else
12730 -- goto expire;
12731 -+ goto keep_queue;
12732 - }
12733 - }
12734 -
12735 -@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12736 - * queue still has requests in flight or is idling for a new request,
12737 - * then keep it.
12738 - */
12739 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
12740 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
12741 - (bfqq->dispatched != 0 &&
12742 - (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
12743 -- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
12744 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
12745 - bfqq = NULL;
12746 - goto keep_queue;
12747 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
12748 -- /*
12749 -- * Expiring the queue because there is a close cooperator,
12750 -- * cancel timer.
12751 -- */
12752 -- bfq_clear_bfqq_wait_request(bfqq);
12753 -- del_timer(&bfqd->idle_slice_timer);
12754 - }
12755 -
12756 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
12757 - expire:
12758 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
12759 - new_queue:
12760 -- bfqq = bfq_set_active_queue(bfqd, new_bfqq);
12761 -+ bfqq = bfq_set_active_queue(bfqd);
12762 - bfq_log(bfqd, "select_queue: new queue %d returned",
12763 - bfqq != NULL ? bfqq->pid : 0);
12764 - keep_queue:
12765 -@@ -1617,9 +1807,8 @@ keep_queue:
12766 -
12767 - static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
12768 - {
12769 -+ struct bfq_entity *entity = &bfqq->entity;
12770 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
12771 -- struct bfq_entity *entity = &bfqq->entity;
12772 --
12773 - bfq_log_bfqq(bfqd, bfqq,
12774 - "raising period dur %u/%u msec, "
12775 - "old raising coeff %u, w %d(%d)",
12776 -@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
12777 - jiffies_to_msecs(bfqq->
12778 - raising_cur_max_time));
12779 - bfq_bfqq_end_raising(bfqq);
12780 -- __bfq_entity_update_weight_prio(
12781 -- bfq_entity_service_tree(entity),
12782 -- entity);
12783 - }
12784 - }
12785 - }
12786 -+ /* Update weight both if it must be raised and if it must be lowered */
12787 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
12788 -+ __bfq_entity_update_weight_prio(
12789 -+ bfq_entity_service_tree(entity),
12790 -+ entity);
12791 - }
12792 -
12793 - /*
12794 -@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq)
12795 - struct bfq_io_cq *bic = icq_to_bic(icq);
12796 -
12797 - bic->ttime.last_end_request = jiffies;
12798 -+ /*
12799 -+ * A newly created bic indicates that the process has just
12800 -+ * started doing I/O, and is probably mapping into memory its
12801 -+ * executable and libraries: it definitely needs weight raising.
12802 -+ * There is however the possibility that the process performs,
12803 -+ * for a while, I/O close to some other process. EQM intercepts
12804 -+ * this behavior and may merge the queue corresponding to the
12805 -+ * process with some other queue, BEFORE the weight of the queue
12806 -+ * is raised. Merged queues are not weight-raised (they are assumed
12807 -+ * to belong to processes that benefit only from high throughput).
12808 -+ * If the merge is basically the consequence of an accident, then
12809 -+ * the queue will be split soon and will get back its old weight.
12810 -+ * It is then important to write down somewhere that this queue
12811 -+ * does need weight raising, even if it did not make it to get its
12812 -+ * weight raised before being merged. To this purpose, we overload
12813 -+ * the field raising_time_left and assign 1 to it, to mark the queue
12814 -+ * as needing weight raising.
12815 -+ */
12816 -+ bic->raising_time_left = 1;
12817 - }
12818 -
12819 - static void bfq_exit_icq(struct io_cq *icq)
12820 -@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq)
12821 - }
12822 -
12823 - if (bic->bfqq[BLK_RW_SYNC]) {
12824 -+ /*
12825 -+ * If the bic is using a shared queue, put the reference
12826 -+ * taken on the io_context when the bic started using a
12827 -+ * shared bfq_queue.
12828 -+ */
12829 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
12830 -+ put_io_context(icq->ioc);
12831 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
12832 - bic->bfqq[BLK_RW_SYNC] = NULL;
12833 - }
12834 -@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
12835 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
12836 - return;
12837 -
12838 -+ /* Idle window just restored, statistics are meaningless. */
12839 -+ if (bfq_bfqq_just_split(bfqq))
12840 -+ return;
12841 -+
12842 - enable_idle = bfq_bfqq_idle_window(bfqq);
12843 -
12844 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
12845 -@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12846 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
12847 - !BFQQ_SEEKY(bfqq))
12848 - bfq_update_idle_window(bfqd, bfqq, bic);
12849 -+ bfq_clear_bfqq_just_split(bfqq);
12850 -
12851 - bfq_log_bfqq(bfqd, bfqq,
12852 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
12853 -@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12854 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
12855 - {
12856 - struct bfq_data *bfqd = q->elevator->elevator_data;
12857 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
12858 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
12859 -
12860 - assert_spin_locked(bfqd->queue->queue_lock);
12861 -+
12862 -+ /*
12863 -+ * An unplug may trigger a requeue of a request from the device
12864 -+ * driver: make sure we are in process context while trying to
12865 -+ * merge two bfq_queues.
12866 -+ */
12867 -+ if (!in_interrupt() &&
12868 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) {
12869 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
12870 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
12871 -+ /*
12872 -+ * Release the request's reference to the old bfqq
12873 -+ * and make sure one is taken to the shared queue.
12874 -+ */
12875 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
12876 -+ bfqq->allocated[rq_data_dir(rq)]--;
12877 -+ atomic_inc(&new_bfqq->ref);
12878 -+ bfq_put_queue(bfqq);
12879 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
12880 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq);
12881 -+ rq->elv.priv[1] = new_bfqq;
12882 -+ bfqq = new_bfqq;
12883 -+ }
12884 -+
12885 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
12886 -
12887 - bfq_add_rq_rb(rq);
12888 -
12889 -+ /*
12890 -+ * Here a newly-created bfq_queue has already started a weight-raising
12891 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
12892 -+ * from assigning it a full weight-raising period. See the detailed
12893 -+ * comments about this field in bfq_init_icq().
12894 -+ */
12895 -+ if (bfqq->bic != NULL)
12896 -+ bfqq->bic->raising_time_left = 0;
12897 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
12898 - list_add_tail(&rq->queuelist, &bfqq->fifo);
12899 -
12900 -@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
12901 - if (bfq_bfqq_budget_new(bfqq))
12902 - bfq_set_budget_timeout(bfqd);
12903 -
12904 -- /* Idling is disabled also for cooperation issues:
12905 -- * 1) there is a close cooperator for the queue, or
12906 -- * 2) the queue is shared and some cooperator is likely
12907 -- * to be idle (in this case, by not arming the idle timer,
12908 -- * we try to slow down the queue, to prevent the zones
12909 -- * of the disk accessed by the active cooperators to become
12910 -- * too distant from the zone that will be accessed by the
12911 -- * currently idle cooperators)
12912 -- */
12913 - if (bfq_bfqq_must_idle(bfqq, budg_timeout))
12914 - bfq_arm_slice_timer(bfqd);
12915 - else if (budg_timeout)
12916 -@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq)
12917 - }
12918 - }
12919 -
12920 --static struct bfq_queue *
12921 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
12922 -- struct bfq_queue *bfqq)
12923 --{
12924 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
12925 -- (long unsigned)bfqq->new_bfqq->pid);
12926 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
12927 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
12928 -- bfq_put_queue(bfqq);
12929 -- return bic_to_bfqq(bic, 1);
12930 --}
12931 --
12932 - /*
12933 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
12934 - * was the last process referring to said bfqq.
12935 -@@ -2469,6 +2702,9 @@ static struct bfq_queue *
12936 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
12937 - {
12938 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
12939 -+
12940 -+ put_io_context(bic->icq.ioc);
12941 -+
12942 - if (bfqq_process_refs(bfqq) == 1) {
12943 - bfqq->pid = current->pid;
12944 - bfq_clear_bfqq_some_coop_idle(bfqq);
12945 -@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
12946 - struct bfq_queue *bfqq;
12947 - struct bfq_group *bfqg;
12948 - unsigned long flags;
12949 -+ bool split = false;
12950 -
12951 - might_sleep_if(gfp_mask & __GFP_WAIT);
12952 -
12953 -@@ -2516,24 +2753,14 @@ new_queue:
12954 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
12955 - bic_set_bfqq(bic, bfqq, is_sync);
12956 - } else {
12957 -- /*
12958 -- * If the queue was seeky for too long, break it apart.
12959 -- */
12960 -+ /* If the queue was seeky for too long, break it apart. */
12961 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
12962 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
12963 - bfqq = bfq_split_bfqq(bic, bfqq);
12964 -+ split = true;
12965 - if (!bfqq)
12966 - goto new_queue;
12967 - }
12968 --
12969 -- /*
12970 -- * Check to see if this queue is scheduled to merge with
12971 -- * another closely cooperating queue. The merging of queues
12972 -- * happens here as it must be done in process context.
12973 -- * The reference on new_bfqq was taken in merge_bfqqs.
12974 -- */
12975 -- if (bfqq->new_bfqq != NULL)
12976 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
12977 - }
12978 -
12979 - bfqq->allocated[rw]++;
12980 -@@ -2544,6 +2771,26 @@ new_queue:
12981 - rq->elv.priv[0] = bic;
12982 - rq->elv.priv[1] = bfqq;
12983 -
12984 -+ /*
12985 -+ * If a bfq_queue has only one process reference, it is owned
12986 -+ * by only one bfq_io_cq: we can set the bic field of the
12987 -+ * bfq_queue to the address of that structure. Also, if the
12988 -+ * queue has just been split, mark a flag so that the
12989 -+ * information is available to the other scheduler hooks.
12990 -+ */
12991 -+ if (bfqq_process_refs(bfqq) == 1) {
12992 -+ bfqq->bic = bic;
12993 -+ if (split) {
12994 -+ bfq_mark_bfqq_just_split(bfqq);
12995 -+ /*
12996 -+ * If the queue has just been split from a shared queue,
12997 -+ * restore the idle window and the possible weight
12998 -+ * raising period.
12999 -+ */
13000 -+ bfq_bfqq_resume_state(bfqq, bic);
13001 -+ }
13002 -+ }
13003 -+
13004 - spin_unlock_irqrestore(q->queue_lock, flags);
13005 -
13006 - return 0;
13007 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
13008 -index 03f8061..a0edaa2 100644
13009 ---- a/block/bfq-sched.c
13010 -+++ b/block/bfq-sched.c
13011 -@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
13012 - return bfqq;
13013 - }
13014 -
13015 --/*
13016 -- * Forced extraction of the given queue.
13017 -- */
13018 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
13019 -- struct bfq_queue *bfqq)
13020 --{
13021 -- struct bfq_entity *entity;
13022 -- struct bfq_sched_data *sd;
13023 --
13024 -- BUG_ON(bfqd->active_queue != NULL);
13025 --
13026 -- entity = &bfqq->entity;
13027 -- /*
13028 -- * Bubble up extraction/update from the leaf to the root.
13029 -- */
13030 -- for_each_entity(entity) {
13031 -- sd = entity->sched_data;
13032 -- bfq_update_budget(entity);
13033 -- bfq_update_vtime(bfq_entity_service_tree(entity));
13034 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
13035 -- sd->active_entity = entity;
13036 -- sd->next_active = NULL;
13037 -- entity->service = 0;
13038 -- }
13039 --
13040 -- return;
13041 --}
13042 --
13043 - static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
13044 - {
13045 - if (bfqd->active_bic != NULL) {
13046 -diff --git a/block/bfq.h b/block/bfq.h
13047 -index 48ecde9..bb52975 100644
13048 ---- a/block/bfq.h
13049 -+++ b/block/bfq.h
13050 -@@ -188,6 +188,8 @@ struct bfq_group;
13051 - * @pid: pid of the process owning the queue, used for logging purposes.
13052 - * @last_rais_start_time: last (idle -> weight-raised) transition attempt
13053 - * @raising_cur_max_time: current max raising time for this queue
13054 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
13055 -+ * queue is shared
13056 - *
13057 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
13058 - * or more (if it is an async one). @cgroup holds a reference to the
13059 -@@ -231,6 +233,7 @@ struct bfq_queue {
13060 - sector_t last_request_pos;
13061 -
13062 - pid_t pid;
13063 -+ struct bfq_io_cq *bic;
13064 -
13065 - /* weight-raising fields */
13066 - unsigned int raising_cur_max_time;
13067 -@@ -257,12 +260,23 @@ struct bfq_ttime {
13068 - * @icq: associated io_cq structure
13069 - * @bfqq: array of two process queues, the sync and the async
13070 - * @ttime: associated @bfq_ttime struct
13071 -+ * @raising_time_left: snapshot of the time left before weight raising ends
13072 -+ * for the sync queue associated to this process; this
13073 -+ * snapshot is taken to remember this value while the weight
13074 -+ * raising is suspended because the queue is merged with a
13075 -+ * shared queue, and is used to set @raising_cur_max_time
13076 -+ * when the queue is split from the shared queue and its
13077 -+ * weight is raised again
13078 -+ * @saved_idle_window: same purpose as the previous field for the idle window
13079 - */
13080 - struct bfq_io_cq {
13081 - struct io_cq icq; /* must be the first member */
13082 - struct bfq_queue *bfqq[2];
13083 - struct bfq_ttime ttime;
13084 - int ioprio;
13085 -+
13086 -+ unsigned int raising_time_left;
13087 -+ unsigned int saved_idle_window;
13088 - };
13089 -
13090 - /**
13091 -@@ -403,6 +417,7 @@ enum bfqq_state_flags {
13092 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
13093 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
13094 - BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
13095 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
13096 - };
13097 -
13098 - #define BFQ_BFQQ_FNS(name) \
13099 -@@ -430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new);
13100 - BFQ_BFQQ_FNS(coop);
13101 - BFQ_BFQQ_FNS(split_coop);
13102 - BFQ_BFQQ_FNS(some_coop_idle);
13103 -+BFQ_BFQQ_FNS(just_split);
13104 - #undef BFQ_BFQQ_FNS
13105 -
13106 - /* Logging facilities. */
13107 ---
13108 -1.8.1.4
13109 -
13110
13111 Added: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
13112 ===================================================================
13113 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch (rev 0)
13114 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-01-30 16:49:47 UTC (rev 2661)
13115 @@ -0,0 +1,1034 @@
13116 +From 3cd9e2ea29c3ba9e420556e8ecf161d166186b63 Mon Sep 17 00:00:00 2001
13117 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
13118 +Date: Thu, 23 Jan 2014 16:54:44 +0100
13119 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7 for
13120 + 3.13.0
13121 +
13122 +A set of processes may happen to perform interleaved reads, i.e., requests
13123 +whose union would give rise to a sequential read pattern. There are two
13124 +typical cases: in the first case, processes read fixed-size chunks of
13125 +data at a fixed distance from each other, while in the second case processes
13126 +may read variable-size chunks at variable distances. The latter case occurs
13127 +for example with KVM, which splits the I/O generated by the guest into
13128 +multiple chunks, and lets these chunks be served by a pool of cooperating
13129 +processes, iteratively assigning the next chunk of I/O to the first
13130 +available process. CFQ uses actual queue merging for the first type of
13131 +processes, whereas it uses preemption to get a sequential read pattern out
13132 +of the read requests performed by the second type of processes. In the end
13133 +it uses two different mechanisms to achieve the same goal: boosting the
13134 +throughput with interleaved I/O.
13135 +
13136 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
13137 +sequential read pattern with both types of processes. The main idea is
13138 +checking newly arrived requests against the next request of the active queue
13139 +both in case of actual request insert and in case of request merge. By doing
13140 +so, both the types of processes can be handled by just merging their queues.
13141 +EQM is then simpler and more compact than the pair of mechanisms used in
13142 +CFQ.
13143 +
13144 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
13145 +properly restoring the weight-raising state of a queue when it gets back to
13146 +a non-merged state.
13147 +
13148 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
13149 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
13150 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
13151 +---
13152 + block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
13153 + block/bfq-sched.c | 28 ---
13154 + block/bfq.h | 16 ++
13155 + 3 files changed, 474 insertions(+), 227 deletions(-)
13156 +
13157 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
13158 +index 7670400..295236e 100644
13159 +--- a/block/bfq-iosched.c
13160 ++++ b/block/bfq-iosched.c
13161 +@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
13162 + return dur;
13163 + }
13164 +
13165 ++static inline void
13166 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
13167 ++{
13168 ++ if (bic->saved_idle_window)
13169 ++ bfq_mark_bfqq_idle_window(bfqq);
13170 ++ else
13171 ++ bfq_clear_bfqq_idle_window(bfqq);
13172 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
13173 ++ /*
13174 ++ * Start a weight raising period with the duration given by
13175 ++ * the raising_time_left snapshot.
13176 ++ */
13177 ++ if (bfq_bfqq_busy(bfqq))
13178 ++ bfqq->bfqd->raised_busy_queues++;
13179 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
13180 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
13181 ++ bfqq->last_rais_start_finish = jiffies;
13182 ++ bfqq->entity.ioprio_changed = 1;
13183 ++ }
13184 ++ /*
13185 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
13186 ++ * getting confused about the queue's need of a weight-raising
13187 ++ * period.
13188 ++ */
13189 ++ bic->raising_time_left = 0;
13190 ++}
13191 ++
13192 ++/*
13193 ++ * Must be called with the queue_lock held.
13194 ++ */
13195 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
13196 ++{
13197 ++ int process_refs, io_refs;
13198 ++
13199 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13200 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13201 ++ BUG_ON(process_refs < 0);
13202 ++ return process_refs;
13203 ++}
13204 ++
13205 + static void bfq_add_rq_rb(struct request *rq)
13206 + {
13207 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
13208 +@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
13209 + if (!bfqd->low_latency)
13210 + goto add_bfqq_busy;
13211 +
13212 ++ if (bfq_bfqq_just_split(bfqq))
13213 ++ goto set_ioprio_changed;
13214 ++
13215 + /*
13216 +- * If the queue is not being boosted and has been idle
13217 +- * for enough time, start a weight-raising period
13218 ++ * If the queue:
13219 ++ * - is not being boosted,
13220 ++ * - has been idle for enough time,
13221 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
13222 ++ * shared "for its nature" or it is not shared and its
13223 ++ * requests have not been redirected to a shared queue)
13224 ++ * start a weight-raising period.
13225 + */
13226 +- if (old_raising_coeff == 1 &&
13227 +- (idle_for_long_time || soft_rt)) {
13228 ++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
13229 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
13230 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
13231 + if (idle_for_long_time)
13232 + bfqq->raising_cur_max_time =
13233 +@@ -572,6 +620,7 @@ static void bfq_add_rq_rb(struct request *rq)
13234 + bfqd->bfq_raising_rt_max_time;
13235 + }
13236 + }
13237 ++set_ioprio_changed:
13238 + if (old_raising_coeff != bfqq->raising_coeff)
13239 + entity->ioprio_changed = 1;
13240 + add_bfqq_busy:
13241 +@@ -754,90 +803,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
13242 + spin_unlock_irq(bfqd->queue->queue_lock);
13243 + }
13244 +
13245 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13246 +- struct bio *bio)
13247 +-{
13248 +- struct bfq_data *bfqd = q->elevator->elevator_data;
13249 +- struct bfq_io_cq *bic;
13250 +- struct bfq_queue *bfqq;
13251 +-
13252 +- /*
13253 +- * Disallow merge of a sync bio into an async request.
13254 +- */
13255 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13256 +- return 0;
13257 +-
13258 +- /*
13259 +- * Lookup the bfqq that this bio will be queued with. Allow
13260 +- * merge only if rq is queued there.
13261 +- * Queue lock is held here.
13262 +- */
13263 +- bic = bfq_bic_lookup(bfqd, current->io_context);
13264 +- if (bic == NULL)
13265 +- return 0;
13266 +-
13267 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13268 +- return bfqq == RQ_BFQQ(rq);
13269 +-}
13270 +-
13271 +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13272 +- struct bfq_queue *bfqq)
13273 +-{
13274 +- if (bfqq != NULL) {
13275 +- bfq_mark_bfqq_must_alloc(bfqq);
13276 +- bfq_mark_bfqq_budget_new(bfqq);
13277 +- bfq_clear_bfqq_fifo_expire(bfqq);
13278 +-
13279 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13280 +-
13281 +- bfq_log_bfqq(bfqd, bfqq,
13282 +- "set_in_service_queue, cur-budget = %lu",
13283 +- bfqq->entity.budget);
13284 +- }
13285 +-
13286 +- bfqd->in_service_queue = bfqq;
13287 +-}
13288 +-
13289 +-/*
13290 +- * Get and set a new queue for service.
13291 +- */
13292 +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
13293 +- struct bfq_queue *bfqq)
13294 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
13295 + {
13296 +- if (!bfqq)
13297 +- bfqq = bfq_get_next_queue(bfqd);
13298 ++ if (request)
13299 ++ return blk_rq_pos(io_struct);
13300 + else
13301 +- bfq_get_next_queue_forced(bfqd, bfqq);
13302 +-
13303 +- __bfq_set_in_service_queue(bfqd, bfqq);
13304 +- return bfqq;
13305 ++ return ((struct bio *)io_struct)->bi_sector;
13306 + }
13307 +
13308 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
13309 +- struct request *rq)
13310 ++static inline sector_t bfq_dist_from(sector_t pos1,
13311 ++ sector_t pos2)
13312 + {
13313 +- if (blk_rq_pos(rq) >= bfqd->last_position)
13314 +- return blk_rq_pos(rq) - bfqd->last_position;
13315 ++ if (pos1 >= pos2)
13316 ++ return pos1 - pos2;
13317 + else
13318 +- return bfqd->last_position - blk_rq_pos(rq);
13319 ++ return pos2 - pos1;
13320 + }
13321 +
13322 +-/*
13323 +- * Return true if bfqq has no request pending and rq is close enough to
13324 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
13325 +- * bfqq->next_rq
13326 +- */
13327 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
13328 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
13329 ++ sector_t sector)
13330 + {
13331 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
13332 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
13333 ++ BFQQ_SEEK_THR;
13334 + }
13335 +
13336 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13337 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
13338 + {
13339 + struct rb_root *root = &bfqd->rq_pos_tree;
13340 + struct rb_node *parent, *node;
13341 + struct bfq_queue *__bfqq;
13342 +- sector_t sector = bfqd->last_position;
13343 +
13344 + if (RB_EMPTY_ROOT(root))
13345 + return NULL;
13346 +@@ -856,7 +850,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13347 + * position).
13348 + */
13349 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
13350 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13351 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13352 + return __bfqq;
13353 +
13354 + if (blk_rq_pos(__bfqq->next_rq) < sector)
13355 +@@ -867,7 +861,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13356 + return NULL;
13357 +
13358 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
13359 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13360 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13361 + return __bfqq;
13362 +
13363 + return NULL;
13364 +@@ -876,14 +870,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13365 + /*
13366 + * bfqd - obvious
13367 + * cur_bfqq - passed in so that we don't decide that the current queue
13368 +- * is closely cooperating with itself.
13369 +- *
13370 +- * We are assuming that cur_bfqq has dispatched at least one request,
13371 +- * and that bfqd->last_position reflects a position on the disk associated
13372 +- * with the I/O issued by cur_bfqq.
13373 ++ * is closely cooperating with itself
13374 ++ * sector - used as a reference point to search for a close queue
13375 + */
13376 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13377 +- struct bfq_queue *cur_bfqq)
13378 ++ struct bfq_queue *cur_bfqq,
13379 ++ sector_t sector)
13380 + {
13381 + struct bfq_queue *bfqq;
13382 +
13383 +@@ -903,7 +895,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13384 + * working closely on the same area of the disk. In that case,
13385 + * we can group them together and don't waste time idling.
13386 + */
13387 +- bfqq = bfqq_close(bfqd);
13388 ++ bfqq = bfqq_close(bfqd, sector);
13389 + if (bfqq == NULL || bfqq == cur_bfqq)
13390 + return NULL;
13391 +
13392 +@@ -930,6 +922,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13393 + return bfqq;
13394 + }
13395 +
13396 ++static struct bfq_queue *
13397 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13398 ++{
13399 ++ int process_refs, new_process_refs;
13400 ++ struct bfq_queue *__bfqq;
13401 ++
13402 ++ /*
13403 ++ * If there are no process references on the new_bfqq, then it is
13404 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13405 ++ * may have dropped their last reference (not just their last process
13406 ++ * reference).
13407 ++ */
13408 ++ if (!bfqq_process_refs(new_bfqq))
13409 ++ return NULL;
13410 ++
13411 ++ /* Avoid a circular list and skip interim queue merges. */
13412 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
13413 ++ if (__bfqq == bfqq)
13414 ++ return NULL;
13415 ++ new_bfqq = __bfqq;
13416 ++ }
13417 ++
13418 ++ process_refs = bfqq_process_refs(bfqq);
13419 ++ new_process_refs = bfqq_process_refs(new_bfqq);
13420 ++ /*
13421 ++ * If the process for the bfqq has gone away, there is no
13422 ++ * sense in merging the queues.
13423 ++ */
13424 ++ if (process_refs == 0 || new_process_refs == 0)
13425 ++ return NULL;
13426 ++
13427 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13428 ++ new_bfqq->pid);
13429 ++
13430 ++ /*
13431 ++ * Merging is just a redirection: the requests of the process owning
13432 ++ * one of the two queues are redirected to the other queue. The latter
13433 ++ * queue, in its turn, is set as shared if this is the first time that
13434 ++ * the requests of some process are redirected to it.
13435 ++ *
13436 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
13437 ++ * are in the context of the process owning bfqq, hence we have the
13438 ++ * io_cq of this process. So we can immediately configure this io_cq
13439 ++ * to redirect the requests of the process to new_bfqq.
13440 ++ *
13441 ++ * NOTE, even if new_bfqq coincides with the in-service queue, the
13442 ++ * io_cq of new_bfqq is not available, because, if the in-service queue
13443 ++ * is shared, bfqd->in_service_bic may not point to the io_cq of the
13444 ++ * in-service queue.
13445 ++ * Redirecting the requests of the process owning bfqq to the currently
13446 ++ * in-service queue is in any case the best option, as we feed the
13447 ++ * in-service queue with new requests close to the last request served
13448 ++ * and, by doing so, hopefully increase the throughput.
13449 ++ */
13450 ++ bfqq->new_bfqq = new_bfqq;
13451 ++ atomic_add(process_refs, &new_bfqq->ref);
13452 ++ return new_bfqq;
13453 ++}
13454 ++
13455 ++/*
13456 ++ * Attempt to schedule a merge of bfqq with the currently in-service queue or
13457 ++ * with a close queue among the scheduled queues.
13458 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
13459 ++ * structure otherwise.
13460 ++ */
13461 ++static struct bfq_queue *
13462 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13463 ++ void *io_struct, bool request)
13464 ++{
13465 ++ struct bfq_queue *in_service_bfqq, *new_bfqq;
13466 ++
13467 ++ if (bfqq->new_bfqq)
13468 ++ return bfqq->new_bfqq;
13469 ++
13470 ++ if (!io_struct)
13471 ++ return NULL;
13472 ++
13473 ++ in_service_bfqq = bfqd->in_service_queue;
13474 ++
13475 ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
13476 ++ !bfqd->in_service_bic)
13477 ++ goto check_scheduled;
13478 ++
13479 ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
13480 ++ goto check_scheduled;
13481 ++
13482 ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
13483 ++ goto check_scheduled;
13484 ++
13485 ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
13486 ++ goto check_scheduled;
13487 ++
13488 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
13489 ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
13490 ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
13491 ++ if (new_bfqq != NULL)
13492 ++ return new_bfqq; /* Merge with the in-service queue */
13493 ++ }
13494 ++
13495 ++ /*
13496 ++ * Check whether there is a cooperator among currently scheduled
13497 ++ * queues. The only thing we need is that the bio/request is not
13498 ++ * NULL, as we need it to establish whether a cooperator exists.
13499 ++ */
13500 ++check_scheduled:
13501 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
13502 ++ bfq_io_struct_pos(io_struct, request));
13503 ++ if (new_bfqq)
13504 ++ return bfq_setup_merge(bfqq, new_bfqq);
13505 ++
13506 ++ return NULL;
13507 ++}
13508 ++
13509 ++static inline void
13510 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
13511 ++{
13512 ++ /*
13513 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
13514 ++ * have already been redirected to a shared queue; both idle window
13515 ++ * and weight raising state have already been saved. Do nothing.
13516 ++ */
13517 ++ if (bfqq->bic == NULL)
13518 ++ return;
13519 ++ if (bfqq->bic->raising_time_left)
13520 ++ /*
13521 ++ * This is the queue of a just-started process, and would
13522 ++ * deserve weight raising: we set raising_time_left to the full
13523 ++ * weight-raising duration to trigger weight-raising when and
13524 ++ * if the queue is split and the first request of the queue
13525 ++ * is enqueued.
13526 ++ */
13527 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
13528 ++ else if (bfqq->raising_coeff > 1) {
13529 ++ unsigned long wrais_duration =
13530 ++ jiffies - bfqq->last_rais_start_finish;
13531 ++ /*
13532 ++ * It may happen that a queue's weight raising period lasts
13533 ++ * longer than its raising_cur_max_time, as weight raising is
13534 ++ * handled only when a request is enqueued or dispatched (it
13535 ++ * does not use any timer). If the weight raising period is
13536 ++ * about to end, don't save it.
13537 ++ */
13538 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
13539 ++ bfqq->bic->raising_time_left = 0;
13540 ++ else
13541 ++ bfqq->bic->raising_time_left =
13542 ++ bfqq->raising_cur_max_time - wrais_duration;
13543 ++ /*
13544 ++ * The bfq_queue is becoming shared or the requests of the
13545 ++ * process owning the queue are being redirected to a shared
13546 ++ * queue. Stop the weight raising period of the queue, as in
13547 ++ * both cases it should not be owned by an interactive or soft
13548 ++ * real-time application.
13549 ++ */
13550 ++ bfq_bfqq_end_raising(bfqq);
13551 ++ } else
13552 ++ bfqq->bic->raising_time_left = 0;
13553 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
13554 ++}
13555 ++
13556 ++static inline void
13557 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
13558 ++{
13559 ++ /*
13560 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
13561 ++ * is about to begin using a shared bfq_queue.
13562 ++ */
13563 ++ if (bfqq->bic)
13564 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
13565 ++}
13566 ++
13567 ++static void
13568 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13569 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13570 ++{
13571 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13572 ++ (long unsigned)new_bfqq->pid);
13573 ++ /* Save weight raising and idle window of the merged queues */
13574 ++ bfq_bfqq_save_state(bfqq);
13575 ++ bfq_bfqq_save_state(new_bfqq);
13576 ++ /*
13577 ++ * Grab a reference to the bic, to prevent it from being destroyed
13578 ++ * before being possibly touched by a bfq_split_bfqq().
13579 ++ */
13580 ++ bfq_get_bic_reference(bfqq);
13581 ++ bfq_get_bic_reference(new_bfqq);
13582 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
13583 ++ bic_set_bfqq(bic, new_bfqq, 1);
13584 ++ bfq_mark_bfqq_coop(new_bfqq);
13585 ++ /*
13586 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
13587 ++ * new_bfqq->bic to NULL. bfqq either:
13588 ++ * - does not belong to any bic any more, and hence bfqq->bic must
13589 ++ * be set to NULL, or
13590 ++ * - is a queue whose owning bics have already been redirected to a
13591 ++ * different queue, hence the queue is destined to not belong to any
13592 ++ * bic soon and bfqq->bic is already NULL (therefore the next
13593 ++ * assignment causes no harm).
13594 ++ */
13595 ++ new_bfqq->bic = NULL;
13596 ++ bfqq->bic = NULL;
13597 ++ bfq_put_queue(bfqq);
13598 ++}
13599 ++
13600 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13601 ++ struct bio *bio)
13602 ++{
13603 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
13604 ++ struct bfq_io_cq *bic;
13605 ++ struct bfq_queue *bfqq, *new_bfqq;
13606 ++
13607 ++ /*
13608 ++ * Disallow merge of a sync bio into an async request.
13609 ++ */
13610 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13611 ++ return 0;
13612 ++
13613 ++ /*
13614 ++ * Lookup the bfqq that this bio will be queued with. Allow
13615 ++ * merge only if rq is queued there.
13616 ++ * Queue lock is held here.
13617 ++ */
13618 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
13619 ++ if (bic == NULL)
13620 ++ return 0;
13621 ++
13622 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13623 ++ /*
13624 ++ * We take advantage of this function to perform an early merge
13625 ++ * of the queues of possible cooperating processes.
13626 ++ */
13627 ++ if (bfqq != NULL) {
13628 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
13629 ++ if (new_bfqq != NULL) {
13630 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
13631 ++ /*
13632 ++ * If we get here, the bio will be queued in the shared queue,
13633 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
13634 ++ * rq can be merged.
13635 ++ */
13636 ++ bfqq = new_bfqq;
13637 ++ }
13638 ++ }
13639 ++
13640 ++ return bfqq == RQ_BFQQ(rq);
13641 ++}
13642 ++
13643 ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13644 ++ struct bfq_queue *bfqq)
13645 ++{
13646 ++ if (bfqq != NULL) {
13647 ++ bfq_mark_bfqq_must_alloc(bfqq);
13648 ++ bfq_mark_bfqq_budget_new(bfqq);
13649 ++ bfq_clear_bfqq_fifo_expire(bfqq);
13650 ++
13651 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13652 ++
13653 ++ bfq_log_bfqq(bfqd, bfqq,
13654 ++ "set_in_service_queue, cur-budget = %lu",
13655 ++ bfqq->entity.budget);
13656 ++ }
13657 ++
13658 ++ bfqd->in_service_queue = bfqq;
13659 ++}
13660 ++
13661 ++/*
13662 ++ * Get and set a new queue for service.
13663 ++ */
13664 ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
13665 ++{
13666 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
13667 ++
13668 ++ __bfq_set_in_service_queue(bfqd, bfqq);
13669 ++ return bfqq;
13670 ++}
13671 ++
13672 + /*
13673 + * If enough samples have been computed, return the current max budget
13674 + * stored in bfqd, which is dynamically updated according to the
13675 +@@ -1077,63 +1345,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
13676 + return rq;
13677 + }
13678 +
13679 +-/*
13680 +- * Must be called with the queue_lock held.
13681 +- */
13682 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
13683 +-{
13684 +- int process_refs, io_refs;
13685 +-
13686 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13687 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13688 +- BUG_ON(process_refs < 0);
13689 +- return process_refs;
13690 +-}
13691 +-
13692 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13693 +-{
13694 +- int process_refs, new_process_refs;
13695 +- struct bfq_queue *__bfqq;
13696 +-
13697 +- /*
13698 +- * If there are no process references on the new_bfqq, then it is
13699 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13700 +- * may have dropped their last reference (not just their last process
13701 +- * reference).
13702 +- */
13703 +- if (!bfqq_process_refs(new_bfqq))
13704 +- return;
13705 +-
13706 +- /* Avoid a circular list and skip interim queue merges. */
13707 +- while ((__bfqq = new_bfqq->new_bfqq)) {
13708 +- if (__bfqq == bfqq)
13709 +- return;
13710 +- new_bfqq = __bfqq;
13711 +- }
13712 +-
13713 +- process_refs = bfqq_process_refs(bfqq);
13714 +- new_process_refs = bfqq_process_refs(new_bfqq);
13715 +- /*
13716 +- * If the process for the bfqq has gone away, there is no
13717 +- * sense in merging the queues.
13718 +- */
13719 +- if (process_refs == 0 || new_process_refs == 0)
13720 +- return;
13721 +-
13722 +- /*
13723 +- * Merge in the direction of the lesser amount of work.
13724 +- */
13725 +- if (new_process_refs >= process_refs) {
13726 +- bfqq->new_bfqq = new_bfqq;
13727 +- atomic_add(process_refs, &new_bfqq->ref);
13728 +- } else {
13729 +- new_bfqq->new_bfqq = bfqq;
13730 +- atomic_add(new_process_refs, &bfqq->ref);
13731 +- }
13732 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13733 +- new_bfqq->pid);
13734 +-}
13735 +-
13736 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
13737 + {
13738 + struct bfq_entity *entity = &bfqq->entity;
13739 +@@ -1703,7 +1914,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
13740 + */
13741 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13742 + {
13743 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
13744 ++ struct bfq_queue *bfqq;
13745 + struct request *next_rq;
13746 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
13747 +
13748 +@@ -1713,17 +1924,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13749 +
13750 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
13751 +
13752 +- /*
13753 +- * If another queue has a request waiting within our mean seek
13754 +- * distance, let it run. The expire code will check for close
13755 +- * cooperators and put the close queue at the front of the
13756 +- * service tree. If possible, merge the expiring queue with the
13757 +- * new bfqq.
13758 +- */
13759 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
13760 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
13761 +- bfq_setup_merge(bfqq, new_bfqq);
13762 +-
13763 + if (bfq_may_expire_for_budg_timeout(bfqq) &&
13764 + !timer_pending(&bfqd->idle_slice_timer) &&
13765 + !bfq_bfqq_must_idle(bfqq))
13766 +@@ -1760,36 +1960,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13767 + bfq_clear_bfqq_wait_request(bfqq);
13768 + del_timer(&bfqd->idle_slice_timer);
13769 + }
13770 +- if (new_bfqq == NULL)
13771 +- goto keep_queue;
13772 +- else
13773 +- goto expire;
13774 ++ goto keep_queue;
13775 + }
13776 + }
13777 +
13778 + /*
13779 +- * No requests pending. If the in-service queue has no cooperator and
13780 +- * still has requests in flight (possibly waiting for a completion)
13781 +- * or is idling for a new request, then keep it.
13782 ++ * No requests pending. If the in-service queue still has requests in
13783 ++ * flight (possibly waiting for a completion) or is idling for a new
13784 ++ * request, then keep it.
13785 + */
13786 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
13787 +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
13788 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
13789 ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
13790 + bfqq = NULL;
13791 + goto keep_queue;
13792 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
13793 +- /*
13794 +- * Expiring the queue because there is a close cooperator,
13795 +- * cancel timer.
13796 +- */
13797 +- bfq_clear_bfqq_wait_request(bfqq);
13798 +- del_timer(&bfqd->idle_slice_timer);
13799 + }
13800 +
13801 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
13802 + expire:
13803 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
13804 + new_queue:
13805 +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
13806 ++ bfqq = bfq_set_in_service_queue(bfqd);
13807 + bfq_log(bfqd, "select_queue: new queue %d returned",
13808 + bfqq != NULL ? bfqq->pid : 0);
13809 + keep_queue:
13810 +@@ -1799,9 +1989,8 @@ keep_queue:
13811 + static void bfq_update_raising_data(struct bfq_data *bfqd,
13812 + struct bfq_queue *bfqq)
13813 + {
13814 ++ struct bfq_entity *entity = &bfqq->entity;
13815 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
13816 +- struct bfq_entity *entity = &bfqq->entity;
13817 +-
13818 + bfq_log_bfqq(bfqd, bfqq,
13819 + "raising period dur %u/%u msec, "
13820 + "old raising coeff %u, w %d(%d)",
13821 +@@ -1818,7 +2007,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13822 + "WARN: pending prio change");
13823 + /*
13824 + * If too much time has elapsed from the beginning
13825 +- * of this weight-raising, stop it.
13826 ++ * of this weight-raising period, stop it.
13827 + */
13828 + if (jiffies - bfqq->last_rais_start_finish >
13829 + bfqq->raising_cur_max_time) {
13830 +@@ -1830,11 +2019,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13831 + jiffies_to_msecs(bfqq->
13832 + raising_cur_max_time));
13833 + bfq_bfqq_end_raising(bfqq);
13834 +- __bfq_entity_update_weight_prio(
13835 +- bfq_entity_service_tree(entity),
13836 +- entity);
13837 + }
13838 + }
13839 ++ /* Update weight both if it must be raised and if it must be lowered */
13840 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
13841 ++ __bfq_entity_update_weight_prio(
13842 ++ bfq_entity_service_tree(entity),
13843 ++ entity);
13844 + }
13845 +
13846 + /*
13847 +@@ -2075,6 +2266,25 @@ static void bfq_init_icq(struct io_cq *icq)
13848 + struct bfq_io_cq *bic = icq_to_bic(icq);
13849 +
13850 + bic->ttime.last_end_request = jiffies;
13851 ++ /*
13852 ++ * A newly created bic indicates that the process has just
13853 ++ * started doing I/O, and is probably mapping into memory its
13854 ++ * executable and libraries: it definitely needs weight raising.
13855 ++ * There is however the possibility that the process performs,
13856 ++ * for a while, I/O close to some other process. EQM intercepts
13857 ++ * this behavior and may merge the queue corresponding to the
13858 ++ * process with some other queue, BEFORE the weight of the queue
13859 ++ * is raised. Merged queues are not weight-raised (they are assumed
13860 ++ * to belong to processes that benefit only from high throughput).
13861 ++ * If the merge is basically the consequence of an accident, then
13862 ++ * the queue will be split soon and will get back its old weight.
13863 ++ * It is then important to write down somewhere that this queue
13864 ++ * does need weight raising, even if it did not make it to get its
13865 ++ * weight raised before being merged. To this purpose, we overload
13866 ++ * the field raising_time_left and assign 1 to it, to mark the queue
13867 ++ * as needing weight raising.
13868 ++ */
13869 ++ bic->raising_time_left = 1;
13870 + }
13871 +
13872 + static void bfq_exit_icq(struct io_cq *icq)
13873 +@@ -2088,6 +2298,13 @@ static void bfq_exit_icq(struct io_cq *icq)
13874 + }
13875 +
13876 + if (bic->bfqq[BLK_RW_SYNC]) {
13877 ++ /*
13878 ++ * If the bic is using a shared queue, put the reference
13879 ++ * taken on the io_context when the bic started using a
13880 ++ * shared bfq_queue.
13881 ++ */
13882 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
13883 ++ put_io_context(icq->ioc);
13884 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
13885 + bic->bfqq[BLK_RW_SYNC] = NULL;
13886 + }
13887 +@@ -2375,6 +2592,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
13888 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
13889 + return;
13890 +
13891 ++ /* Idle window just restored, statistics are meaningless. */
13892 ++ if (bfq_bfqq_just_split(bfqq))
13893 ++ return;
13894 ++
13895 + enable_idle = bfq_bfqq_idle_window(bfqq);
13896 +
13897 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
13898 +@@ -2415,6 +2636,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13899 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
13900 + !BFQQ_SEEKY(bfqq))
13901 + bfq_update_idle_window(bfqd, bfqq, bic);
13902 ++ bfq_clear_bfqq_just_split(bfqq);
13903 +
13904 + bfq_log_bfqq(bfqd, bfqq,
13905 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
13906 +@@ -2475,13 +2697,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13907 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
13908 + {
13909 + struct bfq_data *bfqd = q->elevator->elevator_data;
13910 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
13911 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
13912 +
13913 + assert_spin_locked(bfqd->queue->queue_lock);
13914 ++
13915 ++ /*
13916 ++ * An unplug may trigger a requeue of a request from the device
13917 ++ * driver: make sure we are in process context while trying to
13918 ++ * merge two bfq_queues.
13919 ++ */
13920 ++ if (!in_interrupt()) {
13921 ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
13922 ++ if (new_bfqq != NULL) {
13923 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
13924 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
13925 ++ /*
13926 ++ * Release the request's reference to the old bfqq
13927 ++ * and make sure one is taken to the shared queue.
13928 ++ */
13929 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
13930 ++ bfqq->allocated[rq_data_dir(rq)]--;
13931 ++ atomic_inc(&new_bfqq->ref);
13932 ++ bfq_put_queue(bfqq);
13933 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
13934 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
13935 ++ bfqq, new_bfqq);
13936 ++ rq->elv.priv[1] = new_bfqq;
13937 ++ bfqq = new_bfqq;
13938 ++ }
13939 ++ }
13940 ++
13941 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
13942 +
13943 + bfq_add_rq_rb(rq);
13944 +
13945 ++ /*
13946 ++ * Here a newly-created bfq_queue has already started a weight-raising
13947 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
13948 ++ * from assigning it a full weight-raising period. See the detailed
13949 ++ * comments about this field in bfq_init_icq().
13950 ++ */
13951 ++ if (bfqq->bic != NULL)
13952 ++ bfqq->bic->raising_time_left = 0;
13953 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
13954 + list_add_tail(&rq->queuelist, &bfqq->fifo);
13955 +
13956 +@@ -2629,18 +2886,6 @@ static void bfq_put_request(struct request *rq)
13957 + }
13958 + }
13959 +
13960 +-static struct bfq_queue *
13961 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13962 +- struct bfq_queue *bfqq)
13963 +-{
13964 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13965 +- (long unsigned)bfqq->new_bfqq->pid);
13966 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
13967 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
13968 +- bfq_put_queue(bfqq);
13969 +- return bic_to_bfqq(bic, 1);
13970 +-}
13971 +-
13972 + /*
13973 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
13974 + * was the last process referring to said bfqq.
13975 +@@ -2649,6 +2894,9 @@ static struct bfq_queue *
13976 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
13977 + {
13978 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
13979 ++
13980 ++ put_io_context(bic->icq.ioc);
13981 ++
13982 + if (bfqq_process_refs(bfqq) == 1) {
13983 + bfqq->pid = current->pid;
13984 + bfq_clear_bfqq_coop(bfqq);
13985 +@@ -2677,6 +2925,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
13986 + struct bfq_queue *bfqq;
13987 + struct bfq_group *bfqg;
13988 + unsigned long flags;
13989 ++ bool split = false;
13990 +
13991 + might_sleep_if(gfp_mask & __GFP_WAIT);
13992 +
13993 +@@ -2695,24 +2944,14 @@ new_queue:
13994 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
13995 + bic_set_bfqq(bic, bfqq, is_sync);
13996 + } else {
13997 +- /*
13998 +- * If the queue was seeky for too long, break it apart.
13999 +- */
14000 ++ /* If the queue was seeky for too long, break it apart. */
14001 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
14002 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
14003 + bfqq = bfq_split_bfqq(bic, bfqq);
14004 ++ split = true;
14005 + if (!bfqq)
14006 + goto new_queue;
14007 + }
14008 +-
14009 +- /*
14010 +- * Check to see if this queue is scheduled to merge with
14011 +- * another closely cooperating queue. The merging of queues
14012 +- * happens here as it must be done in process context.
14013 +- * The reference on new_bfqq was taken in merge_bfqqs.
14014 +- */
14015 +- if (bfqq->new_bfqq != NULL)
14016 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
14017 + }
14018 +
14019 + bfqq->allocated[rw]++;
14020 +@@ -2723,6 +2962,26 @@ new_queue:
14021 + rq->elv.priv[0] = bic;
14022 + rq->elv.priv[1] = bfqq;
14023 +
14024 ++ /*
14025 ++ * If a bfq_queue has only one process reference, it is owned
14026 ++ * by only one bfq_io_cq: we can set the bic field of the
14027 ++ * bfq_queue to the address of that structure. Also, if the
14028 ++ * queue has just been split, mark a flag so that the
14029 ++ * information is available to the other scheduler hooks.
14030 ++ */
14031 ++ if (bfqq_process_refs(bfqq) == 1) {
14032 ++ bfqq->bic = bic;
14033 ++ if (split) {
14034 ++ bfq_mark_bfqq_just_split(bfqq);
14035 ++ /*
14036 ++ * If the queue has just been split from a shared queue,
14037 ++ * restore the idle window and the possible weight
14038 ++ * raising period.
14039 ++ */
14040 ++ bfq_bfqq_resume_state(bfqq, bic);
14041 ++ }
14042 ++ }
14043 ++
14044 + spin_unlock_irqrestore(q->queue_lock, flags);
14045 +
14046 + return 0;
14047 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
14048 +index 30df81c..47e66a8 100644
14049 +--- a/block/bfq-sched.c
14050 ++++ b/block/bfq-sched.c
14051 +@@ -979,34 +979,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
14052 + return bfqq;
14053 + }
14054 +
14055 +-/*
14056 +- * Forced extraction of the given queue.
14057 +- */
14058 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
14059 +- struct bfq_queue *bfqq)
14060 +-{
14061 +- struct bfq_entity *entity;
14062 +- struct bfq_sched_data *sd;
14063 +-
14064 +- BUG_ON(bfqd->in_service_queue != NULL);
14065 +-
14066 +- entity = &bfqq->entity;
14067 +- /*
14068 +- * Bubble up extraction/update from the leaf to the root.
14069 +- */
14070 +- for_each_entity(entity) {
14071 +- sd = entity->sched_data;
14072 +- bfq_update_budget(entity);
14073 +- bfq_update_vtime(bfq_entity_service_tree(entity));
14074 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
14075 +- sd->active_entity = entity;
14076 +- sd->next_active = NULL;
14077 +- entity->service = 0;
14078 +- }
14079 +-
14080 +- return;
14081 +-}
14082 +-
14083 + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
14084 + {
14085 + if (bfqd->in_service_bic != NULL) {
14086 +diff --git a/block/bfq.h b/block/bfq.h
14087 +index 68b28e3..438f560 100644
14088 +--- a/block/bfq.h
14089 ++++ b/block/bfq.h
14090 +@@ -192,6 +192,8 @@ struct bfq_group;
14091 + * idle to backlogged
14092 + * @service_from_backlogged: cumulative service received from the @bfq_queue
14093 + * since the last transition from idle to backlogged
14094 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
14095 ++ * queue is shared
14096 + *
14097 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
14098 + * or more (if it is an async one). @cgroup holds a reference to the
14099 +@@ -235,6 +237,7 @@ struct bfq_queue {
14100 + sector_t last_request_pos;
14101 +
14102 + pid_t pid;
14103 ++ struct bfq_io_cq *bic;
14104 +
14105 + /* weight-raising fields */
14106 + unsigned int raising_cur_max_time;
14107 +@@ -264,12 +267,23 @@ struct bfq_ttime {
14108 + * @icq: associated io_cq structure
14109 + * @bfqq: array of two process queues, the sync and the async
14110 + * @ttime: associated @bfq_ttime struct
14111 ++ * @raising_time_left: snapshot of the time left before weight raising ends
14112 ++ * for the sync queue associated to this process; this
14113 ++ * snapshot is taken to remember this value while the weight
14114 ++ * raising is suspended because the queue is merged with a
14115 ++ * shared queue, and is used to set @raising_cur_max_time
14116 ++ * when the queue is split from the shared queue and its
14117 ++ * weight is raised again
14118 ++ * @saved_idle_window: same purpose as the previous field for the idle window
14119 + */
14120 + struct bfq_io_cq {
14121 + struct io_cq icq; /* must be the first member */
14122 + struct bfq_queue *bfqq[2];
14123 + struct bfq_ttime ttime;
14124 + int ioprio;
14125 ++
14126 ++ unsigned int raising_time_left;
14127 ++ unsigned int saved_idle_window;
14128 + };
14129 +
14130 + /**
14131 +@@ -411,6 +425,7 @@ enum bfqq_state_flags {
14132 + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
14133 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
14134 + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
14135 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
14136 + BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
14137 + };
14138 +
14139 +@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
14140 + BFQ_BFQQ_FNS(budget_new);
14141 + BFQ_BFQQ_FNS(coop);
14142 + BFQ_BFQQ_FNS(split_coop);
14143 ++BFQ_BFQQ_FNS(just_split);
14144 + BFQ_BFQQ_FNS(softrt_update);
14145 + #undef BFQ_BFQQ_FNS
14146 +
14147 +--
14148 +1.8.5.2
14149 +
14150
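Note on the last two hunks of the added patch: the new just_split state is wired up with only an enum entry plus a single BFQ_BFQQ_FNS(just_split) line, yet the set_request path above calls bfq_mark_bfqq_just_split(). The macro that makes this work is defined elsewhere in block/bfq.h and is not quoted in the hunks; the sketch below is a self-contained illustration that assumes the usual one-bit-per-state-flag accessor pattern, with a minimal stand-in struct and an illustrative bit position.

/*
 * Illustrative sketch, not the genpatches source: the real BFQ_BFQQ_FNS()
 * definition lives in block/bfq.h and is not quoted above.  The struct and
 * the bit position here are minimal stand-ins for illustration only.
 */
struct bfq_queue {			/* stand-in: the real struct is far larger */
	unsigned int flags;
};

enum bfqq_state_flags {
	BFQ_BFQQ_FLAG_just_split,	/* bit position illustrative only */
};

#define BFQ_BFQQ_FNS(name)						\
static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
{									\
	bfqq->flags |= 1U << BFQ_BFQQ_FLAG_##name;			\
}									\
static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)	\
{									\
	bfqq->flags &= ~(1U << BFQ_BFQQ_FLAG_##name);			\
}									\
static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
{									\
	return (bfqq->flags & (1U << BFQ_BFQQ_FLAG_##name)) != 0;	\
}

/* One invocation generates mark/clear/test helpers for the new flag. */
BFQ_BFQQ_FNS(just_split)

Under this assumed pattern, BFQ_BFQQ_FNS(just_split) expands to bfq_mark_bfqq_just_split(), bfq_clear_bfqq_just_split() and bfq_bfqq_just_split(), which is why the two added lines in bfq.h are all the hook code above needs.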
14151 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
14152 ===================================================================
14153 --- genpatches-2.6/trunk/3.14/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch 2014-01-29 14:41:45 UTC (rev 2660)
14154 +++ genpatches-2.6/trunk/3.14/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch 2014-01-30 16:49:47 UTC (rev 2661)
14155 @@ -1,362 +0,0 @@
14156 -From 2e1646d06515b7dd1344db547dfcf9a4640dee8e Mon Sep 17 00:00:00 2001
14157 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
14158 -Date: Wed, 11 Sep 2013 22:26:47 +0200
14159 -Subject: [PATCH] block: Switch from BFQ-v6r2 for 3.11.0 to BFQ-v6r2 for
14160 - 3.12.0-rc1
14161 -
14162 ----
14163 - block/bfq-cgroup.c | 115 +++++++++++++++++++++++++++++++----------------------
14164 - block/bfq.h | 2 +
14165 - 2 files changed, 70 insertions(+), 47 deletions(-)
14166 -
14167 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
14168 -index bb9b851..afae4ca 100644
14169 ---- a/block/bfq-cgroup.c
14170 -+++ b/block/bfq-cgroup.c
14171 -@@ -16,9 +16,9 @@
14172 -
14173 - static DEFINE_MUTEX(bfqio_mutex);
14174 -
14175 --static bool bfqio_is_removed(struct cgroup *cgroup)
14176 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
14177 - {
14178 -- return test_bit(CGRP_DEAD, &cgroup->flags);
14179 -+ return bgrp ? !bgrp->online : false;
14180 - }
14181 -
14182 - static struct bfqio_cgroup bfqio_root_cgroup = {
14183 -@@ -38,10 +38,9 @@ static inline void bfq_init_entity(struct bfq_entity *entity,
14184 - entity->sched_data = &bfqg->sched_data;
14185 - }
14186 -
14187 --static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
14188 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
14189 - {
14190 -- return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
14191 -- struct bfqio_cgroup, css);
14192 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
14193 - }
14194 -
14195 - /*
14196 -@@ -103,20 +102,20 @@ static inline void bfq_group_set_parent(struct bfq_group *bfqg,
14197 - /**
14198 - * bfq_group_chain_alloc - allocate a chain of groups.
14199 - * @bfqd: queue descriptor.
14200 -- * @cgroup: the leaf cgroup this chain starts from.
14201 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
14202 - *
14203 - * Allocate a chain of groups starting from the one belonging to
14204 - * @cgroup up to the root cgroup. Stop if a cgroup on the chain
14205 - * to the root has already an allocated group on @bfqd.
14206 - */
14207 - static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
14208 -- struct cgroup *cgroup)
14209 -+ struct cgroup_subsys_state *css)
14210 - {
14211 - struct bfqio_cgroup *bgrp;
14212 - struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
14213 -
14214 -- for (; cgroup != NULL; cgroup = cgroup->parent) {
14215 -- bgrp = cgroup_to_bfqio(cgroup);
14216 -+ for (; css != NULL; css = css->parent) {
14217 -+ bgrp = css_to_bfqio(css);
14218 -
14219 - bfqg = bfqio_lookup_group(bgrp, bfqd);
14220 - if (bfqg != NULL) {
14221 -@@ -165,7 +164,7 @@ cleanup:
14222 - /**
14223 - * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
14224 - * @bfqd: the queue descriptor.
14225 -- * @cgroup: the leaf cgroup to start from.
14226 -+ * @css: the leaf cgroup_subsys_state to start from.
14227 - * @leaf: the leaf group (to be associated to @cgroup).
14228 - *
14229 - * Try to link a chain of groups to a cgroup hierarchy, connecting the
14230 -@@ -177,7 +176,8 @@ cleanup:
14231 - * per device) while the bfqio_cgroup lock protects the list of groups
14232 - * belonging to the same cgroup.
14233 - */
14234 --static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14235 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
14236 -+ struct cgroup_subsys_state *css,
14237 - struct bfq_group *leaf)
14238 - {
14239 - struct bfqio_cgroup *bgrp;
14240 -@@ -186,8 +186,8 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14241 -
14242 - assert_spin_locked(bfqd->queue->queue_lock);
14243 -
14244 -- for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
14245 -- bgrp = cgroup_to_bfqio(cgroup);
14246 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
14247 -+ bgrp = css_to_bfqio(css);
14248 - next = leaf->bfqd;
14249 -
14250 - bfqg = bfqio_lookup_group(bgrp, bfqd);
14251 -@@ -205,9 +205,9 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14252 - leaf = next;
14253 - }
14254 -
14255 -- BUG_ON(cgroup == NULL && leaf != NULL);
14256 -- if (cgroup != NULL && prev != NULL) {
14257 -- bgrp = cgroup_to_bfqio(cgroup);
14258 -+ BUG_ON(css == NULL && leaf != NULL);
14259 -+ if (css != NULL && prev != NULL) {
14260 -+ bgrp = css_to_bfqio(css);
14261 - bfqg = bfqio_lookup_group(bgrp, bfqd);
14262 - bfq_group_set_parent(prev, bfqg);
14263 - }
14264 -@@ -233,18 +233,18 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14265 - * have been successful.
14266 - */
14267 - static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
14268 -- struct cgroup *cgroup)
14269 -+ struct cgroup_subsys_state *css)
14270 - {
14271 -- struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
14272 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14273 - struct bfq_group *bfqg;
14274 -
14275 - bfqg = bfqio_lookup_group(bgrp, bfqd);
14276 - if (bfqg != NULL)
14277 - return bfqg;
14278 -
14279 -- bfqg = bfq_group_chain_alloc(bfqd, cgroup);
14280 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
14281 - if (bfqg != NULL)
14282 -- bfq_group_chain_link(bfqd, cgroup, bfqg);
14283 -+ bfq_group_chain_link(bfqd, css, bfqg);
14284 - else
14285 - bfqg = bfqd->root_group;
14286 -
14287 -@@ -315,8 +315,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14288 - * time here, at the price of slightly more complex code.
14289 - */
14290 - static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14291 -- struct bfq_io_cq *bic,
14292 -- struct cgroup *cgroup)
14293 -+ struct bfq_io_cq *bic,
14294 -+ struct cgroup_subsys_state *css)
14295 - {
14296 - struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
14297 - struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
14298 -@@ -324,9 +324,9 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14299 - struct bfq_group *bfqg;
14300 - struct bfqio_cgroup *bgrp;
14301 -
14302 -- bgrp = cgroup_to_bfqio(cgroup);
14303 -+ bgrp = css_to_bfqio(css);
14304 -
14305 -- bfqg = bfq_find_alloc_group(bfqd, cgroup);
14306 -+ bfqg = bfq_find_alloc_group(bfqd, css);
14307 - if (async_bfqq != NULL) {
14308 - entity = &async_bfqq->entity;
14309 -
14310 -@@ -357,14 +357,14 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14311 - * moved into its new parent group.
14312 - */
14313 - static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
14314 -- struct cgroup *cgroup)
14315 -+ struct cgroup_subsys_state *css)
14316 - {
14317 - struct bfq_data *bfqd;
14318 - unsigned long uninitialized_var(flags);
14319 -
14320 - bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
14321 - if (bfqd != NULL) {
14322 -- __bfq_bic_change_cgroup(bfqd, bic, cgroup);
14323 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
14324 - bfq_put_bfqd_unlock(bfqd, &flags);
14325 - }
14326 - }
14327 -@@ -394,13 +394,13 @@ static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
14328 - {
14329 - struct bfq_data *bfqd = bic_to_bfqd(bic);
14330 - struct bfq_group *bfqg;
14331 -- struct cgroup *cgroup;
14332 -+ struct cgroup_subsys_state *css;
14333 -
14334 - BUG_ON(bfqd == NULL);
14335 -
14336 - rcu_read_lock();
14337 -- cgroup = task_cgroup(current, bfqio_subsys_id);
14338 -- bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
14339 -+ css = task_css(current, bfqio_subsys_id);
14340 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
14341 - rcu_read_unlock();
14342 -
14343 - return bfqg;
14344 -@@ -622,17 +622,16 @@ static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
14345 - }
14346 -
14347 - #define SHOW_FUNCTION(__VAR) \
14348 --static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
14349 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
14350 - struct cftype *cftype) \
14351 - { \
14352 -- struct bfqio_cgroup *bgrp; \
14353 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
14354 - u64 ret = -ENODEV; \
14355 - \
14356 - mutex_lock(&bfqio_mutex); \
14357 -- if (bfqio_is_removed(cgroup)) \
14358 -+ if (bfqio_is_removed(bgrp)) \
14359 - goto out_unlock; \
14360 - \
14361 -- bgrp = cgroup_to_bfqio(cgroup); \
14362 - spin_lock_irq(&bgrp->lock); \
14363 - ret = bgrp->__VAR; \
14364 - spin_unlock_irq(&bgrp->lock); \
14365 -@@ -648,11 +647,11 @@ SHOW_FUNCTION(ioprio_class);
14366 - #undef SHOW_FUNCTION
14367 -
14368 - #define STORE_FUNCTION(__VAR, __MIN, __MAX) \
14369 --static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
14370 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
14371 - struct cftype *cftype, \
14372 - u64 val) \
14373 - { \
14374 -- struct bfqio_cgroup *bgrp; \
14375 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
14376 - struct bfq_group *bfqg; \
14377 - int ret = -EINVAL; \
14378 - \
14379 -@@ -661,12 +660,10 @@ static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
14380 - \
14381 - ret = -ENODEV; \
14382 - mutex_lock(&bfqio_mutex); \
14383 -- if (bfqio_is_removed(cgroup)) \
14384 -+ if (bfqio_is_removed(bgrp)) \
14385 - goto out_unlock; \
14386 - ret = 0; \
14387 - \
14388 -- bgrp = cgroup_to_bfqio(cgroup); \
14389 -- \
14390 - spin_lock_irq(&bgrp->lock); \
14391 - bgrp->__VAR = (unsigned short)val; \
14392 - hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
14393 -@@ -713,11 +710,11 @@ static struct cftype bfqio_files[] = {
14394 - { }, /* terminate */
14395 - };
14396 -
14397 --static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
14398 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state *parent_css)
14399 - {
14400 - struct bfqio_cgroup *bgrp;
14401 -
14402 -- if (cgroup->parent != NULL) {
14403 -+ if (parent_css != NULL) {
14404 - bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
14405 - if (bgrp == NULL)
14406 - return ERR_PTR(-ENOMEM);
14407 -@@ -740,13 +737,14 @@ static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
14408 - * behavior is that a group containing a task that forked using CLONE_IO
14409 - * will not be destroyed until the tasks sharing the ioc die.
14410 - */
14411 --static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14412 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
14413 -+ struct cgroup_taskset *tset)
14414 - {
14415 - struct task_struct *task;
14416 - struct io_context *ioc;
14417 - int ret = 0;
14418 -
14419 -- cgroup_taskset_for_each(task, cgroup, tset) {
14420 -+ cgroup_taskset_for_each(task, css, tset) {
14421 - /* task_lock() is needed to avoid races with exit_io_context() */
14422 - task_lock(task);
14423 - ioc = task->io_context;
14424 -@@ -766,7 +764,8 @@ static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14425 - return ret;
14426 - }
14427 -
14428 --static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14429 -+static void bfqio_attach(struct cgroup_subsys_state *css,
14430 -+ struct cgroup_taskset *tset)
14431 - {
14432 - struct task_struct *task;
14433 - struct io_context *ioc;
14434 -@@ -776,7 +775,7 @@ static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14435 - * IMPORTANT NOTE: The move of more than one process at a time to a
14436 - * new group has not yet been tested.
14437 - */
14438 -- cgroup_taskset_for_each(task, cgroup, tset) {
14439 -+ cgroup_taskset_for_each(task, css, tset) {
14440 - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
14441 - if (ioc) {
14442 - /*
14443 -@@ -787,16 +786,16 @@ static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14444 - if (!strncmp(icq->q->elevator->type->elevator_name,
14445 - "bfq", ELV_NAME_MAX))
14446 - bfq_bic_change_cgroup(icq_to_bic(icq),
14447 -- cgroup);
14448 -+ css);
14449 - rcu_read_unlock();
14450 - put_io_context(ioc);
14451 - }
14452 - }
14453 - }
14454 -
14455 --static void bfqio_destroy(struct cgroup *cgroup)
14456 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
14457 - {
14458 -- struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
14459 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14460 - struct hlist_node *tmp;
14461 - struct bfq_group *bfqg;
14462 -
14463 -@@ -815,9 +814,31 @@ static void bfqio_destroy(struct cgroup *cgroup)
14464 - kfree(bgrp);
14465 - }
14466 -
14467 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
14468 -+{
14469 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14470 -+
14471 -+ mutex_lock(&bfqio_mutex);
14472 -+ bgrp->online = true;
14473 -+ mutex_unlock(&bfqio_mutex);
14474 -+
14475 -+ return 0;
14476 -+}
14477 -+
14478 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
14479 -+{
14480 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14481 -+
14482 -+ mutex_lock(&bfqio_mutex);
14483 -+ bgrp->online = false;
14484 -+ mutex_unlock(&bfqio_mutex);
14485 -+}
14486 -+
14487 - struct cgroup_subsys bfqio_subsys = {
14488 - .name = "bfqio",
14489 - .css_alloc = bfqio_create,
14490 -+ .css_online = bfqio_css_online,
14491 -+ .css_offline = bfqio_css_offline,
14492 - .can_attach = bfqio_can_attach,
14493 - .attach = bfqio_attach,
14494 - .css_free = bfqio_destroy,
14495 -diff --git a/block/bfq.h b/block/bfq.h
14496 -index bb52975..885e62c 100644
14497 ---- a/block/bfq.h
14498 -+++ b/block/bfq.h
14499 -@@ -510,6 +510,7 @@ struct bfq_group {
14500 - /**
14501 - * struct bfqio_cgroup - bfq cgroup data structure.
14502 - * @css: subsystem state for bfq in the containing cgroup.
14503 -+ * @online: flag marked when the subsystem is inserted.
14504 - * @weight: cgroup weight.
14505 - * @ioprio: cgroup ioprio.
14506 - * @ioprio_class: cgroup ioprio_class.
14507 -@@ -521,6 +522,7 @@ struct bfq_group {
14508 - */
14509 - struct bfqio_cgroup {
14510 - struct cgroup_subsys_state css;
14511 -+ bool online;
14512 -
14513 - unsigned short weight, ioprio, ioprio_class;
14514 -
14515 ---
14516 -1.8.1.4
14517 -
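The cgroup attribute handlers touched by this removed compatibility patch are generated by the SHOW_FUNCTION()/STORE_FUNCTION() macros shown in the hunks above. To make the cgroup-to-css conversion concrete, here is roughly what SHOW_FUNCTION(weight) expands to after the change; the out_unlock tail is not visible in the quoted hunk, so its unlock-and-return shape is assumed.

/*
 * Approximate expansion of SHOW_FUNCTION(weight) inside block/bfq-cgroup.c
 * after the cgroup -> cgroup_subsys_state conversion.  The out_unlock tail
 * is not quoted in the hunk above and is assumed here.
 */
static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
				    struct cftype *cftype)
{
	/* css_to_bfqio() now maps the subsystem state directly ... */
	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
	u64 ret = -ENODEV;

	mutex_lock(&bfqio_mutex);
	/*
	 * ... and bfqio_is_removed() tests the bgrp->online flag kept up to
	 * date by the new css_online/css_offline callbacks, replacing the
	 * old CGRP_DEAD check on struct cgroup.
	 */
	if (bfqio_is_removed(bgrp))
		goto out_unlock;

	spin_lock_irq(&bgrp->lock);
	ret = bgrp->weight;
	spin_unlock_irq(&bgrp->lock);

out_unlock:
	mutex_unlock(&bfqio_mutex);
	return ret;
}

The design point of the removed patch is visible here: since 3.12 the handlers receive a cgroup_subsys_state rather than a struct cgroup, so liveness has to be tracked by the subsystem itself via the online flag instead of being read off the cgroup's own flags.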