Gentoo Archives: gentoo-commits

From: "Tom Wijsman (tomwij)" <tomwij@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2565 - genpatches-2.6/trunk/3.12
Date: Mon, 04 Nov 2013 10:09:40
Message-Id: 20131104100931.E34C52004B@flycatcher.gentoo.org
1 Author: tomwij
2 Date: 2013-11-04 10:09:31 +0000 (Mon, 04 Nov 2013)
3 New Revision: 2565
4
5 Added:
6 genpatches-2.6/trunk/3.12/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
7 genpatches-2.6/trunk/3.12/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
8 genpatches-2.6/trunk/3.12/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
9 genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
10 Removed:
11 genpatches-2.6/trunk/3.12/5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
12 genpatches-2.6/trunk/3.12/5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
13 genpatches-2.6/trunk/3.12/5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
14 Modified:
15 genpatches-2.6/trunk/3.12/0000_README
16 genpatches-2.6/trunk/3.12/2700_ThinkPad-30-brightness-control-fix.patch
17 genpatches-2.6/trunk/3.12/4500_support-for-pogoplug-e02.patch
18 genpatches-2.6/trunk/3.12/4567_distro-Gentoo-Kconfig.patch
19 Log:
20 Updated patches to work with 3.12, updated Kconfig to remove HOTPLUG for systemd.
21
22 Modified: genpatches-2.6/trunk/3.12/0000_README
23 ===================================================================
24 --- genpatches-2.6/trunk/3.12/0000_README 2013-11-04 00:52:35 UTC (rev 2564)
25 +++ genpatches-2.6/trunk/3.12/0000_README 2013-11-04 10:09:31 UTC (rev 2565)
26 @@ -82,14 +82,18 @@
27 From: Tom Wijsman <TomWij@g.o>
28 Desc: Add Gentoo Linux support config settings and defaults.
29
30 -Patch: 5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
31 +Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
32 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
33 Desc: BFQ v6r2 patch 1 for 3.11: Build, cgroups and kconfig bits
34
35 -Patch: 5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
36 +Patch: 5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
37 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
38 Desc: BFQ v6r2 patch 2 for 3.10: BFQ Scheduler
39
40 -Patch: 5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
41 +Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
42 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
43 Desc: BFQ v6r2 patch 3 for 3.10: Early Queue Merge (EQM)
44 +
45 +Patch: 5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
46 +From: http://algo.ing.unimo.it/people/paolo/disk_sched/
47 +Desc: BFQ v6r2 for 3.11.0 to BFQ v6r2 for 3.12.0.
48
49 Modified: genpatches-2.6/trunk/3.12/2700_ThinkPad-30-brightness-control-fix.patch
50 ===================================================================
51 --- genpatches-2.6/trunk/3.12/2700_ThinkPad-30-brightness-control-fix.patch 2013-11-04 00:52:35 UTC (rev 2564)
52 +++ genpatches-2.6/trunk/3.12/2700_ThinkPad-30-brightness-control-fix.patch 2013-11-04 10:09:31 UTC (rev 2565)
53 @@ -2,20 +2,6 @@
54 index cb96296..6c242ed 100644
55 --- a/drivers/acpi/blacklist.c
56 +++ b/drivers/acpi/blacklist.c
57 -@@ -193,6 +193,13 @@ static int __init dmi_disable_osi_win7(const struct dmi_system_id *d)
58 - return 0;
59 - }
60 -
61 -+static int __init dmi_disable_osi_win8(const struct dmi_system_id *d)
62 -+{
63 -+ printk(KERN_NOTICE PREFIX "DMI detected: %s\n", d->ident);
64 -+ acpi_osi_setup("!Windows 2012");
65 -+ return 0;
66 -+}
67 -+
68 - static struct dmi_system_id acpi_osi_dmi_table[] __initdata = {
69 - {
70 - .callback = dmi_disable_osi_vista,
71 @@ -269,6 +276,61 @@ static struct dmi_system_id acpi_osi_dmi_table[] __initdata = {
72 },
73
74
75 Modified: genpatches-2.6/trunk/3.12/4500_support-for-pogoplug-e02.patch
76 ===================================================================
77 --- genpatches-2.6/trunk/3.12/4500_support-for-pogoplug-e02.patch 2013-11-04 00:52:35 UTC (rev 2564)
78 +++ genpatches-2.6/trunk/3.12/4500_support-for-pogoplug-e02.patch 2013-11-04 10:09:31 UTC (rev 2565)
79 @@ -3,9 +3,9 @@
80 --- a/arch/arm/configs/kirkwood_defconfig
81 +++ b/arch/arm/configs/kirkwood_defconfig
82 @@ -20,6 +20,7 @@ CONFIG_MACH_NET2BIG_V2=y
83 + CONFIG_MACH_D2NET_V2=y
84 + CONFIG_MACH_NET2BIG_V2=y
85 CONFIG_MACH_NET5BIG_V2=y
86 - CONFIG_MACH_NETSPACE_MAX_V2=y
87 - CONFIG_MACH_NETSPACE_V2=y
88 +CONFIG_MACH_POGO_E02=n
89 CONFIG_MACH_OPENRD_BASE=y
90 CONFIG_MACH_OPENRD_CLIENT=y
91 @@ -35,13 +35,13 @@
92 --- a/arch/arm/mach-kirkwood/Makefile
93 +++ b/arch/arm/mach-kirkwood/Makefile
94 @@ -2,6 +2,7 @@ obj-y += common.o irq.o pcie.o mpp.o
95 -
96 obj-$(CONFIG_MACH_D2NET_V2) += d2net_v2-setup.o lacie_v2-common.o
97 - obj-$(CONFIG_MACH_DOCKSTAR) += dockstar-setup.o
98 + obj-$(CONFIG_MACH_NET2BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o
99 + obj-$(CONFIG_MACH_NET5BIG_V2) += netxbig_v2-setup.o lacie_v2-common.o
100 +obj-$(CONFIG_MACH_POGO_E02) += pogo_e02-setup.o
101 - obj-$(CONFIG_MACH_ESATA_SHEEVAPLUG) += sheevaplug-setup.o
102 - obj-$(CONFIG_MACH_GURUPLUG) += guruplug-setup.o
103 - obj-$(CONFIG_MACH_INETSPACE_V2) += netspace_v2-setup.o lacie_v2-common.o
104 + obj-$(CONFIG_MACH_OPENRD) += openrd-setup.o
105 + obj-$(CONFIG_MACH_RD88F6192_NAS) += rd88f6192-nas-setup.o
106 + obj-$(CONFIG_MACH_RD88F6281) += rd88f6281-setup.o
107 diff --git a/arch/arm/mach-kirkwood/pogo_e02-setup.c b/arch/arm/mach-kirkwood/pogo_e02-setup.c
108 new file mode 100644
109 index 0000000..f57e8f7
110
111 Modified: genpatches-2.6/trunk/3.12/4567_distro-Gentoo-Kconfig.patch
112 ===================================================================
113 --- genpatches-2.6/trunk/3.12/4567_distro-Gentoo-Kconfig.patch 2013-11-04 00:52:35 UTC (rev 2564)
114 +++ genpatches-2.6/trunk/3.12/4567_distro-Gentoo-Kconfig.patch 2013-11-04 10:09:31 UTC (rev 2565)
115 @@ -9,7 +9,7 @@
116 source "arch/$SRCARCH/Kconfig"
117 --- /dev/null
118 +++ b/distro/Kconfig
119 -@@ -0,0 +1,109 @@
120 +@@ -0,0 +1,107 @@
121 +menu "Gentoo Linux"
122 +
123 +config GENTOO_LINUX
124 @@ -35,7 +35,6 @@
125 + select TMPFS
126 +
127 + select MMU
128 -+ select HOTPLUG
129 + select SHMEM
130 +
131 + help
132 @@ -91,7 +90,6 @@
133 + select EPOLL
134 + select FANOTIFY
135 + select FHANDLE
136 -+ select HOTPLUG
137 + select INOTIFY_USER
138 + select NET
139 + select PROC_FS
140
141 Deleted: genpatches-2.6/trunk/3.12/5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
142 ===================================================================
143 --- genpatches-2.6/trunk/3.12/5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2013-11-04 00:52:35 UTC (rev 2564)
144 +++ genpatches-2.6/trunk/3.12/5000-BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2013-11-04 10:09:31 UTC (rev 2565)
145 @@ -1,97 +0,0 @@
146 -From 3728677b4d3cd39d83be87f9939328201b871c48 Mon Sep 17 00:00:00 2001
147 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
148 -Date: Tue, 3 Sep 2013 16:50:42 +0200
149 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.11
150 -
151 -Update Kconfig.iosched and do the related Makefile changes to include
152 -kernel configuration options for BFQ. Also add the bfqio controller
153 -to the cgroups subsystem.
154 -
155 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
156 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
157 ----
158 - block/Kconfig.iosched | 25 +++++++++++++++++++++++++
159 - block/Makefile | 1 +
160 - include/linux/cgroup_subsys.h | 4 ++++
161 - 3 files changed, 30 insertions(+)
162 -
163 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
164 -index 421bef9..695e064 100644
165 ---- a/block/Kconfig.iosched
166 -+++ b/block/Kconfig.iosched
167 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
168 - ---help---
169 - Enable group IO scheduling in CFQ.
170 -
171 -+config IOSCHED_BFQ
172 -+ tristate "BFQ I/O scheduler"
173 -+ default n
174 -+ ---help---
175 -+ The BFQ I/O scheduler tries to distribute bandwidth among
176 -+ all processes according to their weights.
177 -+ It aims at distributing the bandwidth as desired, independently of
178 -+ the disk parameters and with any workload. It also tries to
179 -+ guarantee low latency to interactive and soft real-time
180 -+ applications. If compiled built-in (saying Y here), BFQ can
181 -+ be configured to support hierarchical scheduling.
182 -+
183 -+config CGROUP_BFQIO
184 -+ bool "BFQ hierarchical scheduling support"
185 -+ depends on CGROUPS && IOSCHED_BFQ=y
186 -+ default n
187 -+ ---help---
188 -+ Enable hierarchical scheduling in BFQ, using the cgroups
189 -+ filesystem interface. The name of the subsystem will be
190 -+ bfqio.
191 -+
192 - choice
193 - prompt "Default I/O scheduler"
194 - default DEFAULT_CFQ
195 -@@ -52,6 +73,9 @@ choice
196 - config DEFAULT_CFQ
197 - bool "CFQ" if IOSCHED_CFQ=y
198 -
199 -+ config DEFAULT_BFQ
200 -+ bool "BFQ" if IOSCHED_BFQ=y
201 -+
202 - config DEFAULT_NOOP
203 - bool "No-op"
204 -
205 -@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED
206 - string
207 - default "deadline" if DEFAULT_DEADLINE
208 - default "cfq" if DEFAULT_CFQ
209 -+ default "bfq" if DEFAULT_BFQ
210 - default "noop" if DEFAULT_NOOP
211 -
212 - endmenu
213 -diff --git a/block/Makefile b/block/Makefile
214 -index 39b76ba..c0d20fa 100644
215 ---- a/block/Makefile
216 -+++ b/block/Makefile
217 -@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
218 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
219 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
220 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
221 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
222 -
223 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
224 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
225 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
226 -index b613ffd..43c5dc9 100644
227 ---- a/include/linux/cgroup_subsys.h
228 -+++ b/include/linux/cgroup_subsys.h
229 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
230 - SUBSYS(blkio)
231 - #endif
232 -
233 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
234 -+SUBSYS(bfqio)
235 -+#endif
236 -+
237 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
238 - SUBSYS(perf)
239 - #endif
240 ---
241 -1.8.1.4
242 -
243
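For illustration only (not part of this commit): a minimal user-space sketch of how the bfqio controller added by the patch above is driven once CONFIG_CGROUP_BFQIO is enabled. Each group in the bfqio cgroup hierarchy exposes weight, ioprio and ioprio_class attributes, defined by the cftype table in the scheduler patch that follows; with the usual cgroup naming they appear as bfqio.weight and so on. The mount point, group name and weight value below are assumptions, not something this commit establishes.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Write a weight into <cgroup_dir>/bfqio.weight; returns 0 or a -errno value. */
static int bfqio_set_weight(const char *cgroup_dir, unsigned int weight)
{
        char path[512];
        FILE *f;

        snprintf(path, sizeof(path), "%s/bfqio.weight", cgroup_dir);

        f = fopen(path, "w");
        if (f == NULL)
                return -errno;

        /* The kernel side rejects values outside BFQ_MIN_WEIGHT..BFQ_MAX_WEIGHT. */
        if (fprintf(f, "%u\n", weight) < 0) {
                fclose(f);
                return -EIO;
        }

        return fclose(f) == 0 ? 0 : -errno;
}

int main(void)
{
        /* Hypothetical mount point, group and weight; adjust to the real bfqio hierarchy. */
        int ret = bfqio_set_weight("/sys/fs/cgroup/bfqio/example", 500);

        if (ret)
                fprintf(stderr, "could not set bfqio.weight: %s\n", strerror(-ret));
        return ret ? 1 : 0;
}
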
244 Deleted: genpatches-2.6/trunk/3.12/5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
245 ===================================================================
246 --- genpatches-2.6/trunk/3.12/5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2013-11-04 00:52:35 UTC (rev 2564)
247 +++ genpatches-2.6/trunk/3.12/5000-BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2013-11-04 10:09:31 UTC (rev 2565)
248 @@ -1,5773 +0,0 @@
249 -From 009b78bafe1763f71e6bdbb4f536b564a73b7db5 Mon Sep 17 00:00:00 2001
250 -From: Arianna Avanzini <avanzini.arianna@×××××.com>
251 -Date: Thu, 9 May 2013 19:10:02 +0200
252 -Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.11
253 -
254 -Add the BFQ-v6r2 I/O scheduler to 3.11.
255 -The general structure is borrowed from CFQ, as much code. A (bfq_)queue
256 -is associated to each task doing I/O on a device, and each time a
257 -scheduling decision has to be made a queue is selected and served until
258 -it expires.
259 -
260 - - Slices are given in the service domain: tasks are assigned
261 - budgets, measured in number of sectors. Once got the disk, a task
262 - must however consume its assigned budget within a configurable
263 - maximum time (by default, the maximum possible value of the
264 - budgets is automatically computed to comply with this timeout).
265 - This allows the desired latency vs "throughput boosting" tradeoff
266 - to be set.
267 -
268 - - Budgets are scheduled according to a variant of WF2Q+, implemented
269 - using an augmented rb-tree to take eligibility into account while
270 - preserving an O(log N) overall complexity.
271 -
272 - - A low-latency tunable is provided; if enabled, both interactive
273 - and soft real-time applications are guaranteed very low latency.
274 -
275 - - Latency guarantees are preserved also in presence of NCQ.
276 -
277 - - Also with flash-based devices, a high throughput is achieved while
278 - still preserving latency guarantees.
279 -
280 - - Useful features borrowed from CFQ: cooperating-queues merging (with
281 - some additional optimizations with respect to the original CFQ version),
282 - static fallback queue for OOM.
283 -
284 - - BFQ supports full hierarchical scheduling, exporting a cgroups
285 - interface. Each node has a full scheduler, so each group can
286 - be assigned its own ioprio (mapped to a weight, see next point)
287 - and an ioprio_class.
288 -
289 - - If the cgroups interface is used, weights can be explicitly
290 - assigned, otherwise ioprio values are mapped to weights using the
291 - relation weight = IOPRIO_BE_NR - ioprio.
292 -
293 - - ioprio classes are served in strict priority order, i.e., lower
294 - priority queues are not served as long as there are higher
295 - priority queues. Among queues in the same class the bandwidth is
296 - distributed in proportion to the weight of each queue. A very
297 - thin extra bandwidth is however guaranteed to the Idle class, to
298 - prevent it from starving.
299 -
300 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
301 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
302 ----
303 - block/bfq-cgroup.c | 881 +++++++++++++++
304 - block/bfq-ioc.c | 36 +
305 - block/bfq-iosched.c | 3082 +++++++++++++++++++++++++++++++++++++++++++++++++++
306 - block/bfq-sched.c | 1072 ++++++++++++++++++
307 - block/bfq.h | 603 ++++++++++
308 - 5 files changed, 5674 insertions(+)
309 - create mode 100644 block/bfq-cgroup.c
310 - create mode 100644 block/bfq-ioc.c
311 - create mode 100644 block/bfq-iosched.c
312 - create mode 100644 block/bfq-sched.c
313 - create mode 100644 block/bfq.h
314 -
315 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
316 -new file mode 100644
317 -index 0000000..bb9b851
318 ---- /dev/null
319 -+++ b/block/bfq-cgroup.c
320 -@@ -0,0 +1,881 @@
321 -+/*
322 -+ * BFQ: CGROUPS support.
323 -+ *
324 -+ * Based on ideas and code from CFQ:
325 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
326 -+ *
327 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
328 -+ * Paolo Valente <paolo.valente@×××××××.it>
329 -+ *
330 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
331 -+ *
332 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
333 -+ */
334 -+
335 -+#ifdef CONFIG_CGROUP_BFQIO
336 -+
337 -+static DEFINE_MUTEX(bfqio_mutex);
338 -+
339 -+static bool bfqio_is_removed(struct cgroup *cgroup)
340 -+{
341 -+ return test_bit(CGRP_DEAD, &cgroup->flags);
342 -+}
343 -+
344 -+static struct bfqio_cgroup bfqio_root_cgroup = {
345 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
346 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
347 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
348 -+};
349 -+
350 -+static inline void bfq_init_entity(struct bfq_entity *entity,
351 -+ struct bfq_group *bfqg)
352 -+{
353 -+ entity->weight = entity->new_weight;
354 -+ entity->orig_weight = entity->new_weight;
355 -+ entity->ioprio = entity->new_ioprio;
356 -+ entity->ioprio_class = entity->new_ioprio_class;
357 -+ entity->parent = bfqg->my_entity;
358 -+ entity->sched_data = &bfqg->sched_data;
359 -+}
360 -+
361 -+static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
362 -+{
363 -+ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
364 -+ struct bfqio_cgroup, css);
365 -+}
366 -+
367 -+/*
368 -+ * Search the bfq_group for bfqd into the hash table (by now only a list)
369 -+ * of bgrp. Must be called under rcu_read_lock().
370 -+ */
371 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
372 -+ struct bfq_data *bfqd)
373 -+{
374 -+ struct bfq_group *bfqg;
375 -+ void *key;
376 -+
377 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
378 -+ key = rcu_dereference(bfqg->bfqd);
379 -+ if (key == bfqd)
380 -+ return bfqg;
381 -+ }
382 -+
383 -+ return NULL;
384 -+}
385 -+
386 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
387 -+ struct bfq_group *bfqg)
388 -+{
389 -+ struct bfq_entity *entity = &bfqg->entity;
390 -+
391 -+ /*
392 -+ * If the weight of the entity has never been set via the sysfs
393 -+ * interface, then bgrp->weight == 0. In this case we initialize
394 -+ * the weight from the current ioprio value. Otherwise, the group
395 -+ * weight, if set, has priority over the ioprio value.
396 -+ */
397 -+ if (bgrp->weight == 0) {
398 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
399 -+ entity->new_ioprio = bgrp->ioprio;
400 -+ } else {
401 -+ entity->new_weight = bgrp->weight;
402 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
403 -+ }
404 -+ entity->orig_weight = entity->weight = entity->new_weight;
405 -+ entity->ioprio = entity->new_ioprio;
406 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
407 -+ entity->my_sched_data = &bfqg->sched_data;
408 -+}
409 -+
410 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
411 -+ struct bfq_group *parent)
412 -+{
413 -+ struct bfq_entity *entity;
414 -+
415 -+ BUG_ON(parent == NULL);
416 -+ BUG_ON(bfqg == NULL);
417 -+
418 -+ entity = &bfqg->entity;
419 -+ entity->parent = parent->my_entity;
420 -+ entity->sched_data = &parent->sched_data;
421 -+}
422 -+
423 -+/**
424 -+ * bfq_group_chain_alloc - allocate a chain of groups.
425 -+ * @bfqd: queue descriptor.
426 -+ * @cgroup: the leaf cgroup this chain starts from.
427 -+ *
428 -+ * Allocate a chain of groups starting from the one belonging to
429 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
430 -+ * to the root has already an allocated group on @bfqd.
431 -+ */
432 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
433 -+ struct cgroup *cgroup)
434 -+{
435 -+ struct bfqio_cgroup *bgrp;
436 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
437 -+
438 -+ for (; cgroup != NULL; cgroup = cgroup->parent) {
439 -+ bgrp = cgroup_to_bfqio(cgroup);
440 -+
441 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
442 -+ if (bfqg != NULL) {
443 -+ /*
444 -+ * All the cgroups in the path from there to the
445 -+ * root must have a bfq_group for bfqd, so we don't
446 -+ * need any more allocations.
447 -+ */
448 -+ break;
449 -+ }
450 -+
451 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
452 -+ if (bfqg == NULL)
453 -+ goto cleanup;
454 -+
455 -+ bfq_group_init_entity(bgrp, bfqg);
456 -+ bfqg->my_entity = &bfqg->entity;
457 -+
458 -+ if (leaf == NULL) {
459 -+ leaf = bfqg;
460 -+ prev = leaf;
461 -+ } else {
462 -+ bfq_group_set_parent(prev, bfqg);
463 -+ /*
464 -+ * Build a list of allocated nodes using the bfqd
465 -+ * field, that is still unused and will be initialized
466 -+ * only after the node will be connected.
467 -+ */
468 -+ prev->bfqd = bfqg;
469 -+ prev = bfqg;
470 -+ }
471 -+ }
472 -+
473 -+ return leaf;
474 -+
475 -+cleanup:
476 -+ while (leaf != NULL) {
477 -+ prev = leaf;
478 -+ leaf = leaf->bfqd;
479 -+ kfree(prev);
480 -+ }
481 -+
482 -+ return NULL;
483 -+}
484 -+
485 -+/**
486 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
487 -+ * @bfqd: the queue descriptor.
488 -+ * @cgroup: the leaf cgroup to start from.
489 -+ * @leaf: the leaf group (to be associated to @cgroup).
490 -+ *
491 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
492 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
493 -+ * hierarchy that already has a group associated to @bfqd all the nodes
494 -+ * in the path to the root cgroup have one too.
495 -+ *
496 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
497 -+ * per device) while the bfqio_cgroup lock protects the list of groups
498 -+ * belonging to the same cgroup.
499 -+ */
500 -+static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
501 -+ struct bfq_group *leaf)
502 -+{
503 -+ struct bfqio_cgroup *bgrp;
504 -+ struct bfq_group *bfqg, *next, *prev = NULL;
505 -+ unsigned long flags;
506 -+
507 -+ assert_spin_locked(bfqd->queue->queue_lock);
508 -+
509 -+ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
510 -+ bgrp = cgroup_to_bfqio(cgroup);
511 -+ next = leaf->bfqd;
512 -+
513 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
514 -+ BUG_ON(bfqg != NULL);
515 -+
516 -+ spin_lock_irqsave(&bgrp->lock, flags);
517 -+
518 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
519 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
520 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
521 -+
522 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
523 -+
524 -+ prev = leaf;
525 -+ leaf = next;
526 -+ }
527 -+
528 -+ BUG_ON(cgroup == NULL && leaf != NULL);
529 -+ if (cgroup != NULL && prev != NULL) {
530 -+ bgrp = cgroup_to_bfqio(cgroup);
531 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
532 -+ bfq_group_set_parent(prev, bfqg);
533 -+ }
534 -+}
535 -+
536 -+/**
537 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
538 -+ * @bfqd: queue descriptor.
539 -+ * @cgroup: cgroup being searched for.
540 -+ *
541 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
542 -+ * necessary. When a group is returned all the cgroups in the path
543 -+ * to the root have a group associated to @bfqd.
544 -+ *
545 -+ * If the allocation fails, return the root group: this breaks guarantees
546 -+ * but is a safe fallback. If this loss becomes a problem it can be
547 -+ * mitigated using the equivalent weight (given by the product of the
548 -+ * weights of the groups in the path from @group to the root) in the
549 -+ * root scheduler.
550 -+ *
551 -+ * We allocate all the missing nodes in the path from the leaf cgroup
552 -+ * to the root and we connect the nodes only after all the allocations
553 -+ * have been successful.
554 -+ */
555 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
556 -+ struct cgroup *cgroup)
557 -+{
558 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
559 -+ struct bfq_group *bfqg;
560 -+
561 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
562 -+ if (bfqg != NULL)
563 -+ return bfqg;
564 -+
565 -+ bfqg = bfq_group_chain_alloc(bfqd, cgroup);
566 -+ if (bfqg != NULL)
567 -+ bfq_group_chain_link(bfqd, cgroup, bfqg);
568 -+ else
569 -+ bfqg = bfqd->root_group;
570 -+
571 -+ return bfqg;
572 -+}
573 -+
574 -+/**
575 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
576 -+ * @bfqd: queue descriptor.
577 -+ * @bfqq: the queue to move.
578 -+ * @entity: @bfqq's entity.
579 -+ * @bfqg: the group to move to.
580 -+ *
581 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
582 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
583 -+ *
584 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
585 -+ * not disappear (by now this just means that we are called under
586 -+ * rcu_read_lock()).
587 -+ */
588 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
589 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
590 -+{
591 -+ int busy, resume;
592 -+
593 -+ busy = bfq_bfqq_busy(bfqq);
594 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
595 -+
596 -+ BUG_ON(resume && !entity->on_st);
597 -+ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
598 -+
599 -+ if (busy) {
600 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
601 -+
602 -+ if (!resume)
603 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
604 -+ else
605 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
606 -+ } else if (entity->on_st)
607 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
608 -+
609 -+ /*
610 -+ * Here we use a reference to bfqg. We don't need a refcounter
611 -+ * as the cgroup reference will not be dropped, so that its
612 -+ * destroy() callback will not be invoked.
613 -+ */
614 -+ entity->parent = bfqg->my_entity;
615 -+ entity->sched_data = &bfqg->sched_data;
616 -+
617 -+ if (busy && resume)
618 -+ bfq_activate_bfqq(bfqd, bfqq);
619 -+
620 -+ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver)
621 -+ bfq_schedule_dispatch(bfqd);
622 -+}
623 -+
624 -+/**
625 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
626 -+ * @bfqd: the queue descriptor.
627 -+ * @bic: the bic to move.
628 -+ * @cgroup: the cgroup to move to.
629 -+ *
630 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
631 -+ * has to make sure that the reference to cgroup is valid across the call.
632 -+ *
633 -+ * NOTE: an alternative approach might have been to store the current
634 -+ * cgroup in bfqq and getting a reference to it, reducing the lookup
635 -+ * time here, at the price of slightly more complex code.
636 -+ */
637 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
638 -+ struct bfq_io_cq *bic,
639 -+ struct cgroup *cgroup)
640 -+{
641 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
642 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
643 -+ struct bfq_entity *entity;
644 -+ struct bfq_group *bfqg;
645 -+ struct bfqio_cgroup *bgrp;
646 -+
647 -+ bgrp = cgroup_to_bfqio(cgroup);
648 -+
649 -+ bfqg = bfq_find_alloc_group(bfqd, cgroup);
650 -+ if (async_bfqq != NULL) {
651 -+ entity = &async_bfqq->entity;
652 -+
653 -+ if (entity->sched_data != &bfqg->sched_data) {
654 -+ bic_set_bfqq(bic, NULL, 0);
655 -+ bfq_log_bfqq(bfqd, async_bfqq,
656 -+ "bic_change_group: %p %d",
657 -+ async_bfqq, atomic_read(&async_bfqq->ref));
658 -+ bfq_put_queue(async_bfqq);
659 -+ }
660 -+ }
661 -+
662 -+ if (sync_bfqq != NULL) {
663 -+ entity = &sync_bfqq->entity;
664 -+ if (entity->sched_data != &bfqg->sched_data)
665 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
666 -+ }
667 -+
668 -+ return bfqg;
669 -+}
670 -+
671 -+/**
672 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
673 -+ * @bic: the bic being migrated.
674 -+ * @cgroup: the destination cgroup.
675 -+ *
676 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
677 -+ * moved into its new parent group.
678 -+ */
679 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
680 -+ struct cgroup *cgroup)
681 -+{
682 -+ struct bfq_data *bfqd;
683 -+ unsigned long uninitialized_var(flags);
684 -+
685 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
686 -+ if (bfqd != NULL) {
687 -+ __bfq_bic_change_cgroup(bfqd, bic, cgroup);
688 -+ bfq_put_bfqd_unlock(bfqd, &flags);
689 -+ }
690 -+}
691 -+
692 -+/**
693 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
694 -+ * @bic: the @bic to update.
695 -+ *
696 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
697 -+ * We need this in addition to moving bics during the cgroup attach
698 -+ * phase because the task owning @bic could be at its first disk
699 -+ * access or we may end up in the root cgroup as the result of a
700 -+ * memory allocation failure and here we try to move to the right
701 -+ * group.
702 -+ *
703 -+ * Must be called under the queue lock. It is safe to use the returned
704 -+ * value even after the rcu_read_unlock() as the migration/destruction
705 -+ * paths act under the queue lock too. IOW it is impossible to race with
706 -+ * group migration/destruction and end up with an invalid group as:
707 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
708 -+ * has started execution, as current holds a reference to it,
709 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
710 -+ * migrated to a different cgroup] its attach() callback will have
711 -+ * taken care of removing all the references to the old cgroup data.
712 -+ */
713 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
714 -+{
715 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
716 -+ struct bfq_group *bfqg;
717 -+ struct cgroup *cgroup;
718 -+
719 -+ BUG_ON(bfqd == NULL);
720 -+
721 -+ rcu_read_lock();
722 -+ cgroup = task_cgroup(current, bfqio_subsys_id);
723 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
724 -+ rcu_read_unlock();
725 -+
726 -+ return bfqg;
727 -+}
728 -+
729 -+/**
730 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
731 -+ * @st: the service tree being flushed.
732 -+ */
733 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
734 -+{
735 -+ struct bfq_entity *entity = st->first_idle;
736 -+
737 -+ for (; entity != NULL; entity = st->first_idle)
738 -+ __bfq_deactivate_entity(entity, 0);
739 -+}
740 -+
741 -+/**
742 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
743 -+ * @bfqd: the device data structure with the root group.
744 -+ * @entity: the entity to move.
745 -+ */
746 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
747 -+ struct bfq_entity *entity)
748 -+{
749 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
750 -+
751 -+ BUG_ON(bfqq == NULL);
752 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
753 -+ return;
754 -+}
755 -+
756 -+/**
757 -+ * bfq_reparent_active_entities - move to the root group all active entities.
758 -+ * @bfqd: the device data structure with the root group.
759 -+ * @bfqg: the group to move from.
760 -+ * @st: the service tree with the entities.
761 -+ *
762 -+ * Needs queue_lock to be taken and reference to be valid over the call.
763 -+ */
764 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
765 -+ struct bfq_group *bfqg,
766 -+ struct bfq_service_tree *st)
767 -+{
768 -+ struct rb_root *active = &st->active;
769 -+ struct bfq_entity *entity = NULL;
770 -+
771 -+ if (!RB_EMPTY_ROOT(&st->active))
772 -+ entity = bfq_entity_of(rb_first(active));
773 -+
774 -+ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active)))
775 -+ bfq_reparent_leaf_entity(bfqd, entity);
776 -+
777 -+ if (bfqg->sched_data.active_entity != NULL)
778 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
779 -+
780 -+ return;
781 -+}
782 -+
783 -+/**
784 -+ * bfq_destroy_group - destroy @bfqg.
785 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
786 -+ * @bfqg: the group being destroyed.
787 -+ *
788 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
789 -+ */
790 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
791 -+{
792 -+ struct bfq_data *bfqd;
793 -+ struct bfq_service_tree *st;
794 -+ struct bfq_entity *entity = bfqg->my_entity;
795 -+ unsigned long uninitialized_var(flags);
796 -+ int i;
797 -+
798 -+ hlist_del(&bfqg->group_node);
799 -+
800 -+ /*
801 -+ * Empty all service_trees belonging to this group before deactivating
802 -+ * the group itself.
803 -+ */
804 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
805 -+ st = bfqg->sched_data.service_tree + i;
806 -+
807 -+ /*
808 -+ * The idle tree may still contain bfq_queues belonging
809 -+ * to exited task because they never migrated to a different
810 -+ * cgroup from the one being destroyed now. Noone else
811 -+ * can access them so it's safe to act without any lock.
812 -+ */
813 -+ bfq_flush_idle_tree(st);
814 -+
815 -+ /*
816 -+ * It may happen that some queues are still active
817 -+ * (busy) upon group destruction (if the corresponding
818 -+ * processes have been forced to terminate). We move
819 -+ * all the leaf entities corresponding to these queues
820 -+ * to the root_group.
821 -+ * Also, it may happen that the group has an entity
822 -+ * under service, which is disconnected from the active
823 -+ * tree: it must be moved, too.
824 -+ * There is no need to put the sync queues, as the
825 -+ * scheduler has taken no reference.
826 -+ */
827 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
828 -+ if (bfqd != NULL) {
829 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
830 -+ bfq_put_bfqd_unlock(bfqd, &flags);
831 -+ }
832 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
833 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
834 -+ }
835 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
836 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
837 -+
838 -+ /*
839 -+ * We may race with device destruction, take extra care when
840 -+ * dereferencing bfqg->bfqd.
841 -+ */
842 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
843 -+ if (bfqd != NULL) {
844 -+ hlist_del(&bfqg->bfqd_node);
845 -+ __bfq_deactivate_entity(entity, 0);
846 -+ bfq_put_async_queues(bfqd, bfqg);
847 -+ bfq_put_bfqd_unlock(bfqd, &flags);
848 -+ }
849 -+ BUG_ON(entity->tree != NULL);
850 -+
851 -+ /*
852 -+ * No need to defer the kfree() to the end of the RCU grace
853 -+ * period: we are called from the destroy() callback of our
854 -+ * cgroup, so we can be sure that no one is a) still using
855 -+ * this cgroup or b) doing lookups in it.
856 -+ */
857 -+ kfree(bfqg);
858 -+}
859 -+
860 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
861 -+{
862 -+ struct hlist_node *tmp;
863 -+ struct bfq_group *bfqg;
864 -+
865 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
866 -+ bfq_end_raising_async_queues(bfqd, bfqg);
867 -+}
868 -+
869 -+/**
870 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
871 -+ * @bfqd: the device descriptor being exited.
872 -+ *
873 -+ * When the device exits we just make sure that no lookup can return
874 -+ * the now unused group structures. They will be deallocated on cgroup
875 -+ * destruction.
876 -+ */
877 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
878 -+{
879 -+ struct hlist_node *tmp;
880 -+ struct bfq_group *bfqg;
881 -+
882 -+ bfq_log(bfqd, "disconnect_groups beginning") ;
883 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
884 -+ hlist_del(&bfqg->bfqd_node);
885 -+
886 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
887 -+
888 -+ /*
889 -+ * Don't remove from the group hash, just set an
890 -+ * invalid key. No lookups can race with the
891 -+ * assignment as bfqd is being destroyed; this
892 -+ * implies also that new elements cannot be added
893 -+ * to the list.
894 -+ */
895 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
896 -+
897 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
898 -+ bfqg) ;
899 -+ bfq_put_async_queues(bfqd, bfqg);
900 -+ }
901 -+}
902 -+
903 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
904 -+{
905 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
906 -+ struct bfq_group *bfqg = bfqd->root_group;
907 -+
908 -+ bfq_put_async_queues(bfqd, bfqg);
909 -+
910 -+ spin_lock_irq(&bgrp->lock);
911 -+ hlist_del_rcu(&bfqg->group_node);
912 -+ spin_unlock_irq(&bgrp->lock);
913 -+
914 -+ /*
915 -+ * No need to synchronize_rcu() here: since the device is gone
916 -+ * there cannot be any read-side access to its root_group.
917 -+ */
918 -+ kfree(bfqg);
919 -+}
920 -+
921 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
922 -+{
923 -+ struct bfq_group *bfqg;
924 -+ struct bfqio_cgroup *bgrp;
925 -+ int i;
926 -+
927 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
928 -+ if (bfqg == NULL)
929 -+ return NULL;
930 -+
931 -+ bfqg->entity.parent = NULL;
932 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
933 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
934 -+
935 -+ bgrp = &bfqio_root_cgroup;
936 -+ spin_lock_irq(&bgrp->lock);
937 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
938 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
939 -+ spin_unlock_irq(&bgrp->lock);
940 -+
941 -+ return bfqg;
942 -+}
943 -+
944 -+#define SHOW_FUNCTION(__VAR) \
945 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
946 -+ struct cftype *cftype) \
947 -+{ \
948 -+ struct bfqio_cgroup *bgrp; \
949 -+ u64 ret = -ENODEV; \
950 -+ \
951 -+ mutex_lock(&bfqio_mutex); \
952 -+ if (bfqio_is_removed(cgroup)) \
953 -+ goto out_unlock; \
954 -+ \
955 -+ bgrp = cgroup_to_bfqio(cgroup); \
956 -+ spin_lock_irq(&bgrp->lock); \
957 -+ ret = bgrp->__VAR; \
958 -+ spin_unlock_irq(&bgrp->lock); \
959 -+ \
960 -+out_unlock: \
961 -+ mutex_unlock(&bfqio_mutex); \
962 -+ return ret; \
963 -+}
964 -+
965 -+SHOW_FUNCTION(weight);
966 -+SHOW_FUNCTION(ioprio);
967 -+SHOW_FUNCTION(ioprio_class);
968 -+#undef SHOW_FUNCTION
969 -+
970 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
971 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
972 -+ struct cftype *cftype, \
973 -+ u64 val) \
974 -+{ \
975 -+ struct bfqio_cgroup *bgrp; \
976 -+ struct bfq_group *bfqg; \
977 -+ int ret = -EINVAL; \
978 -+ \
979 -+ if (val < (__MIN) || val > (__MAX)) \
980 -+ return ret; \
981 -+ \
982 -+ ret = -ENODEV; \
983 -+ mutex_lock(&bfqio_mutex); \
984 -+ if (bfqio_is_removed(cgroup)) \
985 -+ goto out_unlock; \
986 -+ ret = 0; \
987 -+ \
988 -+ bgrp = cgroup_to_bfqio(cgroup); \
989 -+ \
990 -+ spin_lock_irq(&bgrp->lock); \
991 -+ bgrp->__VAR = (unsigned short)val; \
992 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
993 -+ /* \
994 -+ * Setting the ioprio_changed flag of the entity \
995 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
996 -+ * the value of the weight to its ioprio mapping. \
997 -+ * Set the flag only if necessary. \
998 -+ */ \
999 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
1000 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
1001 -+ smp_wmb(); \
1002 -+ bfqg->entity.ioprio_changed = 1; \
1003 -+ } \
1004 -+ } \
1005 -+ spin_unlock_irq(&bgrp->lock); \
1006 -+ \
1007 -+out_unlock: \
1008 -+ mutex_unlock(&bfqio_mutex); \
1009 -+ return ret; \
1010 -+}
1011 -+
1012 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
1013 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
1014 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
1015 -+#undef STORE_FUNCTION
1016 -+
1017 -+static struct cftype bfqio_files[] = {
1018 -+ {
1019 -+ .name = "weight",
1020 -+ .read_u64 = bfqio_cgroup_weight_read,
1021 -+ .write_u64 = bfqio_cgroup_weight_write,
1022 -+ },
1023 -+ {
1024 -+ .name = "ioprio",
1025 -+ .read_u64 = bfqio_cgroup_ioprio_read,
1026 -+ .write_u64 = bfqio_cgroup_ioprio_write,
1027 -+ },
1028 -+ {
1029 -+ .name = "ioprio_class",
1030 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
1031 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
1032 -+ },
1033 -+ { }, /* terminate */
1034 -+};
1035 -+
1036 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
1037 -+{
1038 -+ struct bfqio_cgroup *bgrp;
1039 -+
1040 -+ if (cgroup->parent != NULL) {
1041 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
1042 -+ if (bgrp == NULL)
1043 -+ return ERR_PTR(-ENOMEM);
1044 -+ } else
1045 -+ bgrp = &bfqio_root_cgroup;
1046 -+
1047 -+ spin_lock_init(&bgrp->lock);
1048 -+ INIT_HLIST_HEAD(&bgrp->group_data);
1049 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
1050 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
1051 -+
1052 -+ return &bgrp->css;
1053 -+}
1054 -+
1055 -+/*
1056 -+ * We cannot support shared io contexts, as we have no means to support
1057 -+ * two tasks with the same ioc in two different groups without major rework
1058 -+ * of the main bic/bfqq data structures. By now we allow a task to change
1059 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
1060 -+ * behavior is that a group containing a task that forked using CLONE_IO
1061 -+ * will not be destroyed until the tasks sharing the ioc die.
1062 -+ */
1063 -+static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
1064 -+{
1065 -+ struct task_struct *task;
1066 -+ struct io_context *ioc;
1067 -+ int ret = 0;
1068 -+
1069 -+ cgroup_taskset_for_each(task, cgroup, tset) {
1070 -+ /* task_lock() is needed to avoid races with exit_io_context() */
1071 -+ task_lock(task);
1072 -+ ioc = task->io_context;
1073 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1074 -+ /*
1075 -+ * ioc == NULL means that the task is either too young or
1076 -+ * exiting: if it has still no ioc the ioc can't be shared,
1077 -+ * if the task is exiting the attach will fail anyway, no
1078 -+ * matter what we return here.
1079 -+ */
1080 -+ ret = -EINVAL;
1081 -+ task_unlock(task);
1082 -+ if (ret)
1083 -+ break;
1084 -+ }
1085 -+
1086 -+ return ret;
1087 -+}
1088 -+
1089 -+static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
1090 -+{
1091 -+ struct task_struct *task;
1092 -+ struct io_context *ioc;
1093 -+ struct io_cq *icq;
1094 -+
1095 -+ /*
1096 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
1097 -+ * new group has not yet been tested.
1098 -+ */
1099 -+ cgroup_taskset_for_each(task, cgroup, tset) {
1100 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1101 -+ if (ioc) {
1102 -+ /*
1103 -+ * Handle cgroup change here.
1104 -+ */
1105 -+ rcu_read_lock();
1106 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1107 -+ if (!strncmp(icq->q->elevator->type->elevator_name,
1108 -+ "bfq", ELV_NAME_MAX))
1109 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
1110 -+ cgroup);
1111 -+ rcu_read_unlock();
1112 -+ put_io_context(ioc);
1113 -+ }
1114 -+ }
1115 -+}
1116 -+
1117 -+static void bfqio_destroy(struct cgroup *cgroup)
1118 -+{
1119 -+ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
1120 -+ struct hlist_node *tmp;
1121 -+ struct bfq_group *bfqg;
1122 -+
1123 -+ /*
1124 -+ * Since we are destroying the cgroup, there are no more tasks
1125 -+ * referencing it, and all the RCU grace periods that may have
1126 -+ * referenced it are ended (as the destruction of the parent
1127 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1128 -+ * anything else and we don't need any synchronization.
1129 -+ */
1130 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1131 -+ bfq_destroy_group(bgrp, bfqg);
1132 -+
1133 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
1134 -+
1135 -+ kfree(bgrp);
1136 -+}
1137 -+
1138 -+struct cgroup_subsys bfqio_subsys = {
1139 -+ .name = "bfqio",
1140 -+ .css_alloc = bfqio_create,
1141 -+ .can_attach = bfqio_can_attach,
1142 -+ .attach = bfqio_attach,
1143 -+ .css_free = bfqio_destroy,
1144 -+ .subsys_id = bfqio_subsys_id,
1145 -+ .base_cftypes = bfqio_files,
1146 -+};
1147 -+#else
1148 -+static inline void bfq_init_entity(struct bfq_entity *entity,
1149 -+ struct bfq_group *bfqg)
1150 -+{
1151 -+ entity->weight = entity->new_weight;
1152 -+ entity->orig_weight = entity->new_weight;
1153 -+ entity->ioprio = entity->new_ioprio;
1154 -+ entity->ioprio_class = entity->new_ioprio_class;
1155 -+ entity->sched_data = &bfqg->sched_data;
1156 -+}
1157 -+
1158 -+static inline struct bfq_group *
1159 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1160 -+{
1161 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
1162 -+ return bfqd->root_group;
1163 -+}
1164 -+
1165 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1166 -+ struct bfq_queue *bfqq,
1167 -+ struct bfq_entity *entity,
1168 -+ struct bfq_group *bfqg)
1169 -+{
1170 -+}
1171 -+
1172 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
1173 -+{
1174 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1175 -+}
1176 -+
1177 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1178 -+{
1179 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
1180 -+}
1181 -+
1182 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
1183 -+{
1184 -+ kfree(bfqd->root_group);
1185 -+}
1186 -+
1187 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1188 -+{
1189 -+ struct bfq_group *bfqg;
1190 -+ int i;
1191 -+
1192 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1193 -+ if (bfqg == NULL)
1194 -+ return NULL;
1195 -+
1196 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1197 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1198 -+
1199 -+ return bfqg;
1200 -+}
1201 -+#endif
1202 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1203 -new file mode 100644
1204 -index 0000000..326e3ec
1205 ---- /dev/null
1206 -+++ b/block/bfq-ioc.c
1207 -@@ -0,0 +1,36 @@
1208 -+/*
1209 -+ * BFQ: I/O context handling.
1210 -+ *
1211 -+ * Based on ideas and code from CFQ:
1212 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1213 -+ *
1214 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1215 -+ * Paolo Valente <paolo.valente@×××××××.it>
1216 -+ *
1217 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1218 -+ */
1219 -+
1220 -+/**
1221 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1222 -+ * @icq: the iocontext queue.
1223 -+ */
1224 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1225 -+{
1226 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
1227 -+ return container_of(icq, struct bfq_io_cq, icq);
1228 -+}
1229 -+
1230 -+/**
1231 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1232 -+ * @bfqd: the lookup key.
1233 -+ * @ioc: the io_context of the process doing I/O.
1234 -+ *
1235 -+ * Queue lock must be held.
1236 -+ */
1237 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1238 -+ struct io_context *ioc)
1239 -+{
1240 -+ if(ioc)
1241 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1242 -+ return NULL;
1243 -+}
1244 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1245 -new file mode 100644
1246 -index 0000000..0ed2746
1247 ---- /dev/null
1248 -+++ b/block/bfq-iosched.c
1249 -@@ -0,0 +1,3082 @@
1250 -+/*
1251 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
1252 -+ *
1253 -+ * Based on ideas and code from CFQ:
1254 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1255 -+ *
1256 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1257 -+ * Paolo Valente <paolo.valente@×××××××.it>
1258 -+ *
1259 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1260 -+ *
1261 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1262 -+ *
1263 -+ * BFQ is a proportional share disk scheduling algorithm based on the
1264 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
1265 -+ * measured in number of sectors, to tasks instead of time slices.
1266 -+ * The disk is not granted to the active task for a given time slice,
1267 -+ * but until it has exhausted its assigned budget. This change from
1268 -+ * the time to the service domain allows BFQ to distribute the disk
1269 -+ * bandwidth among tasks as desired, without any distortion due to
1270 -+ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
1271 -+ * internal scheduler, called B-WF2Q+, to schedule tasks according to
1272 -+ * their budgets. Thanks to this accurate scheduler, BFQ can afford
1273 -+ * to assign high budgets to disk-bound non-seeky tasks (to boost the
1274 -+ * throughput), and yet guarantee low latencies to interactive and
1275 -+ * soft real-time applications.
1276 -+ *
1277 -+ * BFQ has been introduced in [1], where the interested reader can
1278 -+ * find an accurate description of the algorithm, the bandwidth
1279 -+ * distribution and latency guarantees it provides, plus formal proofs
1280 -+ * of all the properties. With respect to the algorithm presented in
1281 -+ * the paper, this implementation adds several little heuristics, and
1282 -+ * a hierarchical extension, based on H-WF2Q+.
1283 -+ *
1284 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1285 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1286 -+ * complexity derives from the one introduced with EEVDF in [3].
1287 -+ *
1288 -+ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
1289 -+ * with Deterministic Guarantees on Bandwidth Distribution,'',
1290 -+ * IEEE Transactions on Computer, May 2010.
1291 -+ *
1292 -+ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
1293 -+ *
1294 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1295 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1296 -+ * Oct 1997.
1297 -+ *
1298 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1299 -+ *
1300 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1301 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
1302 -+ * Resource Allocation,'' technical report.
1303 -+ *
1304 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1305 -+ */
1306 -+#include <linux/module.h>
1307 -+#include <linux/slab.h>
1308 -+#include <linux/blkdev.h>
1309 -+#include <linux/cgroup.h>
1310 -+#include <linux/elevator.h>
1311 -+#include <linux/jiffies.h>
1312 -+#include <linux/rbtree.h>
1313 -+#include <linux/ioprio.h>
1314 -+#include "bfq.h"
1315 -+#include "blk.h"
1316 -+
1317 -+/* Max number of dispatches in one round of service. */
1318 -+static const int bfq_quantum = 4;
1319 -+
1320 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1321 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1322 -+
1323 -+/* Maximum backwards seek, in KiB. */
1324 -+static const int bfq_back_max = 16 * 1024;
1325 -+
1326 -+/* Penalty of a backwards seek, in number of sectors. */
1327 -+static const int bfq_back_penalty = 2;
1328 -+
1329 -+/* Idling period duration, in jiffies. */
1330 -+static int bfq_slice_idle = HZ / 125;
1331 -+
1332 -+/* Default maximum budget values, in sectors and number of requests. */
1333 -+static const int bfq_default_max_budget = 16 * 1024;
1334 -+static const int bfq_max_budget_async_rq = 4;
1335 -+
1336 -+/*
1337 -+ * Async to sync throughput distribution is controlled as follows:
1338 -+ * when an async request is served, the entity is charged the number
1339 -+ * of sectors of the request, multiplied by the factor below
1340 -+ */
1341 -+static const int bfq_async_charge_factor = 10;
1342 -+
1343 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
1344 -+static const int bfq_timeout_sync = HZ / 8;
1345 -+static int bfq_timeout_async = HZ / 25;
1346 -+
1347 -+struct kmem_cache *bfq_pool;
1348 -+
1349 -+/* Below this threshold (in ms), we consider thinktime immediate. */
1350 -+#define BFQ_MIN_TT 2
1351 -+
1352 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
1353 -+#define BFQ_HW_QUEUE_THRESHOLD 4
1354 -+#define BFQ_HW_QUEUE_SAMPLES 32
1355 -+
1356 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1357 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1358 -+
1359 -+/* Min samples used for peak rate estimation (for autotuning). */
1360 -+#define BFQ_PEAK_RATE_SAMPLES 32
1361 -+
1362 -+/* Shift used for peak rate fixed precision calculations. */
1363 -+#define BFQ_RATE_SHIFT 16
1364 -+
1365 -+/*
1366 -+ * The duration of the weight raising for interactive applications is
1367 -+ * computed automatically (as default behaviour), using the following
1368 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
1369 -+ * disk, and R and T are two reference parameters. In particular, R is
1370 -+ * the peak rate of a reference disk, and T is about the maximum time
1371 -+ * for starting popular large applications on that disk, under BFQ and
1372 -+ * while reading two files in parallel. Finally, BFQ uses two
1373 -+ * different pairs (R, T) depending on whether the disk is rotational
1374 -+ * or non-rotational.
1375 -+ */
1376 -+#define T_rot (msecs_to_jiffies(5500))
1377 -+#define T_nonrot (msecs_to_jiffies(2000))
1378 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1379 -+#define R_rot 17415
1380 -+#define R_nonrot 34791
1381 -+
1382 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1383 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1384 -+
1385 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1386 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1387 -+
1388 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1389 -+
1390 -+#include "bfq-ioc.c"
1391 -+#include "bfq-sched.c"
1392 -+#include "bfq-cgroup.c"
1393 -+
1394 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1395 -+ IOPRIO_CLASS_IDLE)
1396 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1397 -+ IOPRIO_CLASS_RT)
1398 -+
1399 -+#define bfq_sample_valid(samples) ((samples) > 80)
1400 -+
1401 -+/*
1402 -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1403 -+ * set (in which case it could also be a direct WRITE).
1404 -+ */
1405 -+static inline int bfq_bio_sync(struct bio *bio)
1406 -+{
1407 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1408 -+ return 1;
1409 -+
1410 -+ return 0;
1411 -+}
1412 -+
1413 -+/*
1414 -+ * Scheduler run of queue, if there are requests pending and no one in the
1415 -+ * driver that will restart queueing.
1416 -+ */
1417 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1418 -+{
1419 -+ if (bfqd->queued != 0) {
1420 -+ bfq_log(bfqd, "schedule dispatch");
1421 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1422 -+ }
1423 -+}
1424 -+
1425 -+/*
1426 -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
1427 -+ * We choose the request that is closest to the head right now. Distance
1428 -+ * behind the head is penalized and only allowed to a certain extent.
1429 -+ */
1430 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
1431 -+ struct request *rq1,
1432 -+ struct request *rq2,
1433 -+ sector_t last)
1434 -+{
1435 -+ sector_t s1, s2, d1 = 0, d2 = 0;
1436 -+ unsigned long back_max;
1437 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1438 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1439 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1440 -+
1441 -+ if (rq1 == NULL || rq1 == rq2)
1442 -+ return rq2;
1443 -+ if (rq2 == NULL)
1444 -+ return rq1;
1445 -+
1446 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1447 -+ return rq1;
1448 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1449 -+ return rq2;
1450 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1451 -+ return rq1;
1452 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1453 -+ return rq2;
1454 -+
1455 -+ s1 = blk_rq_pos(rq1);
1456 -+ s2 = blk_rq_pos(rq2);
1457 -+
1458 -+ /*
1459 -+ * By definition, 1KiB is 2 sectors.
1460 -+ */
1461 -+ back_max = bfqd->bfq_back_max * 2;
1462 -+
1463 -+ /*
1464 -+ * Strict one way elevator _except_ in the case where we allow
1465 -+ * short backward seeks which are biased as twice the cost of a
1466 -+ * similar forward seek.
1467 -+ */
1468 -+ if (s1 >= last)
1469 -+ d1 = s1 - last;
1470 -+ else if (s1 + back_max >= last)
1471 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
1472 -+ else
1473 -+ wrap |= BFQ_RQ1_WRAP;
1474 -+
1475 -+ if (s2 >= last)
1476 -+ d2 = s2 - last;
1477 -+ else if (s2 + back_max >= last)
1478 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
1479 -+ else
1480 -+ wrap |= BFQ_RQ2_WRAP;
1481 -+
1482 -+ /* Found required data */
1483 -+
1484 -+ /*
1485 -+ * By doing switch() on the bit mask "wrap" we avoid having to
1486 -+ * check two variables for all permutations: --> faster!
1487 -+ */
1488 -+ switch (wrap) {
1489 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1490 -+ if (d1 < d2)
1491 -+ return rq1;
1492 -+ else if (d2 < d1)
1493 -+ return rq2;
1494 -+ else {
1495 -+ if (s1 >= s2)
1496 -+ return rq1;
1497 -+ else
1498 -+ return rq2;
1499 -+ }
1500 -+
1501 -+ case BFQ_RQ2_WRAP:
1502 -+ return rq1;
1503 -+ case BFQ_RQ1_WRAP:
1504 -+ return rq2;
1505 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1506 -+ default:
1507 -+ /*
1508 -+ * Since both rqs are wrapped,
1509 -+ * start with the one that's further behind head
1510 -+ * (--> only *one* back seek required),
1511 -+ * since back seek takes more time than forward.
1512 -+ */
1513 -+ if (s1 <= s2)
1514 -+ return rq1;
1515 -+ else
1516 -+ return rq2;
1517 -+ }
1518 -+}
1519 -+
1520 -+static struct bfq_queue *
1521 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1522 -+ sector_t sector, struct rb_node **ret_parent,
1523 -+ struct rb_node ***rb_link)
1524 -+{
1525 -+ struct rb_node **p, *parent;
1526 -+ struct bfq_queue *bfqq = NULL;
1527 -+
1528 -+ parent = NULL;
1529 -+ p = &root->rb_node;
1530 -+ while (*p) {
1531 -+ struct rb_node **n;
1532 -+
1533 -+ parent = *p;
1534 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1535 -+
1536 -+ /*
1537 -+ * Sort strictly based on sector. Smallest to the left,
1538 -+ * largest to the right.
1539 -+ */
1540 -+ if (sector > blk_rq_pos(bfqq->next_rq))
1541 -+ n = &(*p)->rb_right;
1542 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
1543 -+ n = &(*p)->rb_left;
1544 -+ else
1545 -+ break;
1546 -+ p = n;
1547 -+ bfqq = NULL;
1548 -+ }
1549 -+
1550 -+ *ret_parent = parent;
1551 -+ if (rb_link)
1552 -+ *rb_link = p;
1553 -+
1554 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1555 -+ (long long unsigned)sector,
1556 -+ bfqq != NULL ? bfqq->pid : 0);
1557 -+
1558 -+ return bfqq;
1559 -+}
1560 -+
1561 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1562 -+{
1563 -+ struct rb_node **p, *parent;
1564 -+ struct bfq_queue *__bfqq;
1565 -+
1566 -+ if (bfqq->pos_root != NULL) {
1567 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1568 -+ bfqq->pos_root = NULL;
1569 -+ }
1570 -+
1571 -+ if (bfq_class_idle(bfqq))
1572 -+ return;
1573 -+ if (!bfqq->next_rq)
1574 -+ return;
1575 -+
1576 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
1577 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1578 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
1579 -+ if (__bfqq == NULL) {
1580 -+ rb_link_node(&bfqq->pos_node, parent, p);
1581 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1582 -+ } else
1583 -+ bfqq->pos_root = NULL;
1584 -+}
1585 -+
1586 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1587 -+ struct bfq_queue *bfqq,
1588 -+ struct request *last)
1589 -+{
1590 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
1591 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
1592 -+ struct request *next = NULL, *prev = NULL;
1593 -+
1594 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1595 -+
1596 -+ if (rbprev != NULL)
1597 -+ prev = rb_entry_rq(rbprev);
1598 -+
1599 -+ if (rbnext != NULL)
1600 -+ next = rb_entry_rq(rbnext);
1601 -+ else {
1602 -+ rbnext = rb_first(&bfqq->sort_list);
1603 -+ if (rbnext && rbnext != &last->rb_node)
1604 -+ next = rb_entry_rq(rbnext);
1605 -+ }
1606 -+
1607 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1608 -+}
1609 -+
1610 -+static void bfq_del_rq_rb(struct request *rq)
1611 -+{
1612 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1613 -+ struct bfq_data *bfqd = bfqq->bfqd;
1614 -+ const int sync = rq_is_sync(rq);
1615 -+
1616 -+ BUG_ON(bfqq->queued[sync] == 0);
1617 -+ bfqq->queued[sync]--;
1618 -+ bfqd->queued--;
1619 -+
1620 -+ elv_rb_del(&bfqq->sort_list, rq);
1621 -+
1622 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1623 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
1624 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1625 -+ /*
1626 -+ * Remove queue from request-position tree as it is empty.
1627 -+ */
1628 -+ if (bfqq->pos_root != NULL) {
1629 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1630 -+ bfqq->pos_root = NULL;
1631 -+ }
1632 -+ }
1633 -+}
1634 -+
1635 -+/* see the definition of bfq_async_charge_factor for details */
1636 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
1637 -+ struct bfq_queue *bfqq)
1638 -+{
1639 -+ return blk_rq_sectors(rq) *
1640 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1641 -+ bfq_async_charge_factor));
1642 -+}
1643 -+
1644 -+/**
1645 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
1646 -+ * @bfqd: the device data the queue belongs to.
1647 -+ * @bfqq: the queue to update.
1648 -+ *
1649 -+ * If the first request of a queue changes we make sure that the queue
1650 -+ * has enough budget to serve at least its first request (if the
1651 -+ * request has grown). We do this because if the queue does not have enough
1652 -+ * budget for its first request, it has to go through two dispatch
1653 -+ * rounds to actually get it dispatched.
1654 -+ */
1655 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
1656 -+ struct bfq_queue *bfqq)
1657 -+{
1658 -+ struct bfq_entity *entity = &bfqq->entity;
1659 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1660 -+ struct request *next_rq = bfqq->next_rq;
1661 -+ unsigned long new_budget;
1662 -+
1663 -+ if (next_rq == NULL)
1664 -+ return;
1665 -+
1666 -+ if (bfqq == bfqd->active_queue)
1667 -+ /*
1668 -+ * In order not to break guarantees, budgets cannot be
1669 -+ * changed after an entity has been selected.
1670 -+ */
1671 -+ return;
1672 -+
1673 -+ BUG_ON(entity->tree != &st->active);
1674 -+ BUG_ON(entity == entity->sched_data->active_entity);
1675 -+
1676 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
1677 -+ bfq_serv_to_charge(next_rq, bfqq));
1678 -+ entity->budget = new_budget;
1679 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1680 -+ bfq_activate_bfqq(bfqd, bfqq);
1681 -+}
1682 -+
1683 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1684 -+{
1685 -+ u64 dur;
1686 -+
1687 -+ if (bfqd->bfq_raising_max_time > 0)
1688 -+ return bfqd->bfq_raising_max_time;
1689 -+
1690 -+ dur = bfqd->RT_prod;
1691 -+ do_div(dur, bfqd->peak_rate);
1692 -+
1693 -+ return dur;
1694 -+}
1695 -+
1696 -+static void bfq_add_rq_rb(struct request *rq)
1697 -+{
1698 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1699 -+ struct bfq_entity *entity = &bfqq->entity;
1700 -+ struct bfq_data *bfqd = bfqq->bfqd;
1701 -+ struct request *next_rq, *prev;
1702 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
1703 -+ int idle_for_long_time = bfqq->budget_timeout +
1704 -+ bfqd->bfq_raising_min_idle_time < jiffies;
1705 -+
1706 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1707 -+ bfqq->queued[rq_is_sync(rq)]++;
1708 -+ bfqd->queued++;
1709 -+
1710 -+ elv_rb_add(&bfqq->sort_list, rq);
1711 -+
1712 -+ /*
1713 -+ * Check if this request is a better next-serve candidate.
1714 -+ */
1715 -+ prev = bfqq->next_rq;
1716 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1717 -+ BUG_ON(next_rq == NULL);
1718 -+ bfqq->next_rq = next_rq;
1719 -+
1720 -+ /*
1721 -+ * Adjust priority tree position, if next_rq changes.
1722 -+ */
1723 -+ if (prev != bfqq->next_rq)
1724 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
1725 -+
1726 -+ if (!bfq_bfqq_busy(bfqq)) {
1727 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1728 -+ bfqq->soft_rt_next_start < jiffies;
1729 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
1730 -+ bfq_serv_to_charge(next_rq, bfqq));
1731 -+
1732 -+		if (!bfqd->low_latency)
1733 -+ goto add_bfqq_busy;
1734 -+
1735 -+ /*
1736 -+ * If the queue is not being boosted and has been idle
1737 -+ * for enough time, start a weight-raising period
1738 -+ */
1739 -+		if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
1740 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1741 -+ if (idle_for_long_time)
1742 -+ bfqq->raising_cur_max_time =
1743 -+ bfq_wrais_duration(bfqd);
1744 -+ else
1745 -+ bfqq->raising_cur_max_time =
1746 -+ bfqd->bfq_raising_rt_max_time;
1747 -+ bfq_log_bfqq(bfqd, bfqq,
1748 -+				     "wrais starting at %llu msec, "
1749 -+ "rais_max_time %u",
1750 -+ bfqq->last_rais_start_finish,
1751 -+ jiffies_to_msecs(bfqq->
1752 -+ raising_cur_max_time));
1753 -+ } else if (old_raising_coeff > 1) {
1754 -+ if (idle_for_long_time)
1755 -+ bfqq->raising_cur_max_time =
1756 -+ bfq_wrais_duration(bfqd);
1757 -+ else if (bfqq->raising_cur_max_time ==
1758 -+ bfqd->bfq_raising_rt_max_time &&
1759 -+ !soft_rt) {
1760 -+ bfqq->raising_coeff = 1;
1761 -+ bfq_log_bfqq(bfqd, bfqq,
1762 -+					     "wrais ending at %llu msec, "
1763 -+ "rais_max_time %u",
1764 -+ bfqq->last_rais_start_finish,
1765 -+ jiffies_to_msecs(bfqq->
1766 -+ raising_cur_max_time));
1767 -+ }
1768 -+ }
1769 -+ if (old_raising_coeff != bfqq->raising_coeff)
1770 -+ entity->ioprio_changed = 1;
1771 -+add_bfqq_busy:
1772 -+ bfq_add_bfqq_busy(bfqd, bfqq);
1773 -+ } else {
1774 -+		if (bfqd->low_latency && old_raising_coeff == 1 &&
1775 -+ !rq_is_sync(rq) &&
1776 -+ bfqq->last_rais_start_finish +
1777 -+ bfqd->bfq_raising_min_inter_arr_async < jiffies) {
1778 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1779 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1780 -+
1781 -+ entity->ioprio_changed = 1;
1782 -+ bfq_log_bfqq(bfqd, bfqq,
1783 -+				     "non-idle wrais starting at %llu msec, "
1784 -+ "rais_max_time %u",
1785 -+ bfqq->last_rais_start_finish,
1786 -+ jiffies_to_msecs(bfqq->
1787 -+ raising_cur_max_time));
1788 -+ }
1789 -+ bfq_updated_next_req(bfqd, bfqq);
1790 -+ }
1791 -+
1792 -+	if (bfqd->low_latency &&
1793 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1794 -+ idle_for_long_time))
1795 -+ bfqq->last_rais_start_finish = jiffies;
1796 -+}
1797 -+
1798 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1799 -+{
1800 -+ elv_rb_del(&bfqq->sort_list, rq);
1801 -+ bfqq->queued[rq_is_sync(rq)]--;
1802 -+ bfqq->bfqd->queued--;
1803 -+ bfq_add_rq_rb(rq);
1804 -+}
1805 -+
1806 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1807 -+ struct bio *bio)
1808 -+{
1809 -+ struct task_struct *tsk = current;
1810 -+ struct bfq_io_cq *bic;
1811 -+ struct bfq_queue *bfqq;
1812 -+
1813 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1814 -+ if (bic == NULL)
1815 -+ return NULL;
1816 -+
1817 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1818 -+ if (bfqq != NULL) {
1819 -+ sector_t sector = bio->bi_sector + bio_sectors(bio);
1820 -+
1821 -+ return elv_rb_find(&bfqq->sort_list, sector);
1822 -+ }
1823 -+
1824 -+ return NULL;
1825 -+}
1826 -+
1827 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
1828 -+{
1829 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1830 -+
1831 -+ bfqd->rq_in_driver++;
1832 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1833 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1834 -+ (long long unsigned)bfqd->last_position);
1835 -+}
1836 -+
1837 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1838 -+{
1839 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1840 -+
1841 -+ WARN_ON(bfqd->rq_in_driver == 0);
1842 -+ bfqd->rq_in_driver--;
1843 -+}
1844 -+
1845 -+static void bfq_remove_request(struct request *rq)
1846 -+{
1847 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1848 -+ struct bfq_data *bfqd = bfqq->bfqd;
1849 -+
1850 -+ if (bfqq->next_rq == rq) {
1851 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1852 -+ bfq_updated_next_req(bfqd, bfqq);
1853 -+ }
1854 -+
1855 -+ list_del_init(&rq->queuelist);
1856 -+ bfq_del_rq_rb(rq);
1857 -+
1858 -+ if (rq->cmd_flags & REQ_META) {
1859 -+ WARN_ON(bfqq->meta_pending == 0);
1860 -+ bfqq->meta_pending--;
1861 -+ }
1862 -+}
1863 -+
1864 -+static int bfq_merge(struct request_queue *q, struct request **req,
1865 -+ struct bio *bio)
1866 -+{
1867 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1868 -+ struct request *__rq;
1869 -+
1870 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
1871 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1872 -+ *req = __rq;
1873 -+ return ELEVATOR_FRONT_MERGE;
1874 -+ }
1875 -+
1876 -+ return ELEVATOR_NO_MERGE;
1877 -+}
1878 -+
1879 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
1880 -+ int type)
1881 -+{
1882 -+ if (type == ELEVATOR_FRONT_MERGE) {
1883 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
1884 -+
1885 -+ bfq_reposition_rq_rb(bfqq, req);
1886 -+ }
1887 -+}
1888 -+
1889 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1890 -+ struct request *next)
1891 -+{
1892 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1893 -+
1894 -+ /*
1895 -+ * Reposition in fifo if next is older than rq.
1896 -+ */
1897 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1898 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1899 -+ list_move(&rq->queuelist, &next->queuelist);
1900 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
1901 -+ }
1902 -+
1903 -+ if (bfqq->next_rq == next)
1904 -+ bfqq->next_rq = rq;
1905 -+
1906 -+ bfq_remove_request(next);
1907 -+}
1908 -+
1909 -+/* Must be called with bfqq != NULL */
1910 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
1911 -+{
1912 -+ BUG_ON(bfqq == NULL);
1913 -+ bfqq->raising_coeff = 1;
1914 -+ bfqq->raising_cur_max_time = 0;
1915 -+ /* Trigger a weight change on the next activation of the queue */
1916 -+ bfqq->entity.ioprio_changed = 1;
1917 -+}
1918 -+
1919 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
1920 -+ struct bfq_group *bfqg)
1921 -+{
1922 -+ int i, j;
1923 -+
1924 -+ for (i = 0; i < 2; i++)
1925 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
1926 -+ if (bfqg->async_bfqq[i][j] != NULL)
1927 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
1928 -+ if (bfqg->async_idle_bfqq != NULL)
1929 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
1930 -+}
1931 -+
1932 -+static void bfq_end_raising(struct bfq_data *bfqd)
1933 -+{
1934 -+ struct bfq_queue *bfqq;
1935 -+
1936 -+ spin_lock_irq(bfqd->queue->queue_lock);
1937 -+
1938 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
1939 -+ bfq_bfqq_end_raising(bfqq);
1940 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
1941 -+ bfq_bfqq_end_raising(bfqq);
1942 -+ bfq_end_raising_async(bfqd);
1943 -+
1944 -+ spin_unlock_irq(bfqd->queue->queue_lock);
1945 -+}
1946 -+
1947 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
1948 -+ struct bio *bio)
1949 -+{
1950 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1951 -+ struct bfq_io_cq *bic;
1952 -+ struct bfq_queue *bfqq;
1953 -+
1954 -+ /*
1955 -+ * Disallow merge of a sync bio into an async request.
1956 -+ */
1957 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
1958 -+ return 0;
1959 -+
1960 -+ /*
1961 -+ * Lookup the bfqq that this bio will be queued with. Allow
1962 -+ * merge only if rq is queued there.
1963 -+ * Queue lock is held here.
1964 -+ */
1965 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
1966 -+ if (bic == NULL)
1967 -+ return 0;
1968 -+
1969 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1970 -+ return bfqq == RQ_BFQQ(rq);
1971 -+}
1972 -+
1973 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
1974 -+ struct bfq_queue *bfqq)
1975 -+{
1976 -+ if (bfqq != NULL) {
1977 -+ bfq_mark_bfqq_must_alloc(bfqq);
1978 -+ bfq_mark_bfqq_budget_new(bfqq);
1979 -+ bfq_clear_bfqq_fifo_expire(bfqq);
1980 -+
1981 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
1982 -+
1983 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
1984 -+ bfqq->entity.budget);
1985 -+ }
1986 -+
1987 -+ bfqd->active_queue = bfqq;
1988 -+}
1989 -+
1990 -+/*
1991 -+ * Get and set a new active queue for service.
1992 -+ */
1993 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
1994 -+ struct bfq_queue *bfqq)
1995 -+{
1996 -+ if (!bfqq)
1997 -+ bfqq = bfq_get_next_queue(bfqd);
1998 -+ else
1999 -+ bfq_get_next_queue_forced(bfqd, bfqq);
2000 -+
2001 -+ __bfq_set_active_queue(bfqd, bfqq);
2002 -+ return bfqq;
2003 -+}
2004 -+
2005 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2006 -+ struct request *rq)
2007 -+{
2008 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
2009 -+ return blk_rq_pos(rq) - bfqd->last_position;
2010 -+ else
2011 -+ return bfqd->last_position - blk_rq_pos(rq);
2012 -+}
2013 -+
2014 -+/*
2015 -+ * Return true if bfqq has no request pending and rq is close enough to
2016 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
2017 -+ * bfqq->next_rq
2018 -+ */
2019 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2020 -+{
2021 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2022 -+}
2023 -+
2024 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2025 -+{
2026 -+ struct rb_root *root = &bfqd->rq_pos_tree;
2027 -+ struct rb_node *parent, *node;
2028 -+ struct bfq_queue *__bfqq;
2029 -+ sector_t sector = bfqd->last_position;
2030 -+
2031 -+ if (RB_EMPTY_ROOT(root))
2032 -+ return NULL;
2033 -+
2034 -+ /*
2035 -+ * First, if we find a request starting at the end of the last
2036 -+ * request, choose it.
2037 -+ */
2038 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2039 -+ if (__bfqq != NULL)
2040 -+ return __bfqq;
2041 -+
2042 -+ /*
2043 -+ * If the exact sector wasn't found, the parent of the NULL leaf
2044 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
2045 -+ * position).
2046 -+ */
2047 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2048 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2049 -+ return __bfqq;
2050 -+
2051 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
2052 -+ node = rb_next(&__bfqq->pos_node);
2053 -+ else
2054 -+ node = rb_prev(&__bfqq->pos_node);
2055 -+ if (node == NULL)
2056 -+ return NULL;
2057 -+
2058 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2059 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2060 -+ return __bfqq;
2061 -+
2062 -+ return NULL;
2063 -+}
2064 -+
2065 -+/*
2066 -+ * bfqd - the device data.
2067 -+ * cur_bfqq - passed in so that we don't decide that the current queue
2068 -+ * is closely cooperating with itself.
2069 -+ *
2070 -+ * We are assuming that cur_bfqq has dispatched at least one request,
2071 -+ * and that bfqd->last_position reflects a position on the disk associated
2072 -+ * with the I/O issued by cur_bfqq.
2073 -+ */
2074 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2075 -+ struct bfq_queue *cur_bfqq)
2076 -+{
2077 -+ struct bfq_queue *bfqq;
2078 -+
2079 -+ if (bfq_class_idle(cur_bfqq))
2080 -+ return NULL;
2081 -+ if (!bfq_bfqq_sync(cur_bfqq))
2082 -+ return NULL;
2083 -+ if (BFQQ_SEEKY(cur_bfqq))
2084 -+ return NULL;
2085 -+
2086 -+ /* If device has only one backlogged bfq_queue, don't search. */
2087 -+ if (bfqd->busy_queues == 1)
2088 -+ return NULL;
2089 -+
2090 -+ /*
2091 -+ * We should notice if some of the queues are cooperating, e.g.
2092 -+ * working closely on the same area of the disk. In that case,
2093 -+	 * we can group them together and not waste time idling.
2094 -+ */
2095 -+ bfqq = bfqq_close(bfqd);
2096 -+ if (bfqq == NULL || bfqq == cur_bfqq)
2097 -+ return NULL;
2098 -+
2099 -+ /*
2100 -+ * Do not merge queues from different bfq_groups.
2101 -+ */
2102 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2103 -+ return NULL;
2104 -+
2105 -+ /*
2106 -+ * It only makes sense to merge sync queues.
2107 -+ */
2108 -+ if (!bfq_bfqq_sync(bfqq))
2109 -+ return NULL;
2110 -+ if (BFQQ_SEEKY(bfqq))
2111 -+ return NULL;
2112 -+
2113 -+ /*
2114 -+ * Do not merge queues of different priority classes.
2115 -+ */
2116 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2117 -+ return NULL;
2118 -+
2119 -+ return bfqq;
2120 -+}
2121 -+
2122 -+/*
2123 -+ * If enough samples have been computed, return the current max budget
2124 -+ * stored in bfqd, which is dynamically updated according to the
2125 -+ * estimated disk peak rate; otherwise return the default max budget
2126 -+ */
2127 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2128 -+{
2129 -+ if (bfqd->budgets_assigned < 194)
2130 -+ return bfq_default_max_budget;
2131 -+ else
2132 -+ return bfqd->bfq_max_budget;
2133 -+}
2134 -+
2135 -+/*
2136 -+ * Return min budget, which is a fraction of the current or default
2137 -+ * max budget (trying with 1/32)
2138 -+ */
2139 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2140 -+{
2141 -+ if (bfqd->budgets_assigned < 194)
2142 -+ return bfq_default_max_budget / 32;
2143 -+ else
2144 -+ return bfqd->bfq_max_budget / 32;
2145 -+}
2146 -+
2147 -+/*
2148 -+ * Decides whether idling should be done for the given device and
2149 -+ * the given active queue.
2150 -+ */
2151 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2152 -+ struct bfq_queue *active_bfqq)
2153 -+{
2154 -+ if (active_bfqq == NULL)
2155 -+ return false;
2156 -+ /*
2157 -+ * If device is SSD it has no seek penalty, disable idling; but
2158 -+ * do so only if:
2159 -+ * - device does not support queuing, otherwise we still have
2160 -+ * a problem with sync vs async workloads;
2161 -+ * - the queue is not weight-raised, to preserve guarantees.
2162 -+ */
2163 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2164 -+ active_bfqq->raising_coeff == 1);
2165 -+}
2166 -+
2167 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2168 -+{
2169 -+ struct bfq_queue *bfqq = bfqd->active_queue;
2170 -+ struct bfq_io_cq *bic;
2171 -+ unsigned long sl;
2172 -+
2173 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2174 -+
2175 -+ /* Tasks have exited, don't wait. */
2176 -+ bic = bfqd->active_bic;
2177 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2178 -+ return;
2179 -+
2180 -+ bfq_mark_bfqq_wait_request(bfqq);
2181 -+
2182 -+ /*
2183 -+ * We don't want to idle for seeks, but we do want to allow
2184 -+ * fair distribution of slice time for a process doing back-to-back
2185 -+	 * seeks. So allow a little bit of time for it to submit a new rq.
2186 -+ *
2187 -+ * To prevent processes with (partly) seeky workloads from
2188 -+ * being too ill-treated, grant them a small fraction of the
2189 -+ * assigned budget before reducing the waiting time to
2190 -+	 * BFQ_MIN_TT. In practice, this helps reduce latency.
2191 -+ */
2192 -+ sl = bfqd->bfq_slice_idle;
2193 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2194 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2195 -+ bfqq->raising_coeff == 1)
2196 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2197 -+ else if (bfqq->raising_coeff > 1)
2198 -+ sl = sl * 3;
2199 -+ bfqd->last_idling_start = ktime_get();
2200 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2201 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
2202 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2203 -+}
2204 -+
2205 -+/*
2206 -+ * Set the maximum time for the active queue to consume its
2207 -+ * budget. This prevents seeky processes from lowering the disk
2208 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
2209 -+ */
2210 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2211 -+{
2212 -+ struct bfq_queue *bfqq = bfqd->active_queue;
2213 -+ unsigned int timeout_coeff;
2214 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2215 -+ timeout_coeff = 1;
2216 -+ else
2217 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2218 -+
2219 -+ bfqd->last_budget_start = ktime_get();
2220 -+
2221 -+ bfq_clear_bfqq_budget_new(bfqq);
2222 -+ bfqq->budget_timeout = jiffies +
2223 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2224 -+
2225 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2226 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2227 -+ timeout_coeff));
2228 -+}
2229 -+
2230 -+/*
2231 -+ * Move request from internal lists to the request queue dispatch list.
2232 -+ */
2233 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2234 -+{
2235 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2236 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2237 -+
2238 -+ bfq_remove_request(rq);
2239 -+ bfqq->dispatched++;
2240 -+ elv_dispatch_sort(q, rq);
2241 -+
2242 -+ if (bfq_bfqq_sync(bfqq))
2243 -+ bfqd->sync_flight++;
2244 -+}
2245 -+
2246 -+/*
2247 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
2248 -+ */
2249 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2250 -+{
2251 -+ struct request *rq = NULL;
2252 -+
2253 -+ if (bfq_bfqq_fifo_expire(bfqq))
2254 -+ return NULL;
2255 -+
2256 -+ bfq_mark_bfqq_fifo_expire(bfqq);
2257 -+
2258 -+ if (list_empty(&bfqq->fifo))
2259 -+ return NULL;
2260 -+
2261 -+ rq = rq_entry_fifo(bfqq->fifo.next);
2262 -+
2263 -+ if (time_before(jiffies, rq_fifo_time(rq)))
2264 -+ return NULL;
2265 -+
2266 -+ return rq;
2267 -+}
2268 -+
2269 -+/*
2270 -+ * Must be called with the queue_lock held.
2271 -+ */
2272 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
2273 -+{
2274 -+ int process_refs, io_refs;
2275 -+
2276 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2277 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2278 -+ BUG_ON(process_refs < 0);
2279 -+ return process_refs;
2280 -+}
2281 -+
2282 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2283 -+{
2284 -+ int process_refs, new_process_refs;
2285 -+ struct bfq_queue *__bfqq;
2286 -+
2287 -+ /*
2288 -+ * If there are no process references on the new_bfqq, then it is
2289 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2290 -+ * may have dropped their last reference (not just their last process
2291 -+ * reference).
2292 -+ */
2293 -+ if (!bfqq_process_refs(new_bfqq))
2294 -+ return;
2295 -+
2296 -+ /* Avoid a circular list and skip interim queue merges. */
2297 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
2298 -+ if (__bfqq == bfqq)
2299 -+ return;
2300 -+ new_bfqq = __bfqq;
2301 -+ }
2302 -+
2303 -+ process_refs = bfqq_process_refs(bfqq);
2304 -+ new_process_refs = bfqq_process_refs(new_bfqq);
2305 -+ /*
2306 -+ * If the process for the bfqq has gone away, there is no
2307 -+ * sense in merging the queues.
2308 -+ */
2309 -+ if (process_refs == 0 || new_process_refs == 0)
2310 -+ return;
2311 -+
2312 -+ /*
2313 -+ * Merge in the direction of the lesser amount of work.
2314 -+ */
2315 -+ if (new_process_refs >= process_refs) {
2316 -+ bfqq->new_bfqq = new_bfqq;
2317 -+ atomic_add(process_refs, &new_bfqq->ref);
2318 -+ } else {
2319 -+ new_bfqq->new_bfqq = bfqq;
2320 -+ atomic_add(new_process_refs, &bfqq->ref);
2321 -+ }
2322 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2323 -+ new_bfqq->pid);
2324 -+}
2325 -+
2326 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2327 -+{
2328 -+ struct bfq_entity *entity = &bfqq->entity;
2329 -+ return entity->budget - entity->service;
2330 -+}
2331 -+
2332 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2333 -+{
2334 -+ BUG_ON(bfqq != bfqd->active_queue);
2335 -+
2336 -+ __bfq_bfqd_reset_active(bfqd);
2337 -+
2338 -+ /*
2339 -+ * If this bfqq is shared between multiple processes, check
2340 -+ * to make sure that those processes are still issuing I/Os
2341 -+ * within the mean seek distance. If not, it may be time to
2342 -+ * break the queues apart again.
2343 -+ */
2344 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2345 -+ bfq_mark_bfqq_split_coop(bfqq);
2346 -+
2347 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2348 -+ /*
2349 -+		 * Overloading the budget_timeout field to store the time
2350 -+		 * at which the queue became empty (no backlog); used by
2351 -+		 * the weight-raising mechanism.
2352 -+		 */
2353 -+		bfqq->budget_timeout = jiffies;
2354 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2355 -+ } else {
2356 -+ bfq_activate_bfqq(bfqd, bfqq);
2357 -+ /*
2358 -+ * Resort priority tree of potential close cooperators.
2359 -+ */
2360 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
2361 -+ }
2362 -+}
2363 -+
2364 -+/**
2365 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2366 -+ * @bfqd: device data.
2367 -+ * @bfqq: queue to update.
2368 -+ * @reason: reason for expiration.
2369 -+ *
2370 -+ * Handle the feedback on @bfqq budget. See the body for detailed
2371 -+ * comments.
2372 -+ */
2373 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2374 -+ struct bfq_queue *bfqq,
2375 -+ enum bfqq_expiration reason)
2376 -+{
2377 -+ struct request *next_rq;
2378 -+ unsigned long budget, min_budget;
2379 -+
2380 -+ budget = bfqq->max_budget;
2381 -+ min_budget = bfq_min_budget(bfqd);
2382 -+
2383 -+ BUG_ON(bfqq != bfqd->active_queue);
2384 -+
2385 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2386 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2387 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2388 -+ budget, bfq_min_budget(bfqd));
2389 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2390 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
2391 -+
2392 -+ if (bfq_bfqq_sync(bfqq)) {
2393 -+ switch (reason) {
2394 -+ /*
2395 -+ * Caveat: in all the following cases we trade latency
2396 -+ * for throughput.
2397 -+ */
2398 -+ case BFQ_BFQQ_TOO_IDLE:
2399 -+ /*
2400 -+ * This is the only case where we may reduce
2401 -+			 * the budget: if there are no requests of the
2402 -+ * process still waiting for completion, then
2403 -+ * we assume (tentatively) that the timer has
2404 -+ * expired because the batch of requests of
2405 -+ * the process could have been served with a
2406 -+ * smaller budget. Hence, betting that
2407 -+			 * the process will behave in the same way when it
2408 -+ * becomes backlogged again, we reduce its
2409 -+ * next budget. As long as we guess right,
2410 -+ * this budget cut reduces the latency
2411 -+ * experienced by the process.
2412 -+ *
2413 -+ * However, if there are still outstanding
2414 -+ * requests, then the process may have not yet
2415 -+ * issued its next request just because it is
2416 -+ * still waiting for the completion of some of
2417 -+			 * the still outstanding ones. So in this
2418 -+ * subcase we do not reduce its budget, on the
2419 -+			 * contrary, we increase it to possibly boost
2420 -+ * the throughput, as discussed in the
2421 -+ * comments to the BUDGET_TIMEOUT case.
2422 -+ */
2423 -+			if (bfqq->dispatched > 0) /* still outstanding reqs */
2424 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2425 -+ else {
2426 -+ if (budget > 5 * min_budget)
2427 -+ budget -= 4 * min_budget;
2428 -+ else
2429 -+ budget = min_budget;
2430 -+ }
2431 -+ break;
2432 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
2433 -+ /*
2434 -+ * We double the budget here because: 1) it
2435 -+ * gives the chance to boost the throughput if
2436 -+ * this is not a seeky process (which may have
2437 -+ * bumped into this timeout because of, e.g.,
2438 -+ * ZBR), 2) together with charge_full_budget
2439 -+ * it helps give seeky processes higher
2440 -+ * timestamps, and hence be served less
2441 -+ * frequently.
2442 -+ */
2443 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2444 -+ break;
2445 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2446 -+ /*
2447 -+ * The process still has backlog, and did not
2448 -+ * let either the budget timeout or the disk
2449 -+ * idling timeout expire. Hence it is not
2450 -+ * seeky, has a short thinktime and may be
2451 -+ * happy with a higher budget too. So
2452 -+ * definitely increase the budget of this good
2453 -+ * candidate to boost the disk throughput.
2454 -+ */
2455 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
2456 -+ break;
2457 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
2458 -+ /*
2459 -+ * Leave the budget unchanged.
2460 -+ */
2461 -+ default:
2462 -+ return;
2463 -+ }
2464 -+ } else /* async queue */
2465 -+		/* async queues always get the maximum possible budget
2466 -+ * (their ability to dispatch is limited by
2467 -+ * @bfqd->bfq_max_budget_async_rq).
2468 -+ */
2469 -+ budget = bfqd->bfq_max_budget;
2470 -+
2471 -+ bfqq->max_budget = budget;
2472 -+
2473 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2474 -+ bfqq->max_budget > bfqd->bfq_max_budget)
2475 -+ bfqq->max_budget = bfqd->bfq_max_budget;
2476 -+
2477 -+ /*
2478 -+ * Make sure that we have enough budget for the next request.
2479 -+ * Since the finish time of the bfqq must be kept in sync with
2480 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
2481 -+ * update.
2482 -+ */
2483 -+ next_rq = bfqq->next_rq;
2484 -+ if (next_rq != NULL)
2485 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2486 -+ bfq_serv_to_charge(next_rq, bfqq));
2487 -+ else
2488 -+ bfqq->entity.budget = bfqq->max_budget;
2489 -+
2490 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2491 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2492 -+ bfqq->entity.budget);
2493 -+}
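The feedback above reduces to a small set of per-reason updates on a sync queue's budget: shrink by four min-budgets when the queue was truly idle (or grow if requests are still outstanding), double on a budget timeout, quadruple on budget exhaustion, leave it unchanged otherwise, always clamping to the device-wide maximum. A self-contained sketch of those rules with made-up numbers (the enum and helper below are illustrative, not from the patch):

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

/* Illustrative sketch of the budget feedback described above. */
static unsigned long next_budget(unsigned long budget, enum reason r,
				 int outstanding, unsigned long max_b,
				 unsigned long min_b)
{
	switch (r) {
	case TOO_IDLE:
		if (outstanding)		/* completions still pending: grow */
			return budget * 2 > max_b ? max_b : budget * 2;
		return budget > 5 * min_b ? budget - 4 * min_b : min_b;
	case BUDGET_TIMEOUT:
		return budget * 2 > max_b ? max_b : budget * 2;
	case BUDGET_EXHAUSTED:
		return budget * 4 > max_b ? max_b : budget * 4;
	default:
		return budget;			/* NO_MORE_REQUESTS: unchanged */
	}
}

int main(void)
{
	unsigned long b = 4096, max_b = 16384, min_b = 512;

	printf("too idle, nothing outstanding: %lu -> %lu\n",
	       b, next_budget(b, TOO_IDLE, 0, max_b, min_b));
	printf("budget timeout:                %lu -> %lu\n",
	       b, next_budget(b, BUDGET_TIMEOUT, 0, max_b, min_b));
	printf("budget exhausted:              %lu -> %lu\n",
	       b, next_budget(b, BUDGET_EXHAUSTED, 0, max_b, min_b));
	return 0;
}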
2494 -+
2495 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2496 -+{
2497 -+ unsigned long max_budget;
2498 -+
2499 -+ /*
2500 -+ * The max_budget calculated when autotuning is equal to the
2501 -+	 * number of sectors transferred in timeout_sync at the
2502 -+ * estimated peak rate.
2503 -+ */
2504 -+ max_budget = (unsigned long)(peak_rate * 1000 *
2505 -+ timeout >> BFQ_RATE_SHIFT);
2506 -+
2507 -+ return max_budget;
2508 -+}
2509 -+
2510 -+/*
2511 -+ * In addition to updating the peak rate, checks whether the process
2512 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
2513 -+ * to the budget timeout, to reduce the amount of service provided to
2514 -+ * seeky processes, and hence reduce their chances to lower the
2515 -+ * throughput. See the code for more details.
2516 -+ */
2517 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2518 -+ int compensate, enum bfqq_expiration reason)
2519 -+{
2520 -+ u64 bw, usecs, expected, timeout;
2521 -+ ktime_t delta;
2522 -+ int update = 0;
2523 -+
2524 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2525 -+ return 0;
2526 -+
2527 -+ if (compensate)
2528 -+ delta = bfqd->last_idling_start;
2529 -+ else
2530 -+ delta = ktime_get();
2531 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
2532 -+ usecs = ktime_to_us(delta);
2533 -+
2534 -+ /* Don't trust short/unrealistic values. */
2535 -+ if (usecs < 100 || usecs >= LONG_MAX)
2536 -+ return 0;
2537 -+
2538 -+ /*
2539 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
2540 -+ * value to store the peak rate, in sectors per usec in fixed
2541 -+ * point math. We do so to have enough precision in the estimate
2542 -+ * and to avoid overflows.
2543 -+ */
2544 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2545 -+ do_div(bw, (unsigned long)usecs);
2546 -+
2547 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2548 -+
2549 -+ /*
2550 -+ * Use only long (> 20ms) intervals to filter out spikes for
2551 -+ * the peak rate estimation.
2552 -+ */
2553 -+ if (usecs > 20000) {
2554 -+ if (bw > bfqd->peak_rate ||
2555 -+ (!BFQQ_SEEKY(bfqq) &&
2556 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2557 -+ bfq_log(bfqd, "measured bw =%llu", bw);
2558 -+ /*
2559 -+ * To smooth oscillations use a low-pass filter with
2560 -+ * alpha=7/8, i.e.,
2561 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
2562 -+ */
2563 -+ do_div(bw, 8);
2564 -+ if (bw == 0)
2565 -+ return 0;
2566 -+ bfqd->peak_rate *= 7;
2567 -+ do_div(bfqd->peak_rate, 8);
2568 -+ bfqd->peak_rate += bw;
2569 -+ update = 1;
2570 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2571 -+ }
2572 -+
2573 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2574 -+
2575 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2576 -+ bfqd->peak_rate_samples++;
2577 -+
2578 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2579 -+ update && bfqd->bfq_user_max_budget == 0) {
2580 -+ bfqd->bfq_max_budget =
2581 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2582 -+ bfq_log(bfqd, "new max_budget=%lu",
2583 -+ bfqd->bfq_max_budget);
2584 -+ }
2585 -+ }
2586 -+
2587 -+ /*
2588 -+	 * If the process has been served for too short a time
2589 -+	 * interval to let its possible sequential accesses prevail over
2590 -+	 * the initial seek time needed to move the disk head onto the
2591 -+ * first sector it requested, then give the process a chance
2592 -+ * and for the moment return false.
2593 -+ */
2594 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2595 -+ return 0;
2596 -+
2597 -+ /*
2598 -+ * A process is considered ``slow'' (i.e., seeky, so that we
2599 -+ * cannot treat it fairly in the service domain, as it would
2600 -+	 * slow down the other processes too much) if, when a slice
2601 -+ * ends for whatever reason, it has received service at a
2602 -+ * rate that would not be high enough to complete the budget
2603 -+ * before the budget timeout expiration.
2604 -+ */
2605 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2606 -+
2607 -+ /*
2608 -+ * Caveat: processes doing IO in the slower disk zones will
2609 -+ * tend to be slow(er) even if not seeky. And the estimated
2610 -+ * peak rate will actually be an average over the disk
2611 -+ * surface. Hence, to not be too harsh with unlucky processes,
2612 -+ * we keep a budget/3 margin of safety before declaring a
2613 -+ * process slow.
2614 -+ */
2615 -+ return expected > (4 * bfqq->entity.budget) / 3;
2616 -+}
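The peak-rate update above is a fixed-point low-pass filter, new_rate = (7/8) * old_rate + (1/8) * measured_bw, applied only to sufficiently long observation intervals. A minimal sketch of just the smoothing step, with plain integers and made-up sample values (illustrative only, not from the patch):

#include <stdio.h>

/* Illustrative sketch: new_rate = (7/8) * old_rate + (1/8) * sample,
 * using integer arithmetic as the scheduler does. */
static unsigned long long lowpass(unsigned long long old_rate,
				  unsigned long long sample)
{
	return old_rate * 7 / 8 + sample / 8;
}

int main(void)
{
	unsigned long long rate = 1000;		/* e.g. fixed-point sectors/usec */
	unsigned long long samples[] = { 1200, 800, 1600 };

	for (int i = 0; i < 3; i++) {
		rate = lowpass(rate, samples[i]);
		printf("sample %llu -> smoothed rate %llu\n", samples[i], rate);
	}
	return 0;
}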
2617 -+
2618 -+/**
2619 -+ * bfq_bfqq_expire - expire a queue.
2620 -+ * @bfqd: device owning the queue.
2621 -+ * @bfqq: the queue to expire.
2622 -+ * @compensate: if true, compensate for the time spent idling.
2623 -+ * @reason: the reason causing the expiration.
2624 -+ *
2625 -+ *
2626 -+ * If the process associated with the queue is slow (i.e., seeky), or in
2627 -+ * case of budget timeout, or, finally, if it is async, we
2628 -+ * artificially charge it an entire budget (independently of the
2629 -+ * actual service it received). As a consequence, the queue will get
2630 -+ * higher timestamps than the correct ones upon reactivation, and
2631 -+ * hence it will be rescheduled as if it had received more service
2632 -+ * than what it actually received. In the end, this class of processes
2633 -+ * will receive less service in proportion to how slowly they consume
2634 -+ * their budgets (and hence how seriously they tend to lower the
2635 -+ * throughput).
2636 -+ *
2637 -+ * In contrast, when a queue expires because it has been idling for
2638 -+ * too long or because it exhausted its budget, we do not touch the
2639 -+ * amount of service it has received. Hence when the queue will be
2640 -+ * reactivated and its timestamps updated, the latter will be in sync
2641 -+ * with the actual service received by the queue until expiration.
2642 -+ *
2643 -+ * Charging a full budget to the first type of queues and the exact
2644 -+ * service to the others has the effect of using the WF2Q+ policy to
2645 -+ * schedule the former on a timeslice basis, without violating the
2646 -+ * service domain guarantees of the latter.
2647 -+ */
2648 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
2649 -+ struct bfq_queue *bfqq,
2650 -+ int compensate,
2651 -+ enum bfqq_expiration reason)
2652 -+{
2653 -+ int slow;
2654 -+ BUG_ON(bfqq != bfqd->active_queue);
2655 -+
2656 -+ /* Update disk peak rate for autotuning and check whether the
2657 -+ * process is slow (see bfq_update_peak_rate).
2658 -+ */
2659 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2660 -+
2661 -+ /*
2662 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
2663 -+ * and async queues, to favor sequential sync workloads.
2664 -+ *
2665 -+ * Processes doing IO in the slower disk zones will tend to be
2666 -+ * slow(er) even if not seeky. Hence, since the estimated peak
2667 -+ * rate is actually an average over the disk surface, these
2668 -+ * processes may timeout just for bad luck. To avoid punishing
2669 -+ * them we do not charge a full budget to a process that
2670 -+ * succeeded in consuming at least 2/3 of its budget.
2671 -+ */
2672 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2673 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2674 -+ bfq_bfqq_charge_full_budget(bfqq);
2675 -+
2676 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2677 -+ bfqq->last_rais_start_finish = jiffies;
2678 -+
2679 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2680 -+		if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
2681 -+ bfqq->soft_rt_next_start =
2682 -+ jiffies +
2683 -+ HZ * bfqq->entity.service /
2684 -+ bfqd->bfq_raising_max_softrt_rate;
2685 -+ else
2686 -+ bfqq->soft_rt_next_start = -1; /* infinity */
2687 -+ }
2688 -+ bfq_log_bfqq(bfqd, bfqq,
2689 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2690 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2691 -+
2692 -+ /* Increase, decrease or leave budget unchanged according to reason */
2693 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2694 -+ __bfq_bfqq_expire(bfqd, bfqq);
2695 -+}
2696 -+
2697 -+/*
2698 -+ * Budget timeout is not implemented through a dedicated timer, but
2699 -+ * just checked on request arrivals and completions, as well as on
2700 -+ * idle timer expirations.
2701 -+ */
2702 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2703 -+{
2704 -+ if (bfq_bfqq_budget_new(bfqq))
2705 -+ return 0;
2706 -+
2707 -+ if (time_before(jiffies, bfqq->budget_timeout))
2708 -+ return 0;
2709 -+
2710 -+ return 1;
2711 -+}
2712 -+
2713 -+/*
2714 -+ * If we expire a queue that is waiting for the arrival of a new
2715 -+ * request, we may prevent the fictitious timestamp backshifting that
2716 -+ * allows the guarantees of the queue to be preserved (see [1] for
2717 -+ * this tricky aspect). Hence we return true only if this condition
2718 -+ * does not hold, or if the queue is slow enough to deserve only to be
2719 -+ * kicked off for preserving a high throughput.
2720 -+*/
2721 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2722 -+{
2723 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
2724 -+ "may_budget_timeout: wr %d left %d timeout %d",
2725 -+ bfq_bfqq_wait_request(bfqq),
2726 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2727 -+ bfq_bfqq_budget_timeout(bfqq));
2728 -+
2729 -+ return (!bfq_bfqq_wait_request(bfqq) ||
2730 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2731 -+ &&
2732 -+ bfq_bfqq_budget_timeout(bfqq);
2733 -+}
2734 -+
2735 -+/*
2736 -+ * If the active queue is empty, but it is sync and either of the following
2737 -+ * conditions holds, then: 1) the queue must remain active and cannot be
2738 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
2739 -+ * of a new request for the queue. The conditions are:
2740 -+ * - the device is rotational and not performing NCQ, and the queue has its
2741 -+ * idle window set (in this case, waiting for a new request for the queue
2742 -+ * is likely to boost the disk throughput);
2743 -+ * - the queue is weight-raised (waiting for the request is necessary for
2744 -+ * providing the queue with fairness and latency guarantees).
2745 -+ */
2746 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
2747 -+ int budg_timeout)
2748 -+{
2749 -+ struct bfq_data *bfqd = bfqq->bfqd;
2750 -+
2751 -+ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
2752 -+ bfqd->bfq_slice_idle != 0 &&
2753 -+ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
2754 -+ !blk_queue_nonrot(bfqd->queue))
2755 -+ || bfqq->raising_coeff > 1) &&
2756 -+ (bfqd->rq_in_driver == 0 ||
2757 -+ budg_timeout ||
2758 -+ bfqq->raising_coeff > 1) &&
2759 -+ !bfq_close_cooperator(bfqd, bfqq) &&
2760 -+ (!bfq_bfqq_coop(bfqq) ||
2761 -+ !bfq_bfqq_some_coop_idle(bfqq)) &&
2762 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
2763 -+}
2764 -+
2765 -+/*
2766 -+ * Select a queue for service. If we have a current active queue,
2767 -+ * check whether to continue servicing it, or retrieve and set a new one.
2768 -+ */
2769 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
2770 -+{
2771 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
2772 -+ struct request *next_rq;
2773 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
2774 -+ int budg_timeout;
2775 -+
2776 -+ bfqq = bfqd->active_queue;
2777 -+ if (bfqq == NULL)
2778 -+ goto new_queue;
2779 -+
2780 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
2781 -+
2782 -+ /*
2783 -+ * If another queue has a request waiting within our mean seek
2784 -+ * distance, let it run. The expire code will check for close
2785 -+ * cooperators and put the close queue at the front of the
2786 -+ * service tree. If possible, merge the expiring queue with the
2787 -+ * new bfqq.
2788 -+ */
2789 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
2790 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
2791 -+ bfq_setup_merge(bfqq, new_bfqq);
2792 -+
2793 -+ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
2794 -+ if (budg_timeout &&
2795 -+ !bfq_bfqq_must_idle(bfqq, budg_timeout))
2796 -+ goto expire;
2797 -+
2798 -+ next_rq = bfqq->next_rq;
2799 -+ /*
2800 -+ * If bfqq has requests queued and it has enough budget left to
2801 -+ * serve them, keep the queue, otherwise expire it.
2802 -+ */
2803 -+ if (next_rq != NULL) {
2804 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
2805 -+ bfq_bfqq_budget_left(bfqq)) {
2806 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
2807 -+ goto expire;
2808 -+ } else {
2809 -+ /*
2810 -+ * The idle timer may be pending because we may not
2811 -+ * disable disk idling even when a new request arrives
2812 -+ */
2813 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
2814 -+ /*
2815 -+ * If we get here: 1) at least a new request
2816 -+ * has arrived but we have not disabled the
2817 -+ * timer because the request was too small,
2818 -+ * 2) then the block layer has unplugged the
2819 -+ * device, causing the dispatch to be invoked.
2820 -+ *
2821 -+ * Since the device is unplugged, now the
2822 -+ * requests are probably large enough to
2823 -+ * provide a reasonable throughput.
2824 -+ * So we disable idling.
2825 -+ */
2826 -+ bfq_clear_bfqq_wait_request(bfqq);
2827 -+ del_timer(&bfqd->idle_slice_timer);
2828 -+ }
2829 -+ if (new_bfqq == NULL)
2830 -+ goto keep_queue;
2831 -+ else
2832 -+ goto expire;
2833 -+ }
2834 -+ }
2835 -+
2836 -+ /*
2837 -+ * No requests pending. If there is no cooperator, and the active
2838 -+ * queue still has requests in flight or is idling for a new request,
2839 -+ * then keep it.
2840 -+ */
2841 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
2842 -+ (bfqq->dispatched != 0 &&
2843 -+ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
2844 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
2845 -+ bfqq = NULL;
2846 -+ goto keep_queue;
2847 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
2848 -+ /*
2849 -+ * Expiring the queue because there is a close cooperator,
2850 -+ * cancel timer.
2851 -+ */
2852 -+ bfq_clear_bfqq_wait_request(bfqq);
2853 -+ del_timer(&bfqd->idle_slice_timer);
2854 -+ }
2855 -+
2856 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
2857 -+expire:
2858 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
2859 -+new_queue:
2860 -+ bfqq = bfq_set_active_queue(bfqd, new_bfqq);
2861 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
2862 -+ bfqq != NULL ? bfqq->pid : 0);
2863 -+keep_queue:
2864 -+ return bfqq;
2865 -+}
2866 -+
2867 -+static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2868 -+{
2869 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
2870 -+ struct bfq_entity *entity = &bfqq->entity;
2871 -+
2872 -+ bfq_log_bfqq(bfqd, bfqq,
2873 -+ "raising period dur %u/%u msec, "
2874 -+ "old raising coeff %u, w %d(%d)",
2875 -+ jiffies_to_msecs(jiffies -
2876 -+ bfqq->last_rais_start_finish),
2877 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
2878 -+ bfqq->raising_coeff,
2879 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
2880 -+
2881 -+ BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
2882 -+ entity->orig_weight * bfqq->raising_coeff);
2883 -+		if (entity->ioprio_changed)
2884 -+ bfq_log_bfqq(bfqd, bfqq,
2885 -+ "WARN: pending prio change");
2886 -+ /*
2887 -+ * If too much time has elapsed from the beginning
2888 -+ * of this weight-raising period and process is not soft
2889 -+		 * of this weight-raising period and the process is not soft
2890 -+		 * real-time, stop it.
2891 -+ if (jiffies - bfqq->last_rais_start_finish >
2892 -+ bfqq->raising_cur_max_time) {
2893 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
2894 -+ bfqq->soft_rt_next_start < jiffies;
2895 -+
2896 -+ bfqq->last_rais_start_finish = jiffies;
2897 -+ if (soft_rt)
2898 -+ bfqq->raising_cur_max_time =
2899 -+ bfqd->bfq_raising_rt_max_time;
2900 -+ else {
2901 -+ bfq_log_bfqq(bfqd, bfqq,
2902 -+					"wrais ending at %llu msec, "
2903 -+ "rais_max_time %u",
2904 -+ bfqq->last_rais_start_finish,
2905 -+ jiffies_to_msecs(bfqq->
2906 -+ raising_cur_max_time));
2907 -+ bfq_bfqq_end_raising(bfqq);
2908 -+ __bfq_entity_update_weight_prio(
2909 -+ bfq_entity_service_tree(entity),
2910 -+ entity);
2911 -+ }
2912 -+ }
2913 -+ }
2914 -+}
2915 -+
2916 -+/*
2917 -+ * Dispatch one request from bfqq, moving it to the request queue
2918 -+ * dispatch list.
2919 -+ */
2920 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
2921 -+ struct bfq_queue *bfqq)
2922 -+{
2923 -+ int dispatched = 0;
2924 -+ struct request *rq;
2925 -+ unsigned long service_to_charge;
2926 -+
2927 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
2928 -+
2929 -+ /* Follow expired path, else get first next available. */
2930 -+ rq = bfq_check_fifo(bfqq);
2931 -+ if (rq == NULL)
2932 -+ rq = bfqq->next_rq;
2933 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
2934 -+
2935 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
2936 -+ /*
2937 -+ * This may happen if the next rq is chosen
2938 -+ * in fifo order instead of sector order.
2939 -+ * The budget is properly dimensioned
2940 -+ * to be always sufficient to serve the next request
2941 -+ * only if it is chosen in sector order. The reason is
2942 -+		 * that it would be quite inefficient and of little use
2943 -+ * to always make sure that the budget is large enough
2944 -+ * to serve even the possible next rq in fifo order.
2945 -+ * In fact, requests are seldom served in fifo order.
2946 -+ *
2947 -+ * Expire the queue for budget exhaustion, and
2948 -+ * make sure that the next act_budget is enough
2949 -+ * to serve the next request, even if it comes
2950 -+ * from the fifo expired path.
2951 -+ */
2952 -+ bfqq->next_rq = rq;
2953 -+ /*
2954 -+		 * Since this dispatch failed, make sure that
2955 -+		 * a new one will be performed.
2956 -+ */
2957 -+ if (!bfqd->rq_in_driver)
2958 -+ bfq_schedule_dispatch(bfqd);
2959 -+ goto expire;
2960 -+ }
2961 -+
2962 -+ /* Finally, insert request into driver dispatch list. */
2963 -+ bfq_bfqq_served(bfqq, service_to_charge);
2964 -+ bfq_dispatch_insert(bfqd->queue, rq);
2965 -+
2966 -+ update_raising_data(bfqd, bfqq);
2967 -+
2968 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
2969 -+ "budg left %lu",
2970 -+ blk_rq_sectors(rq),
2971 -+ (long long unsigned)blk_rq_pos(rq),
2972 -+ bfq_bfqq_budget_left(bfqq));
2973 -+
2974 -+ dispatched++;
2975 -+
2976 -+ if (bfqd->active_bic == NULL) {
2977 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
2978 -+ bfqd->active_bic = RQ_BIC(rq);
2979 -+ }
2980 -+
2981 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
2982 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
2983 -+ bfq_class_idle(bfqq)))
2984 -+ goto expire;
2985 -+
2986 -+ return dispatched;
2987 -+
2988 -+expire:
2989 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
2990 -+ return dispatched;
2991 -+}
2992 -+
2993 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
2994 -+{
2995 -+ int dispatched = 0;
2996 -+
2997 -+ while (bfqq->next_rq != NULL) {
2998 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
2999 -+ dispatched++;
3000 -+ }
3001 -+
3002 -+ BUG_ON(!list_empty(&bfqq->fifo));
3003 -+ return dispatched;
3004 -+}
3005 -+
3006 -+/*
3007 -+ * Drain our current requests. Used for barriers and when switching
3008 -+ * io schedulers on-the-fly.
3009 -+ */
3010 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
3011 -+{
3012 -+ struct bfq_queue *bfqq, *n;
3013 -+ struct bfq_service_tree *st;
3014 -+ int dispatched = 0;
3015 -+
3016 -+ bfqq = bfqd->active_queue;
3017 -+ if (bfqq != NULL)
3018 -+ __bfq_bfqq_expire(bfqd, bfqq);
3019 -+
3020 -+ /*
3021 -+ * Loop through classes, and be careful to leave the scheduler
3022 -+ * in a consistent state, as feedback mechanisms and vtime
3023 -+ * updates cannot be disabled during the process.
3024 -+ */
3025 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3026 -+ st = bfq_entity_service_tree(&bfqq->entity);
3027 -+
3028 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3029 -+ bfqq->max_budget = bfq_max_budget(bfqd);
3030 -+
3031 -+ bfq_forget_idle(st);
3032 -+ }
3033 -+
3034 -+ BUG_ON(bfqd->busy_queues != 0);
3035 -+
3036 -+ return dispatched;
3037 -+}
3038 -+
3039 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
3040 -+{
3041 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3042 -+ struct bfq_queue *bfqq;
3043 -+ int max_dispatch;
3044 -+
3045 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3046 -+ if (bfqd->busy_queues == 0)
3047 -+ return 0;
3048 -+
3049 -+ if (unlikely(force))
3050 -+ return bfq_forced_dispatch(bfqd);
3051 -+
3052 -+	if ((bfqq = bfq_select_queue(bfqd)) == NULL)
3053 -+ return 0;
3054 -+
3055 -+ max_dispatch = bfqd->bfq_quantum;
3056 -+ if (bfq_class_idle(bfqq))
3057 -+ max_dispatch = 1;
3058 -+
3059 -+ if (!bfq_bfqq_sync(bfqq))
3060 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
3061 -+
3062 -+ if (bfqq->dispatched >= max_dispatch) {
3063 -+ if (bfqd->busy_queues > 1)
3064 -+ return 0;
3065 -+ if (bfqq->dispatched >= 4 * max_dispatch)
3066 -+ return 0;
3067 -+ }
3068 -+
3069 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3070 -+ return 0;
3071 -+
3072 -+ bfq_clear_bfqq_wait_request(bfqq);
3073 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3074 -+
3075 -+	if (!bfq_dispatch_request(bfqd, bfqq))
3076 -+ return 0;
3077 -+
3078 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d"
3079 -+			" (max_disp %d)", bfqq->pid, max_dispatch);
3080 -+
3081 -+ return 1;
3082 -+}
3083 -+
3084 -+/*
3085 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
3086 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
3087 -+ *
3088 -+ * Queue lock must be held here.
3089 -+ */
3090 -+static void bfq_put_queue(struct bfq_queue *bfqq)
3091 -+{
3092 -+ struct bfq_data *bfqd = bfqq->bfqd;
3093 -+
3094 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3095 -+
3096 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3097 -+ atomic_read(&bfqq->ref));
3098 -+ if (!atomic_dec_and_test(&bfqq->ref))
3099 -+ return;
3100 -+
3101 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3102 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3103 -+ BUG_ON(bfqq->entity.tree != NULL);
3104 -+ BUG_ON(bfq_bfqq_busy(bfqq));
3105 -+ BUG_ON(bfqd->active_queue == bfqq);
3106 -+
3107 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3108 -+
3109 -+ kmem_cache_free(bfq_pool, bfqq);
3110 -+}
3111 -+
3112 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
3113 -+{
3114 -+ struct bfq_queue *__bfqq, *next;
3115 -+
3116 -+ /*
3117 -+ * If this queue was scheduled to merge with another queue, be
3118 -+ * sure to drop the reference taken on that queue (and others in
3119 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3120 -+ */
3121 -+ __bfqq = bfqq->new_bfqq;
3122 -+ while (__bfqq) {
3123 -+ if (__bfqq == bfqq) {
3124 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
3125 -+ break;
3126 -+ }
3127 -+ next = __bfqq->new_bfqq;
3128 -+ bfq_put_queue(__bfqq);
3129 -+ __bfqq = next;
3130 -+ }
3131 -+}
3132 -+
3133 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3134 -+{
3135 -+ if (bfqq == bfqd->active_queue) {
3136 -+ __bfq_bfqq_expire(bfqd, bfqq);
3137 -+ bfq_schedule_dispatch(bfqd);
3138 -+ }
3139 -+
3140 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3141 -+ atomic_read(&bfqq->ref));
3142 -+
3143 -+ bfq_put_cooperator(bfqq);
3144 -+
3145 -+ bfq_put_queue(bfqq);
3146 -+}
3147 -+
3148 -+static void bfq_init_icq(struct io_cq *icq)
3149 -+{
3150 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3151 -+
3152 -+ bic->ttime.last_end_request = jiffies;
3153 -+}
3154 -+
3155 -+static void bfq_exit_icq(struct io_cq *icq)
3156 -+{
3157 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3158 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
3159 -+
3160 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
3161 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3162 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
3163 -+ }
3164 -+
3165 -+ if (bic->bfqq[BLK_RW_SYNC]) {
3166 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3167 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
3168 -+ }
3169 -+}
3170 -+
3171 -+/*
3172 -+ * Update the entity prio values; note that the new values will not
3173 -+ * be used until the next (re)activation.
3174 -+ */
3175 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3176 -+{
3177 -+ struct task_struct *tsk = current;
3178 -+ int ioprio_class;
3179 -+
3180 -+ if (!bfq_bfqq_prio_changed(bfqq))
3181 -+ return;
3182 -+
3183 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3184 -+ switch (ioprio_class) {
3185 -+ default:
3186 -+ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
3187 -+ case IOPRIO_CLASS_NONE:
3188 -+ /*
3189 -+ * No prio set, inherit CPU scheduling settings.
3190 -+ */
3191 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3192 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3193 -+ break;
3194 -+ case IOPRIO_CLASS_RT:
3195 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3196 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3197 -+ break;
3198 -+ case IOPRIO_CLASS_BE:
3199 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3200 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3201 -+ break;
3202 -+ case IOPRIO_CLASS_IDLE:
3203 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3204 -+ bfqq->entity.new_ioprio = 7;
3205 -+ bfq_clear_bfqq_idle_window(bfqq);
3206 -+ break;
3207 -+ }
3208 -+
3209 -+ bfqq->entity.ioprio_changed = 1;
3210 -+
3211 -+ /*
3212 -+ * Keep track of original prio settings in case we have to temporarily
3213 -+ * elevate the priority of this queue.
3214 -+ */
3215 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3216 -+ bfq_clear_bfqq_prio_changed(bfqq);
3217 -+}
3218 -+
3219 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3220 -+{
3221 -+ struct bfq_data *bfqd;
3222 -+ struct bfq_queue *bfqq, *new_bfqq;
3223 -+ struct bfq_group *bfqg;
3224 -+ unsigned long uninitialized_var(flags);
3225 -+ int ioprio = bic->icq.ioc->ioprio;
3226 -+
3227 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
3228 -+ /*
3229 -+ * This condition may trigger on a newly created bic; be sure to drop the
3230 -+ * lock before returning.
3231 -+ */
3232 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3233 -+ goto out;
3234 -+
3235 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
3236 -+ if (bfqq != NULL) {
3237 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3238 -+ sched_data);
3239 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3240 -+ GFP_ATOMIC);
3241 -+ if (new_bfqq != NULL) {
3242 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3243 -+ bfq_log_bfqq(bfqd, bfqq,
3244 -+ "changed_ioprio: bfqq %p %d",
3245 -+ bfqq, atomic_read(&bfqq->ref));
3246 -+ bfq_put_queue(bfqq);
3247 -+ }
3248 -+ }
3249 -+
3250 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
3251 -+ if (bfqq != NULL)
3252 -+ bfq_mark_bfqq_prio_changed(bfqq);
3253 -+
3254 -+ bic->ioprio = ioprio;
3255 -+
3256 -+out:
3257 -+ bfq_put_bfqd_unlock(bfqd, &flags);
3258 -+}
3259 -+
3260 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3261 -+ pid_t pid, int is_sync)
3262 -+{
3263 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3264 -+ INIT_LIST_HEAD(&bfqq->fifo);
3265 -+
3266 -+ atomic_set(&bfqq->ref, 0);
3267 -+ bfqq->bfqd = bfqd;
3268 -+
3269 -+ bfq_mark_bfqq_prio_changed(bfqq);
3270 -+
3271 -+ if (is_sync) {
3272 -+ if (!bfq_class_idle(bfqq))
3273 -+ bfq_mark_bfqq_idle_window(bfqq);
3274 -+ bfq_mark_bfqq_sync(bfqq);
3275 -+ }
3276 -+
3277 -+ /* Tentative initial value to trade off between throughput and latency */
3278 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3279 -+ bfqq->pid = pid;
3280 -+
3281 -+ bfqq->raising_coeff = 1;
3282 -+ bfqq->last_rais_start_finish = 0;
3283 -+ bfqq->soft_rt_next_start = -1;
3284 -+}
3285 -+
3286 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3287 -+ struct bfq_group *bfqg,
3288 -+ int is_sync,
3289 -+ struct bfq_io_cq *bic,
3290 -+ gfp_t gfp_mask)
3291 -+{
3292 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3293 -+
3294 -+retry:
3295 -+ /* bic always exists here */
3296 -+ bfqq = bic_to_bfqq(bic, is_sync);
3297 -+
3298 -+ /*
3299 -+ * Always try a new alloc if we fall back to the OOM bfqq
3300 -+ * originally, since it should just be a temporary situation.
3301 -+ */
3302 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3303 -+ bfqq = NULL;
3304 -+ if (new_bfqq != NULL) {
3305 -+ bfqq = new_bfqq;
3306 -+ new_bfqq = NULL;
3307 -+ } else if (gfp_mask & __GFP_WAIT) {
3308 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3309 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3310 -+ gfp_mask | __GFP_ZERO,
3311 -+ bfqd->queue->node);
3312 -+ spin_lock_irq(bfqd->queue->queue_lock);
3313 -+ if (new_bfqq != NULL)
3314 -+ goto retry;
3315 -+ } else {
3316 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
3317 -+ gfp_mask | __GFP_ZERO,
3318 -+ bfqd->queue->node);
3319 -+ }
3320 -+
3321 -+ if (bfqq != NULL) {
3322 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3323 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
3324 -+ } else {
3325 -+ bfqq = &bfqd->oom_bfqq;
3326 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3327 -+ }
3328 -+
3329 -+ bfq_init_prio_data(bfqq, bic);
3330 -+ bfq_init_entity(&bfqq->entity, bfqg);
3331 -+ }
3332 -+
3333 -+ if (new_bfqq != NULL)
3334 -+ kmem_cache_free(bfq_pool, new_bfqq);
3335 -+
3336 -+ return bfqq;
3337 -+}
3338 -+
3339 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3340 -+ struct bfq_group *bfqg,
3341 -+ int ioprio_class, int ioprio)
3342 -+{
3343 -+ switch (ioprio_class) {
3344 -+ case IOPRIO_CLASS_RT:
3345 -+ return &bfqg->async_bfqq[0][ioprio];
3346 -+ case IOPRIO_CLASS_NONE:
3347 -+ ioprio = IOPRIO_NORM;
3348 -+ /* fall through */
3349 -+ case IOPRIO_CLASS_BE:
3350 -+ return &bfqg->async_bfqq[1][ioprio];
3351 -+ case IOPRIO_CLASS_IDLE:
3352 -+ return &bfqg->async_idle_bfqq;
3353 -+ default:
3354 -+ BUG();
3355 -+ }
3356 -+}
3357 -+
3358 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3359 -+ struct bfq_group *bfqg, int is_sync,
3360 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
3361 -+{
3362 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3363 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3364 -+ struct bfq_queue **async_bfqq = NULL;
3365 -+ struct bfq_queue *bfqq = NULL;
3366 -+
3367 -+ if (!is_sync) {
3368 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3369 -+ ioprio);
3370 -+ bfqq = *async_bfqq;
3371 -+ }
3372 -+
3373 -+ if (bfqq == NULL)
3374 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3375 -+
3376 -+ /*
3377 -+ * Pin the queue now that it's allocated, scheduler exit will prune it.
3378 -+ */
3379 -+ if (!is_sync && *async_bfqq == NULL) {
3380 -+ atomic_inc(&bfqq->ref);
3381 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3382 -+ bfqq, atomic_read(&bfqq->ref));
3383 -+ *async_bfqq = bfqq;
3384 -+ }
3385 -+
3386 -+ atomic_inc(&bfqq->ref);
3387 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3388 -+ atomic_read(&bfqq->ref));
3389 -+ return bfqq;
3390 -+}
3391 -+
3392 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3393 -+ struct bfq_io_cq *bic)
3394 -+{
3395 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3396 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3397 -+
3398 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3399 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3400 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
3401 -+}
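
The three assignments above keep a fixed-point running average of the think time: each update retains 7/8 of the previous value and folds in 1/8 of the new sample, with everything scaled by 256 so the final integer division in ttime_mean stays precise. A minimal userspace sketch of the same decay rule (the ewma names and the sample values are made up for illustration, not part of the patch):

    #include <stdio.h>

    /* Fixed-point 7/8-decay average, mirroring the ttime bookkeeping above. */
    struct ewma {
            unsigned long samples;  /* converges towards 256 */
            unsigned long total;    /* running sum, scaled by 256 */
    };

    static void ewma_add(struct ewma *e, unsigned long sample)
    {
            e->samples = (7 * e->samples + 256) / 8;
            e->total = (7 * e->total + 256 * sample) / 8;
    }

    static unsigned long ewma_mean(const struct ewma *e)
    {
            return (e->total + 128) / e->samples;   /* rounded division */
    }

    int main(void)
    {
            struct ewma e = { 0, 0 };
            unsigned long v[] = { 4, 4, 4, 40 };    /* think times, e.g. in jiffies */
            int i;

            for (i = 0; i < 4; i++) {
                    ewma_add(&e, v[i]);
                    printf("sample %lu -> mean %lu\n", v[i], ewma_mean(&e));
            }
            return 0;
    }

One large outlier moves the mean only gradually, which helps explain why a single slow request does not immediately flip the idle-window decision further below.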
3402 -+
3403 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
3404 -+ struct bfq_queue *bfqq,
3405 -+ struct request *rq)
3406 -+{
3407 -+ sector_t sdist;
3408 -+ u64 total;
3409 -+
3410 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
3411 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3412 -+ else
3413 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3414 -+
3415 -+ /*
3416 -+ * Don't allow the seek distance to get too large from the
3417 -+ * odd fragment, pagein, etc.
3418 -+ */
3419 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3420 -+ sdist = 0;
3421 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
3422 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3423 -+ else
3424 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3425 -+
3426 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3427 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3428 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
3429 -+ do_div(total, bfqq->seek_samples);
3430 -+ if (bfq_bfqq_coop(bfqq)) {
3431 -+ /*
3432 -+ * If the mean seektime increases for a (non-seeky) shared
3433 -+ * queue, some cooperator is likely to be idling too much.
3434 -+ * On the contrary, if it decreases, some cooperator has
3435 -+ * probably woken up.
3436 -+ *
3437 -+ */
3438 -+ if ((sector_t)total < bfqq->seek_mean)
3439 -+ bfq_mark_bfqq_some_coop_idle(bfqq);
3440 -+ else if ((sector_t)total > bfqq->seek_mean)
3441 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
3442 -+ }
3443 -+ bfqq->seek_mean = (sector_t)total;
3444 -+
3445 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3446 -+ (u64)bfqq->seek_mean);
3447 -+}
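
Before the seek distance enters the same kind of 7/8 average, the code above caps each raw sample so one odd fragment or pagein cannot inflate the mean. A standalone sketch of just that cap (the helper names are invented; the constants are the ones visible above):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t sector_t;

    static sector_t min_sectors(sector_t a, sector_t b)
    {
            return a < b ? a : b;
    }

    /* Cap a raw seek-distance sample depending on how many samples exist. */
    static sector_t clamp_seek_sample(sector_t sdist, unsigned int samples,
                                      sector_t mean)
    {
            if (samples == 0)               /* first request, not really a seek */
                    return 0;
            if (samples <= 60)              /* early samples get a generous cap */
                    return min_sectors(sdist, mean * 4 + 2 * 1024 * 1024);
            return min_sectors(sdist, mean * 4 + 2 * 1024 * 64);
    }

    int main(void)
    {
            /* A 10M-sector jump right after the first sample is capped near 2M. */
            printf("%llu\n", (unsigned long long)
                   clamp_seek_sample(10ULL * 1024 * 1024, 1, 128));
            return 0;
    }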
3448 -+
3449 -+/*
3450 -+ * Disable idle window if the process thinks too long or seeks so much that
3451 -+ * it doesn't matter.
3452 -+ */
3453 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
3454 -+ struct bfq_queue *bfqq,
3455 -+ struct bfq_io_cq *bic)
3456 -+{
3457 -+ int enable_idle;
3458 -+
3459 -+ /* Don't idle for async or idle io prio class. */
3460 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3461 -+ return;
3462 -+
3463 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
3464 -+
3465 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3466 -+ bfqd->bfq_slice_idle == 0 ||
3467 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3468 -+ bfqq->raising_coeff == 1))
3469 -+ enable_idle = 0;
3470 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3471 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3472 -+ bfqq->raising_coeff == 1)
3473 -+ enable_idle = 0;
3474 -+ else
3475 -+ enable_idle = 1;
3476 -+ }
3477 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3478 -+ enable_idle);
3479 -+
3480 -+ if (enable_idle)
3481 -+ bfq_mark_bfqq_idle_window(bfqq);
3482 -+ else
3483 -+ bfq_clear_bfqq_idle_window(bfqq);
3484 -+}
3485 -+
3486 -+/*
3487 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
3488 -+ * something we should do about it.
3489 -+ */
3490 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3491 -+ struct request *rq)
3492 -+{
3493 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
3494 -+
3495 -+ if (rq->cmd_flags & REQ_META)
3496 -+ bfqq->meta_pending++;
3497 -+
3498 -+ bfq_update_io_thinktime(bfqd, bic);
3499 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
3500 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3501 -+ !BFQQ_SEEKY(bfqq))
3502 -+ bfq_update_idle_window(bfqd, bfqq, bic);
3503 -+
3504 -+ bfq_log_bfqq(bfqd, bfqq,
3505 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3506 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3507 -+ (long long unsigned)bfqq->seek_mean);
3508 -+
3509 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3510 -+
3511 -+ if (bfqq == bfqd->active_queue) {
3512 -+ /*
3513 -+ * If there is just this request queued and the request
3514 -+ * is small, just exit.
3515 -+ * In this way, if the disk is being idled to wait for a new
3516 -+ * request from the active queue, we avoid unplugging the
3517 -+ * device now.
3518 -+ *
3519 -+ * By doing so, we spare the disk from being committed
3520 -+ * to serve just a small request. On the contrary, we wait for
3521 -+ * the block layer to decide when to unplug the device:
3522 -+ * hopefully, new requests will be merged to this
3523 -+ * one quickly, then the device will be unplugged
3524 -+ * and larger requests will be dispatched.
3525 -+ */
3526 -+ if (bfqq->queued[rq_is_sync(rq)] == 1 &&
3527 -+ blk_rq_sectors(rq) < 32) {
3528 -+ return;
3529 -+ }
3530 -+ if (bfq_bfqq_wait_request(bfqq)) {
3531 -+ /*
3532 -+ * If we are waiting for a request for this queue, let
3533 -+ * it rip immediately and flag that we must not expire
3534 -+ * this queue just now.
3535 -+ */
3536 -+ bfq_clear_bfqq_wait_request(bfqq);
3537 -+ del_timer(&bfqd->idle_slice_timer);
3538 -+ /*
3539 -+ * Here we can safely expire the queue, in
3540 -+ * case of budget timeout, without wasting
3541 -+ * guarantees
3542 -+ */
3543 -+ if (bfq_bfqq_budget_timeout(bfqq))
3544 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
3545 -+ BFQ_BFQQ_BUDGET_TIMEOUT);
3546 -+ __blk_run_queue(bfqd->queue);
3547 -+ }
3548 -+ }
3549 -+}
3550 -+
3551 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
3552 -+{
3553 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3554 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3555 -+
3556 -+ assert_spin_locked(bfqd->queue->queue_lock);
3557 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3558 -+
3559 -+ bfq_add_rq_rb(rq);
3560 -+
3561 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3562 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
3563 -+
3564 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
3565 -+}
3566 -+
3567 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
3568 -+{
3569 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3570 -+ bfqd->rq_in_driver);
3571 -+
3572 -+ if (bfqd->hw_tag == 1)
3573 -+ return;
3574 -+
3575 -+ /*
3576 -+ * This sample is valid if the number of outstanding requests
3577 -+ * is large enough to allow a queueing behavior. Note that the
3578 -+ * sum is not exact, as it's not taking into account deactivated
3579 -+ * requests.
3580 -+ */
3581 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3582 -+ return;
3583 -+
3584 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3585 -+ return;
3586 -+
3587 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3588 -+ bfqd->max_rq_in_driver = 0;
3589 -+ bfqd->hw_tag_samples = 0;
3590 -+}
3591 -+
3592 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
3593 -+{
3594 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3595 -+ struct bfq_data *bfqd = bfqq->bfqd;
3596 -+ const int sync = rq_is_sync(rq);
3597 -+
3598 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3599 -+ blk_rq_sectors(rq), sync);
3600 -+
3601 -+ bfq_update_hw_tag(bfqd);
3602 -+
3603 -+ WARN_ON(!bfqd->rq_in_driver);
3604 -+ WARN_ON(!bfqq->dispatched);
3605 -+ bfqd->rq_in_driver--;
3606 -+ bfqq->dispatched--;
3607 -+
3608 -+ if (bfq_bfqq_sync(bfqq))
3609 -+ bfqd->sync_flight--;
3610 -+
3611 -+ if (sync)
3612 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3613 -+
3614 -+ /*
3615 -+ * If this is the active queue, check if it needs to be expired,
3616 -+ * or if we want to idle in case it has no pending requests.
3617 -+ */
3618 -+ if (bfqd->active_queue == bfqq) {
3619 -+ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
3620 -+ if (bfq_bfqq_budget_new(bfqq))
3621 -+ bfq_set_budget_timeout(bfqd);
3622 -+
3623 -+ /* Idling is disabled also for cooperation issues:
3624 -+ * 1) there is a close cooperator for the queue, or
3625 -+ * 2) the queue is shared and some cooperator is likely
3626 -+ * to be idle (in this case, by not arming the idle timer,
3627 -+ * we try to slow down the queue, to prevent the zones
3628 -+ * of the disk accessed by the active cooperators from becoming
3629 -+ * too distant from the zone that will be accessed by the
3630 -+ * currently idle cooperators)
3631 -+ */
3632 -+ if (bfq_bfqq_must_idle(bfqq, budg_timeout))
3633 -+ bfq_arm_slice_timer(bfqd);
3634 -+ else if (budg_timeout)
3635 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3636 -+ }
3637 -+
3638 -+ if (!bfqd->rq_in_driver)
3639 -+ bfq_schedule_dispatch(bfqd);
3640 -+}
3641 -+
3642 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3643 -+{
3644 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3645 -+ bfq_clear_bfqq_must_alloc(bfqq);
3646 -+ return ELV_MQUEUE_MUST;
3647 -+ }
3648 -+
3649 -+ return ELV_MQUEUE_MAY;
3650 -+}
3651 -+
3652 -+static int bfq_may_queue(struct request_queue *q, int rw)
3653 -+{
3654 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3655 -+ struct task_struct *tsk = current;
3656 -+ struct bfq_io_cq *bic;
3657 -+ struct bfq_queue *bfqq;
3658 -+
3659 -+ /*
3660 -+ * Don't force setup of a queue from here, as a call to may_queue
3661 -+ * does not necessarily imply that a request actually will be queued.
3662 -+ * So just lookup a possibly existing queue, or return 'may queue'
3663 -+ * if that fails.
3664 -+ */
3665 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3666 -+ if (bic == NULL)
3667 -+ return ELV_MQUEUE_MAY;
3668 -+
3669 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3670 -+ if (bfqq != NULL) {
3671 -+ bfq_init_prio_data(bfqq, bic);
3672 -+
3673 -+ return __bfq_may_queue(bfqq);
3674 -+ }
3675 -+
3676 -+ return ELV_MQUEUE_MAY;
3677 -+}
3678 -+
3679 -+/*
3680 -+ * Queue lock held here.
3681 -+ */
3682 -+static void bfq_put_request(struct request *rq)
3683 -+{
3684 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3685 -+
3686 -+ if (bfqq != NULL) {
3687 -+ const int rw = rq_data_dir(rq);
3688 -+
3689 -+ BUG_ON(!bfqq->allocated[rw]);
3690 -+ bfqq->allocated[rw]--;
3691 -+
3692 -+ rq->elv.priv[0] = NULL;
3693 -+ rq->elv.priv[1] = NULL;
3694 -+
3695 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3696 -+ bfqq, atomic_read(&bfqq->ref));
3697 -+ bfq_put_queue(bfqq);
3698 -+ }
3699 -+}
3700 -+
3701 -+static struct bfq_queue *
3702 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3703 -+ struct bfq_queue *bfqq)
3704 -+{
3705 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3706 -+ (long unsigned)bfqq->new_bfqq->pid);
3707 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3708 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3709 -+ bfq_put_queue(bfqq);
3710 -+ return bic_to_bfqq(bic, 1);
3711 -+}
3712 -+
3713 -+/*
3714 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3715 -+ * was the last process referring to said bfqq.
3716 -+ */
3717 -+static struct bfq_queue *
3718 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3719 -+{
3720 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3721 -+ if (bfqq_process_refs(bfqq) == 1) {
3722 -+ bfqq->pid = current->pid;
3723 -+ bfq_clear_bfqq_some_coop_idle(bfqq);
3724 -+ bfq_clear_bfqq_coop(bfqq);
3725 -+ bfq_clear_bfqq_split_coop(bfqq);
3726 -+ return bfqq;
3727 -+ }
3728 -+
3729 -+ bic_set_bfqq(bic, NULL, 1);
3730 -+
3731 -+ bfq_put_cooperator(bfqq);
3732 -+
3733 -+ bfq_put_queue(bfqq);
3734 -+ return NULL;
3735 -+}
3736 -+
3737 -+/*
3738 -+ * Allocate bfq data structures associated with this request.
3739 -+ */
3740 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
3741 -+ struct bio *bio, gfp_t gfp_mask)
3742 -+{
3743 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3744 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3745 -+ const int rw = rq_data_dir(rq);
3746 -+ const int is_sync = rq_is_sync(rq);
3747 -+ struct bfq_queue *bfqq;
3748 -+ struct bfq_group *bfqg;
3749 -+ unsigned long flags;
3750 -+
3751 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
3752 -+
3753 -+ bfq_changed_ioprio(bic);
3754 -+
3755 -+ spin_lock_irqsave(q->queue_lock, flags);
3756 -+
3757 -+ if (bic == NULL)
3758 -+ goto queue_fail;
3759 -+
3760 -+ bfqg = bfq_bic_update_cgroup(bic);
3761 -+
3762 -+new_queue:
3763 -+ bfqq = bic_to_bfqq(bic, is_sync);
3764 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3765 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3766 -+ bic_set_bfqq(bic, bfqq, is_sync);
3767 -+ } else {
3768 -+ /*
3769 -+ * If the queue was seeky for too long, break it apart.
3770 -+ */
3771 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
3772 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
3773 -+ bfqq = bfq_split_bfqq(bic, bfqq);
3774 -+ if (!bfqq)
3775 -+ goto new_queue;
3776 -+ }
3777 -+
3778 -+ /*
3779 -+ * Check to see if this queue is scheduled to merge with
3780 -+ * another closely cooperating queue. The merging of queues
3781 -+ * happens here as it must be done in process context.
3782 -+ * The reference on new_bfqq was taken in merge_bfqqs.
3783 -+ */
3784 -+ if (bfqq->new_bfqq != NULL)
3785 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
3786 -+ }
3787 -+
3788 -+ bfqq->allocated[rw]++;
3789 -+ atomic_inc(&bfqq->ref);
3790 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
3791 -+ atomic_read(&bfqq->ref));
3792 -+
3793 -+ rq->elv.priv[0] = bic;
3794 -+ rq->elv.priv[1] = bfqq;
3795 -+
3796 -+ spin_unlock_irqrestore(q->queue_lock, flags);
3797 -+
3798 -+ return 0;
3799 -+
3800 -+queue_fail:
3801 -+ bfq_schedule_dispatch(bfqd);
3802 -+ spin_unlock_irqrestore(q->queue_lock, flags);
3803 -+
3804 -+ return 1;
3805 -+}
3806 -+
3807 -+static void bfq_kick_queue(struct work_struct *work)
3808 -+{
3809 -+ struct bfq_data *bfqd =
3810 -+ container_of(work, struct bfq_data, unplug_work);
3811 -+ struct request_queue *q = bfqd->queue;
3812 -+
3813 -+ spin_lock_irq(q->queue_lock);
3814 -+ __blk_run_queue(q);
3815 -+ spin_unlock_irq(q->queue_lock);
3816 -+}
3817 -+
3818 -+/*
3819 -+ * Handler of the expiration of the timer running if the active_queue
3820 -+ * is idling inside its time slice.
3821 -+ */
3822 -+static void bfq_idle_slice_timer(unsigned long data)
3823 -+{
3824 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
3825 -+ struct bfq_queue *bfqq;
3826 -+ unsigned long flags;
3827 -+ enum bfqq_expiration reason;
3828 -+
3829 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
3830 -+
3831 -+ bfqq = bfqd->active_queue;
3832 -+ /*
3833 -+ * Theoretical race here: active_queue can be NULL or different
3834 -+ * from the queue that was idling if the timer handler spins on
3835 -+ * the queue_lock and a new request arrives for the current
3836 -+ * queue and there is a full dispatch cycle that changes the
3837 -+ * active_queue. This can hardly happen, but in the worst case
3838 -+ * we just expire a queue too early.
3839 -+ */
3840 -+ if (bfqq != NULL) {
3841 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
3842 -+ if (bfq_bfqq_budget_timeout(bfqq))
3843 -+ /*
3844 -+ * Also here the queue can be safely expired
3845 -+ * for budget timeout without wasting
3846 -+ * guarantees
3847 -+ */
3848 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3849 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
3850 -+ /*
3851 -+ * The queue may not be empty upon timer expiration,
3852 -+ * because we may not disable the timer when the first
3853 -+ * request of the active queue arrives during
3854 -+ * disk idling
3855 -+ */
3856 -+ reason = BFQ_BFQQ_TOO_IDLE;
3857 -+ else
3858 -+ goto schedule_dispatch;
3859 -+
3860 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
3861 -+ }
3862 -+
3863 -+schedule_dispatch:
3864 -+ bfq_schedule_dispatch(bfqd);
3865 -+
3866 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
3867 -+}
3868 -+
3869 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
3870 -+{
3871 -+ del_timer_sync(&bfqd->idle_slice_timer);
3872 -+ cancel_work_sync(&bfqd->unplug_work);
3873 -+}
3874 -+
3875 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
3876 -+ struct bfq_queue **bfqq_ptr)
3877 -+{
3878 -+ struct bfq_group *root_group = bfqd->root_group;
3879 -+ struct bfq_queue *bfqq = *bfqq_ptr;
3880 -+
3881 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
3882 -+ if (bfqq != NULL) {
3883 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
3884 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
3885 -+ bfqq, atomic_read(&bfqq->ref));
3886 -+ bfq_put_queue(bfqq);
3887 -+ *bfqq_ptr = NULL;
3888 -+ }
3889 -+}
3890 -+
3891 -+/*
3892 -+ * Release all the bfqg references to its async queues. If we are
3893 -+ * deallocating the group these queues may still contain requests, so
3894 -+ * we reparent them to the root cgroup (i.e., the only one that will
3895 -+ * exist for sure until all the requests on a device are gone).
3896 -+ */
3897 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
3898 -+{
3899 -+ int i, j;
3900 -+
3901 -+ for (i = 0; i < 2; i++)
3902 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
3903 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
3904 -+
3905 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
3906 -+}
3907 -+
3908 -+static void bfq_exit_queue(struct elevator_queue *e)
3909 -+{
3910 -+ struct bfq_data *bfqd = e->elevator_data;
3911 -+ struct request_queue *q = bfqd->queue;
3912 -+ struct bfq_queue *bfqq, *n;
3913 -+
3914 -+ bfq_shutdown_timer_wq(bfqd);
3915 -+
3916 -+ spin_lock_irq(q->queue_lock);
3917 -+
3918 -+ BUG_ON(bfqd->active_queue != NULL);
3919 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
3920 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
3921 -+
3922 -+ bfq_disconnect_groups(bfqd);
3923 -+ spin_unlock_irq(q->queue_lock);
3924 -+
3925 -+ bfq_shutdown_timer_wq(bfqd);
3926 -+
3927 -+ synchronize_rcu();
3928 -+
3929 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3930 -+
3931 -+ bfq_free_root_group(bfqd);
3932 -+ kfree(bfqd);
3933 -+}
3934 -+
3935 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
3936 -+{
3937 -+ struct bfq_group *bfqg;
3938 -+ struct bfq_data *bfqd;
3939 -+ struct elevator_queue *eq;
3940 -+
3941 -+ eq = elevator_alloc(q, e);
3942 -+ if (eq == NULL)
3943 -+ return -ENOMEM;
3944 -+
3945 -+ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3946 -+ if (bfqd == NULL) {
3947 -+ kobject_put(&eq->kobj);
3948 -+ return -ENOMEM;
3949 -+ }
3950 -+ eq->elevator_data = bfqd;
3951 -+
3952 -+ /*
3953 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
3954 -+ * Grab a permanent reference to it, so that the normal code flow
3955 -+ * will not attempt to free it.
3956 -+ */
3957 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
3958 -+ atomic_inc(&bfqd->oom_bfqq.ref);
3959 -+
3960 -+ bfqd->queue = q;
3961 -+
3962 -+ spin_lock_irq(q->queue_lock);
3963 -+ q->elevator = eq;
3964 -+ spin_unlock_irq(q->queue_lock);
3965 -+
3966 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
3967 -+ if (bfqg == NULL) {
3968 -+ kfree(bfqd);
3969 -+ kobject_put(&eq->kobj);
3970 -+ return -ENOMEM;
3971 -+ }
3972 -+
3973 -+ bfqd->root_group = bfqg;
3974 -+
3975 -+ init_timer(&bfqd->idle_slice_timer);
3976 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
3977 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
3978 -+
3979 -+ bfqd->rq_pos_tree = RB_ROOT;
3980 -+
3981 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
3982 -+
3983 -+ INIT_LIST_HEAD(&bfqd->active_list);
3984 -+ INIT_LIST_HEAD(&bfqd->idle_list);
3985 -+
3986 -+ bfqd->hw_tag = -1;
3987 -+
3988 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
3989 -+
3990 -+ bfqd->bfq_quantum = bfq_quantum;
3991 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
3992 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
3993 -+ bfqd->bfq_back_max = bfq_back_max;
3994 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
3995 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
3996 -+ bfqd->bfq_class_idle_last_service = 0;
3997 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
3998 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
3999 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4000 -+
4001 -+ bfqd->low_latency = true;
4002 -+
4003 -+ bfqd->bfq_raising_coeff = 20;
4004 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4005 -+ bfqd->bfq_raising_max_time = 0;
4006 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4007 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4008 -+ bfqd->bfq_raising_max_softrt_rate = 7000;
4009 -+
4010 -+ /* Initially estimate the device's peak rate as the reference rate */
4011 -+ if (blk_queue_nonrot(bfqd->queue)) {
4012 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
4013 -+ bfqd->peak_rate = R_nonrot;
4014 -+ } else {
4015 -+ bfqd->RT_prod = R_rot * T_rot;
4016 -+ bfqd->peak_rate = R_rot;
4017 -+ }
4018 -+
4019 -+ return 0;
4020 -+}
4021 -+
4022 -+static void bfq_slab_kill(void)
4023 -+{
4024 -+ if (bfq_pool != NULL)
4025 -+ kmem_cache_destroy(bfq_pool);
4026 -+}
4027 -+
4028 -+static int __init bfq_slab_setup(void)
4029 -+{
4030 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4031 -+ if (bfq_pool == NULL)
4032 -+ return -ENOMEM;
4033 -+ return 0;
4034 -+}
4035 -+
4036 -+static ssize_t bfq_var_show(unsigned int var, char *page)
4037 -+{
4038 -+ return sprintf(page, "%d\n", var);
4039 -+}
4040 -+
4041 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4042 -+{
4043 -+ unsigned long new_val;
4044 -+ int ret = strict_strtoul(page, 10, &new_val);
4045 -+
4046 -+ if (ret == 0)
4047 -+ *var = new_val;
4048 -+
4049 -+ return count;
4050 -+}
4051 -+
4052 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4053 -+{
4054 -+ struct bfq_data *bfqd = e->elevator_data;
4055 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4056 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4057 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4058 -+}
4059 -+
4060 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4061 -+{
4062 -+ struct bfq_queue *bfqq;
4063 -+ struct bfq_data *bfqd = e->elevator_data;
4064 -+ ssize_t num_char = 0;
4065 -+
4066 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4067 -+ bfqd->queued);
4068 -+
4069 -+ spin_lock_irq(bfqd->queue->queue_lock);
4070 -+
4071 -+ num_char += sprintf(page + num_char, "Active:\n");
4072 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4073 -+ num_char += sprintf(page + num_char,
4074 -+ "pid%d: weight %hu, nr_queued %d %d,"
4075 -+ " dur %d/%u\n",
4076 -+ bfqq->pid,
4077 -+ bfqq->entity.weight,
4078 -+ bfqq->queued[0],
4079 -+ bfqq->queued[1],
4080 -+ jiffies_to_msecs(jiffies -
4081 -+ bfqq->last_rais_start_finish),
4082 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4083 -+ }
4084 -+
4085 -+ num_char += sprintf(page + num_char, "Idle:\n");
4086 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4087 -+ num_char += sprintf(page + num_char,
4088 -+ "pid%d: weight %hu, dur %d/%u\n",
4089 -+ bfqq->pid,
4090 -+ bfqq->entity.weight,
4091 -+ jiffies_to_msecs(jiffies -
4092 -+ bfqq->last_rais_start_finish),
4093 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4094 -+ }
4095 -+
4096 -+ spin_unlock_irq(bfqd->queue->queue_lock);
4097 -+
4098 -+ return num_char;
4099 -+}
4100 -+
4101 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4102 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4103 -+{ \
4104 -+ struct bfq_data *bfqd = e->elevator_data; \
4105 -+ unsigned int __data = __VAR; \
4106 -+ if (__CONV) \
4107 -+ __data = jiffies_to_msecs(__data); \
4108 -+ return bfq_var_show(__data, (page)); \
4109 -+}
4110 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4111 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4112 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4113 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4114 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4115 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4116 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4117 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4118 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4119 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4120 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4121 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4122 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4123 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4124 -+ 1);
4125 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4126 -+ bfqd->bfq_raising_min_inter_arr_async,
4127 -+ 1);
4128 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4129 -+ bfqd->bfq_raising_max_softrt_rate, 0);
4130 -+#undef SHOW_FUNCTION
4131 -+
4132 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4133 -+static ssize_t \
4134 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4135 -+{ \
4136 -+ struct bfq_data *bfqd = e->elevator_data; \
4137 -+ unsigned long uninitialized_var(__data); \
4138 -+ int ret = bfq_var_store(&__data, (page), count); \
4139 -+ if (__data < (MIN)) \
4140 -+ __data = (MIN); \
4141 -+ else if (__data > (MAX)) \
4142 -+ __data = (MAX); \
4143 -+ if (__CONV) \
4144 -+ *(__PTR) = msecs_to_jiffies(__data); \
4145 -+ else \
4146 -+ *(__PTR) = __data; \
4147 -+ return ret; \
4148 -+}
4149 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4150 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4151 -+ INT_MAX, 1);
4152 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4153 -+ INT_MAX, 1);
4154 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4155 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4156 -+ INT_MAX, 0);
4157 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4158 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4159 -+ 1, INT_MAX, 0);
4160 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4161 -+ INT_MAX, 1);
4162 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4163 -+ INT_MAX, 0);
4164 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4165 -+ INT_MAX, 1);
4166 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4167 -+ INT_MAX, 1);
4168 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
4169 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4170 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4171 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4172 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4173 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4174 -+#undef STORE_FUNCTION
4175 -+
4176 -+/* do nothing for the moment */
4177 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
4178 -+ const char *page, size_t count)
4179 -+{
4180 -+ return count;
4181 -+}
4182 -+
4183 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4184 -+{
4185 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4186 -+
4187 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4188 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4189 -+ else
4190 -+ return bfq_default_max_budget;
4191 -+}
4192 -+
4193 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4194 -+ const char *page, size_t count)
4195 -+{
4196 -+ struct bfq_data *bfqd = e->elevator_data;
4197 -+ unsigned long uninitialized_var(__data);
4198 -+ int ret = bfq_var_store(&__data, (page), count);
4199 -+
4200 -+ if (__data == 0)
4201 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4202 -+ else {
4203 -+ if (__data > INT_MAX)
4204 -+ __data = INT_MAX;
4205 -+ bfqd->bfq_max_budget = __data;
4206 -+ }
4207 -+
4208 -+ bfqd->bfq_user_max_budget = __data;
4209 -+
4210 -+ return ret;
4211 -+}
4212 -+
4213 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4214 -+ const char *page, size_t count)
4215 -+{
4216 -+ struct bfq_data *bfqd = e->elevator_data;
4217 -+ unsigned long uninitialized_var(__data);
4218 -+ int ret = bfq_var_store(&__data, (page), count);
4219 -+
4220 -+ if (__data < 1)
4221 -+ __data = 1;
4222 -+ else if (__data > INT_MAX)
4223 -+ __data = INT_MAX;
4224 -+
4225 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4226 -+ if (bfqd->bfq_user_max_budget == 0)
4227 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4228 -+
4229 -+ return ret;
4230 -+}
4231 -+
4232 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4233 -+ const char *page, size_t count)
4234 -+{
4235 -+ struct bfq_data *bfqd = e->elevator_data;
4236 -+ unsigned long uninitialized_var(__data);
4237 -+ int ret = bfq_var_store(&__data, (page), count);
4238 -+
4239 -+ if (__data > 1)
4240 -+ __data = 1;
4241 -+ if (__data == 0 && bfqd->low_latency != 0)
4242 -+ bfq_end_raising(bfqd);
4243 -+ bfqd->low_latency = __data;
4244 -+
4245 -+ return ret;
4246 -+}
4247 -+
4248 -+#define BFQ_ATTR(name) \
4249 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4250 -+
4251 -+static struct elv_fs_entry bfq_attrs[] = {
4252 -+ BFQ_ATTR(quantum),
4253 -+ BFQ_ATTR(fifo_expire_sync),
4254 -+ BFQ_ATTR(fifo_expire_async),
4255 -+ BFQ_ATTR(back_seek_max),
4256 -+ BFQ_ATTR(back_seek_penalty),
4257 -+ BFQ_ATTR(slice_idle),
4258 -+ BFQ_ATTR(max_budget),
4259 -+ BFQ_ATTR(max_budget_async_rq),
4260 -+ BFQ_ATTR(timeout_sync),
4261 -+ BFQ_ATTR(timeout_async),
4262 -+ BFQ_ATTR(low_latency),
4263 -+ BFQ_ATTR(raising_coeff),
4264 -+ BFQ_ATTR(raising_max_time),
4265 -+ BFQ_ATTR(raising_rt_max_time),
4266 -+ BFQ_ATTR(raising_min_idle_time),
4267 -+ BFQ_ATTR(raising_min_inter_arr_async),
4268 -+ BFQ_ATTR(raising_max_softrt_rate),
4269 -+ BFQ_ATTR(weights),
4270 -+ __ATTR_NULL
4271 -+};
4272 -+
4273 -+static struct elevator_type iosched_bfq = {
4274 -+ .ops = {
4275 -+ .elevator_merge_fn = bfq_merge,
4276 -+ .elevator_merged_fn = bfq_merged_request,
4277 -+ .elevator_merge_req_fn = bfq_merged_requests,
4278 -+ .elevator_allow_merge_fn = bfq_allow_merge,
4279 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
4280 -+ .elevator_add_req_fn = bfq_insert_request,
4281 -+ .elevator_activate_req_fn = bfq_activate_request,
4282 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
4283 -+ .elevator_completed_req_fn = bfq_completed_request,
4284 -+ .elevator_former_req_fn = elv_rb_former_request,
4285 -+ .elevator_latter_req_fn = elv_rb_latter_request,
4286 -+ .elevator_init_icq_fn = bfq_init_icq,
4287 -+ .elevator_exit_icq_fn = bfq_exit_icq,
4288 -+ .elevator_set_req_fn = bfq_set_request,
4289 -+ .elevator_put_req_fn = bfq_put_request,
4290 -+ .elevator_may_queue_fn = bfq_may_queue,
4291 -+ .elevator_init_fn = bfq_init_queue,
4292 -+ .elevator_exit_fn = bfq_exit_queue,
4293 -+ },
4294 -+ .icq_size = sizeof(struct bfq_io_cq),
4295 -+ .icq_align = __alignof__(struct bfq_io_cq),
4296 -+ .elevator_attrs = bfq_attrs,
4297 -+ .elevator_name = "bfq",
4298 -+ .elevator_owner = THIS_MODULE,
4299 -+};
4300 -+
4301 -+static int __init bfq_init(void)
4302 -+{
4303 -+ /*
4304 -+ * Can be 0 on HZ < 1000 setups.
4305 -+ */
4306 -+ if (bfq_slice_idle == 0)
4307 -+ bfq_slice_idle = 1;
4308 -+
4309 -+ if (bfq_timeout_async == 0)
4310 -+ bfq_timeout_async = 1;
4311 -+
4312 -+ if (bfq_slab_setup())
4313 -+ return -ENOMEM;
4314 -+
4315 -+ elv_register(&iosched_bfq);
4316 -+
4317 -+ return 0;
4318 -+}
4319 -+
4320 -+static void __exit bfq_exit(void)
4321 -+{
4322 -+ elv_unregister(&iosched_bfq);
4323 -+ bfq_slab_kill();
4324 -+}
4325 -+
4326 -+module_init(bfq_init);
4327 -+module_exit(bfq_exit);
4328 -+
4329 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4330 -+MODULE_LICENSE("GPL");
4331 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4332 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4333 -new file mode 100644
4334 -index 0000000..03f8061
4335 ---- /dev/null
4336 -+++ b/block/bfq-sched.c
4337 -@@ -0,0 +1,1072 @@
4338 -+/*
4339 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
4340 -+ *
4341 -+ * Based on ideas and code from CFQ:
4342 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4343 -+ *
4344 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4345 -+ * Paolo Valente <paolo.valente@×××××××.it>
4346 -+ *
4347 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4348 -+ */
4349 -+
4350 -+#ifdef CONFIG_CGROUP_BFQIO
4351 -+#define for_each_entity(entity) \
4352 -+ for (; entity != NULL; entity = entity->parent)
4353 -+
4354 -+#define for_each_entity_safe(entity, parent) \
4355 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4356 -+
4357 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4358 -+ int extract,
4359 -+ struct bfq_data *bfqd);
4360 -+
4361 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4362 -+{
4363 -+ struct bfq_entity *bfqg_entity;
4364 -+ struct bfq_group *bfqg;
4365 -+ struct bfq_sched_data *group_sd;
4366 -+
4367 -+ BUG_ON(next_active == NULL);
4368 -+
4369 -+ group_sd = next_active->sched_data;
4370 -+
4371 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4372 -+ /*
4373 -+ * bfq_group's my_entity field is not NULL only if the group
4374 -+ * is not the root group. We must not touch the root entity
4375 -+ * as it must never become an active entity.
4376 -+ */
4377 -+ bfqg_entity = bfqg->my_entity;
4378 -+ if (bfqg_entity != NULL)
4379 -+ bfqg_entity->budget = next_active->budget;
4380 -+}
4381 -+
4382 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
4383 -+{
4384 -+ struct bfq_entity *next_active;
4385 -+
4386 -+ if (sd->active_entity != NULL)
4387 -+ /* will update/requeue at the end of service */
4388 -+ return 0;
4389 -+
4390 -+ /*
4391 -+ * NOTE: this can be improved in many ways, such as returning
4392 -+ * 1 (and thus propagating upwards the update) only when the
4393 -+ * budget changes, or caching the bfqq that will be scheduled
4394 -+ * next from this subtree. For now we worry more about
4395 -+ * correctness than about performance...
4396 -+ */
4397 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4398 -+ sd->next_active = next_active;
4399 -+
4400 -+ if (next_active != NULL)
4401 -+ bfq_update_budget(next_active);
4402 -+
4403 -+ return 1;
4404 -+}
4405 -+
4406 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4407 -+ struct bfq_entity *entity)
4408 -+{
4409 -+ BUG_ON(sd->next_active != entity);
4410 -+}
4411 -+#else
4412 -+#define for_each_entity(entity) \
4413 -+ for (; entity != NULL; entity = NULL)
4414 -+
4415 -+#define for_each_entity_safe(entity, parent) \
4416 -+ for (parent = NULL; entity != NULL; entity = parent)
4417 -+
4418 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4419 -+{
4420 -+ return 0;
4421 -+}
4422 -+
4423 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4424 -+ struct bfq_entity *entity)
4425 -+{
4426 -+}
4427 -+
4428 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4429 -+{
4430 -+}
4431 -+#endif
4432 -+
4433 -+/*
4434 -+ * Shift for timestamp calculations. This actually limits the maximum
4435 -+ * service allowed in one timestamp delta (small shift values increase it),
4436 -+ * the maximum total weight that can be used for the queues in the system
4437 -+ * (big shift values increase it), and the period of virtual time wraparounds.
4438 -+ */
4439 -+#define WFQ_SERVICE_SHIFT 22
4440 -+
4441 -+/**
4442 -+ * bfq_gt - compare two timestamps.
4443 -+ * @a: first ts.
4444 -+ * @b: second ts.
4445 -+ *
4446 -+ * Return @a > @b, dealing with wrapping correctly.
4447 -+ */
4448 -+static inline int bfq_gt(u64 a, u64 b)
4449 -+{
4450 -+ return (s64)(a - b) > 0;
4451 -+}
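
The cast to s64 is what makes this comparison safe across wraparound of the unsigned timestamps: as long as two values are less than 2^63 apart, the sign of their difference still says which one is logically later, even after the counter has wrapped. A tiny userspace check of that property (ts_gt is just an illustrative stand-in for bfq_gt):

    #include <stdio.h>
    #include <stdint.h>

    static int ts_gt(uint64_t a, uint64_t b)
    {
            return (int64_t)(a - b) > 0;
    }

    int main(void)
    {
            uint64_t before_wrap = UINT64_MAX - 5;  /* about to wrap */
            uint64_t after_wrap = 10;               /* already wrapped */

            /* A plain '>' gets this wrong; the signed difference does not. */
            printf("plain  : %d\n", after_wrap > before_wrap);       /* 0 */
            printf("ts_gt  : %d\n", ts_gt(after_wrap, before_wrap)); /* 1 */
            return 0;
    }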
4452 -+
4453 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4454 -+{
4455 -+ struct bfq_queue *bfqq = NULL;
4456 -+
4457 -+ BUG_ON(entity == NULL);
4458 -+
4459 -+ if (entity->my_sched_data == NULL)
4460 -+ bfqq = container_of(entity, struct bfq_queue, entity);
4461 -+
4462 -+ return bfqq;
4463 -+}
4464 -+
4465 -+
4466 -+/**
4467 -+ * bfq_delta - map service into the virtual time domain.
4468 -+ * @service: amount of service.
4469 -+ * @weight: scale factor (weight of an entity or weight sum).
4470 -+ */
4471 -+static inline u64 bfq_delta(unsigned long service,
4472 -+ unsigned long weight)
4473 -+{
4474 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4475 -+
4476 -+ do_div(d, weight);
4477 -+ return d;
4478 -+}
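
bfq_delta() is the core of the timestamping: the service is scaled by 2^22 (WFQ_SERVICE_SHIFT) and divided by the weight, so for the same number of sectors a heavier entity advances less in virtual time, which is how larger weights translate into larger shares. A short worked example with invented numbers (not part of the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define WFQ_SERVICE_SHIFT 22

    static uint64_t delta(unsigned long service, unsigned long weight)
    {
            return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
    }

    int main(void)
    {
            /* Same 8 units of service, weights 1 and 4 (both values made up). */
            printf("weight 1: %llu\n", (unsigned long long)delta(8, 1)); /* 33554432 */
            printf("weight 4: %llu\n", (unsigned long long)delta(8, 4)); /*  8388608 */
            return 0;
    }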
4479 -+
4480 -+/**
4481 -+ * bfq_calc_finish - assign the finish time to an entity.
4482 -+ * @entity: the entity to act upon.
4483 -+ * @service: the service to be charged to the entity.
4484 -+ */
4485 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
4486 -+ unsigned long service)
4487 -+{
4488 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4489 -+
4490 -+ BUG_ON(entity->weight == 0);
4491 -+
4492 -+ entity->finish = entity->start +
4493 -+ bfq_delta(service, entity->weight);
4494 -+
4495 -+ if (bfqq != NULL) {
4496 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4497 -+ "calc_finish: serv %lu, w %d",
4498 -+ service, entity->weight);
4499 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4500 -+ "calc_finish: start %llu, finish %llu, delta %llu",
4501 -+ entity->start, entity->finish,
4502 -+ bfq_delta(service, entity->weight));
4503 -+ }
4504 -+}
4505 -+
4506 -+/**
4507 -+ * bfq_entity_of - get an entity from a node.
4508 -+ * @node: the node field of the entity.
4509 -+ *
4510 -+ * Convert a node pointer to the corresponding entity. This is used only
4511 -+ * to simplify the logic of some functions and not as the generic
4512 -+ * conversion mechanism because, e.g., in the tree walking functions,
4513 -+ * the check for a %NULL value would be redundant.
4514 -+ */
4515 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4516 -+{
4517 -+ struct bfq_entity *entity = NULL;
4518 -+
4519 -+ if (node != NULL)
4520 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
4521 -+
4522 -+ return entity;
4523 -+}
4524 -+
4525 -+/**
4526 -+ * bfq_extract - remove an entity from a tree.
4527 -+ * @root: the tree root.
4528 -+ * @entity: the entity to remove.
4529 -+ */
4530 -+static inline void bfq_extract(struct rb_root *root,
4531 -+ struct bfq_entity *entity)
4532 -+{
4533 -+ BUG_ON(entity->tree != root);
4534 -+
4535 -+ entity->tree = NULL;
4536 -+ rb_erase(&entity->rb_node, root);
4537 -+}
4538 -+
4539 -+/**
4540 -+ * bfq_idle_extract - extract an entity from the idle tree.
4541 -+ * @st: the service tree of the owning @entity.
4542 -+ * @entity: the entity being removed.
4543 -+ */
4544 -+static void bfq_idle_extract(struct bfq_service_tree *st,
4545 -+ struct bfq_entity *entity)
4546 -+{
4547 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4548 -+ struct rb_node *next;
4549 -+
4550 -+ BUG_ON(entity->tree != &st->idle);
4551 -+
4552 -+ if (entity == st->first_idle) {
4553 -+ next = rb_next(&entity->rb_node);
4554 -+ st->first_idle = bfq_entity_of(next);
4555 -+ }
4556 -+
4557 -+ if (entity == st->last_idle) {
4558 -+ next = rb_prev(&entity->rb_node);
4559 -+ st->last_idle = bfq_entity_of(next);
4560 -+ }
4561 -+
4562 -+ bfq_extract(&st->idle, entity);
4563 -+
4564 -+ if (bfqq != NULL)
4565 -+ list_del(&bfqq->bfqq_list);
4566 -+}
4567 -+
4568 -+/**
4569 -+ * bfq_insert - generic tree insertion.
4570 -+ * @root: tree root.
4571 -+ * @entity: entity to insert.
4572 -+ *
4573 -+ * This is used for the idle and the active tree, since they are both
4574 -+ * ordered by finish time.
4575 -+ */
4576 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4577 -+{
4578 -+ struct bfq_entity *entry;
4579 -+ struct rb_node **node = &root->rb_node;
4580 -+ struct rb_node *parent = NULL;
4581 -+
4582 -+ BUG_ON(entity->tree != NULL);
4583 -+
4584 -+ while (*node != NULL) {
4585 -+ parent = *node;
4586 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
4587 -+
4588 -+ if (bfq_gt(entry->finish, entity->finish))
4589 -+ node = &parent->rb_left;
4590 -+ else
4591 -+ node = &parent->rb_right;
4592 -+ }
4593 -+
4594 -+ rb_link_node(&entity->rb_node, parent, node);
4595 -+ rb_insert_color(&entity->rb_node, root);
4596 -+
4597 -+ entity->tree = root;
4598 -+}
4599 -+
4600 -+/**
4601 -+ * bfq_update_min - update the min_start field of an entity.
4602 -+ * @entity: the entity to update.
4603 -+ * @node: one of its children.
4604 -+ *
4605 -+ * This function is called when @entity may store an invalid value for
4606 -+ * min_start due to updates to the active tree. The function assumes
4607 -+ * that the subtree rooted at @node (which may be its left or its right
4608 -+ * child) has a valid min_start value.
4609 -+ */
4610 -+static inline void bfq_update_min(struct bfq_entity *entity,
4611 -+ struct rb_node *node)
4612 -+{
4613 -+ struct bfq_entity *child;
4614 -+
4615 -+ if (node != NULL) {
4616 -+ child = rb_entry(node, struct bfq_entity, rb_node);
4617 -+ if (bfq_gt(entity->min_start, child->min_start))
4618 -+ entity->min_start = child->min_start;
4619 -+ }
4620 -+}
4621 -+
4622 -+/**
4623 -+ * bfq_update_active_node - recalculate min_start.
4624 -+ * @node: the node to update.
4625 -+ *
4626 -+ * @node may have changed position or one of its children may have moved,
4627 -+ * this function updates its min_start value. The left and right subtrees
4628 -+ * are assumed to hold a correct min_start value.
4629 -+ */
4630 -+static inline void bfq_update_active_node(struct rb_node *node)
4631 -+{
4632 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4633 -+
4634 -+ entity->min_start = entity->start;
4635 -+ bfq_update_min(entity, node->rb_right);
4636 -+ bfq_update_min(entity, node->rb_left);
4637 -+}
4638 -+
4639 -+/**
4640 -+ * bfq_update_active_tree - update min_start for the whole active tree.
4641 -+ * @node: the starting node.
4642 -+ *
4643 -+ * @node must be the deepest modified node after an update. This function
4644 -+ * updates its min_start using the values held by its children, assuming
4645 -+ * that they did not change, and then updates all the nodes that may have
4646 -+ * changed in the path to the root. The only nodes that may have changed
4647 -+ * are the ones in the path or their siblings.
4648 -+ */
4649 -+static void bfq_update_active_tree(struct rb_node *node)
4650 -+{
4651 -+ struct rb_node *parent;
4652 -+
4653 -+up:
4654 -+ bfq_update_active_node(node);
4655 -+
4656 -+ parent = rb_parent(node);
4657 -+ if (parent == NULL)
4658 -+ return;
4659 -+
4660 -+ if (node == parent->rb_left && parent->rb_right != NULL)
4661 -+ bfq_update_active_node(parent->rb_right);
4662 -+ else if (parent->rb_left != NULL)
4663 -+ bfq_update_active_node(parent->rb_left);
4664 -+
4665 -+ node = parent;
4666 -+ goto up;
4667 -+}
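
Together, bfq_update_active_node() and bfq_update_active_tree() maintain the augmented key of the active tree: every node caches the minimum start time found in its subtree, which is what later allows the eligible entity with the smallest finish time to be located in logarithmic time. A toy illustration of that cached-minimum idea on a plain binary tree (struct tnode is made up; the real code walks an rb-tree and compares with the wrap-safe bfq_gt()):

    #include <stdio.h>
    #include <stddef.h>

    struct tnode {
            unsigned long long start, min_start;
            struct tnode *left, *right;
    };

    /* Recompute the cached minimum from the node itself and its children. */
    static void update_min_start(struct tnode *n)
    {
            n->min_start = n->start;
            if (n->left && n->left->min_start < n->min_start)
                    n->min_start = n->left->min_start;
            if (n->right && n->right->min_start < n->min_start)
                    n->min_start = n->right->min_start;
    }

    int main(void)
    {
            struct tnode l = { 30, 30, NULL, NULL };
            struct tnode r = { 10, 10, NULL, NULL };
            struct tnode root = { 20, 20, &l, &r };

            update_min_start(&root);
            printf("root.min_start = %llu\n", root.min_start);      /* 10 */
            return 0;
    }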
4668 -+
4669 -+/**
4670 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
4671 -+ * @st: the service tree of the entity.
4672 -+ * @entity: the entity being inserted.
4673 -+ *
4674 -+ * The active tree is ordered by finish time, but an extra key is kept
4675 -+ * for each node, containing the minimum value for the start times of
4676 -+ * its children (and the node itself), so it's possible to search for
4677 -+ * the eligible node with the lowest finish time in logarithmic time.
4678 -+ */
4679 -+static void bfq_active_insert(struct bfq_service_tree *st,
4680 -+ struct bfq_entity *entity)
4681 -+{
4682 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4683 -+ struct rb_node *node = &entity->rb_node;
4684 -+
4685 -+ bfq_insert(&st->active, entity);
4686 -+
4687 -+ if (node->rb_left != NULL)
4688 -+ node = node->rb_left;
4689 -+ else if (node->rb_right != NULL)
4690 -+ node = node->rb_right;
4691 -+
4692 -+ bfq_update_active_tree(node);
4693 -+
4694 -+ if (bfqq != NULL)
4695 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4696 -+}
4697 -+
4698 -+/**
4699 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4700 -+ * @ioprio: the ioprio value to convert.
4701 -+ */
4702 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
4703 -+{
4704 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4705 -+ return IOPRIO_BE_NR - ioprio;
4706 -+}
4707 -+
4708 -+/**
4709 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4710 -+ * @weight: the weight value to convert.
4711 -+ *
4712 -+ * To preserve as much as possible the old only-ioprio user interface,
4713 -+ * 0 is used as an escape ioprio value for weights (numerically) equal to
4714 -+ * or larger than IOPRIO_BE_NR.
4715 -+ */
4716 -+static unsigned short bfq_weight_to_ioprio(int weight)
4717 -+{
4718 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4719 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4720 -+}
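
These two helpers map the best-effort ioprio levels 0..IOPRIO_BE_NR-1 onto weights IOPRIO_BE_NR..1 and back, with ioprio 0 reserved as the escape value for any weight of IOPRIO_BE_NR or more. A quick round-trip check of that mapping (standalone copies for illustration; IOPRIO_BE_NR is 8 for the best-effort class):

    #include <stdio.h>

    #define IOPRIO_BE_NR 8

    static int ioprio_to_weight(int ioprio)
    {
            return IOPRIO_BE_NR - ioprio;
    }

    static int weight_to_ioprio(int weight)
    {
            return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
    }

    int main(void)
    {
            int ioprio;

            for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                    printf("ioprio %d -> weight %d -> ioprio %d\n",
                           ioprio, ioprio_to_weight(ioprio),
                           weight_to_ioprio(ioprio_to_weight(ioprio)));
            /* Raised weights (e.g. 20) fall back to the escape ioprio 0. */
            printf("weight 20 -> ioprio %d\n", weight_to_ioprio(20));
            return 0;
    }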
4721 -+
4722 -+static inline void bfq_get_entity(struct bfq_entity *entity)
4723 -+{
4724 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4725 -+ struct bfq_sched_data *sd;
4726 -+
4727 -+ if (bfqq != NULL) {
4728 -+ sd = entity->sched_data;
4729 -+ atomic_inc(&bfqq->ref);
4730 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4731 -+ bfqq, atomic_read(&bfqq->ref));
4732 -+ }
4733 -+}
4734 -+
4735 -+/**
4736 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
4737 -+ * @node: the node being removed.
4738 -+ *
4739 -+ * Do the first step of an extraction in an rb tree, looking for the
4740 -+ * node that will replace @node, and returning the deepest node that
4741 -+ * the following modifications to the tree can touch. If @node is the
4742 -+ * last node in the tree return %NULL.
4743 -+ */
4744 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
4745 -+{
4746 -+ struct rb_node *deepest;
4747 -+
4748 -+ if (node->rb_right == NULL && node->rb_left == NULL)
4749 -+ deepest = rb_parent(node);
4750 -+ else if (node->rb_right == NULL)
4751 -+ deepest = node->rb_left;
4752 -+ else if (node->rb_left == NULL)
4753 -+ deepest = node->rb_right;
4754 -+ else {
4755 -+ deepest = rb_next(node);
4756 -+ if (deepest->rb_right != NULL)
4757 -+ deepest = deepest->rb_right;
4758 -+ else if (rb_parent(deepest) != node)
4759 -+ deepest = rb_parent(deepest);
4760 -+ }
4761 -+
4762 -+ return deepest;
4763 -+}
4764 -+
4765 -+/**
4766 -+ * bfq_active_extract - remove an entity from the active tree.
4767 -+ * @st: the service_tree containing the tree.
4768 -+ * @entity: the entity being removed.
4769 -+ */
4770 -+static void bfq_active_extract(struct bfq_service_tree *st,
4771 -+ struct bfq_entity *entity)
4772 -+{
4773 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4774 -+ struct rb_node *node;
4775 -+
4776 -+ node = bfq_find_deepest(&entity->rb_node);
4777 -+ bfq_extract(&st->active, entity);
4778 -+
4779 -+ if (node != NULL)
4780 -+ bfq_update_active_tree(node);
4781 -+
4782 -+ if (bfqq != NULL)
4783 -+ list_del(&bfqq->bfqq_list);
4784 -+}
4785 -+
4786 -+/**
4787 -+ * bfq_idle_insert - insert an entity into the idle tree.
4788 -+ * @st: the service tree containing the tree.
4789 -+ * @entity: the entity to insert.
4790 -+ */
4791 -+static void bfq_idle_insert(struct bfq_service_tree *st,
4792 -+ struct bfq_entity *entity)
4793 -+{
4794 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4795 -+ struct bfq_entity *first_idle = st->first_idle;
4796 -+ struct bfq_entity *last_idle = st->last_idle;
4797 -+
4798 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
4799 -+ st->first_idle = entity;
4800 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
4801 -+ st->last_idle = entity;
4802 -+
4803 -+ bfq_insert(&st->idle, entity);
4804 -+
4805 -+ if (bfqq != NULL)
4806 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
4807 -+}
4808 -+
4809 -+/**
4810 -+ * bfq_forget_entity - remove an entity from the wfq trees.
4811 -+ * @st: the service tree.
4812 -+ * @entity: the entity being removed.
4813 -+ *
4814 -+ * Update the device status and forget everything about @entity, putting
4815 -+ * the device reference to it, if it is a queue. Entities belonging to
4816 -+ * groups are not refcounted.
4817 -+ */
4818 -+static void bfq_forget_entity(struct bfq_service_tree *st,
4819 -+ struct bfq_entity *entity)
4820 -+{
4821 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4822 -+ struct bfq_sched_data *sd;
4823 -+
4824 -+ BUG_ON(!entity->on_st);
4825 -+
4826 -+ entity->on_st = 0;
4827 -+ st->wsum -= entity->weight;
4828 -+ if (bfqq != NULL) {
4829 -+ sd = entity->sched_data;
4830 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
4831 -+ bfqq, atomic_read(&bfqq->ref));
4832 -+ bfq_put_queue(bfqq);
4833 -+ }
4834 -+}
4835 -+
4836 -+/**
4837 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
4838 -+ * @st: service tree for the entity.
4839 -+ * @entity: the entity being released.
4840 -+ */
4841 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
4842 -+ struct bfq_entity *entity)
4843 -+{
4844 -+ bfq_idle_extract(st, entity);
4845 -+ bfq_forget_entity(st, entity);
4846 -+}
4847 -+
4848 -+/**
4849 -+ * bfq_forget_idle - update the idle tree if necessary.
4850 -+ * @st: the service tree to act upon.
4851 -+ *
4852 -+ * To preserve the global O(log N) complexity we only remove one entry here;
4853 -+ * as the idle tree will not grow indefinitely this can be done safely.
4854 -+ */
4855 -+static void bfq_forget_idle(struct bfq_service_tree *st)
4856 -+{
4857 -+ struct bfq_entity *first_idle = st->first_idle;
4858 -+ struct bfq_entity *last_idle = st->last_idle;
4859 -+
4860 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
4861 -+ !bfq_gt(last_idle->finish, st->vtime)) {
4862 -+ /*
4863 -+ * Forget the whole idle tree, increasing the vtime past
4864 -+ * the last finish time of idle entities.
4865 -+ */
4866 -+ st->vtime = last_idle->finish;
4867 -+ }
4868 -+
4869 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
4870 -+ bfq_put_idle_entity(st, first_idle);
4871 -+}
4872 -+
4873 -+static struct bfq_service_tree *
4874 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
4875 -+ struct bfq_entity *entity)
4876 -+{
4877 -+ struct bfq_service_tree *new_st = old_st;
4878 -+
4879 -+ if (entity->ioprio_changed) {
4880 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4881 -+
4882 -+ BUG_ON(old_st->wsum < entity->weight);
4883 -+ old_st->wsum -= entity->weight;
4884 -+
4885 -+ if (entity->new_weight != entity->orig_weight) {
4886 -+ entity->orig_weight = entity->new_weight;
4887 -+ entity->ioprio =
4888 -+ bfq_weight_to_ioprio(entity->orig_weight);
4889 -+ } else if (entity->new_ioprio != entity->ioprio) {
4890 -+ entity->ioprio = entity->new_ioprio;
4891 -+ entity->orig_weight =
4892 -+ bfq_ioprio_to_weight(entity->ioprio);
4893 -+ } else
4894 -+ entity->new_weight = entity->orig_weight =
4895 -+ bfq_ioprio_to_weight(entity->ioprio);
4896 -+
4897 -+ entity->ioprio_class = entity->new_ioprio_class;
4898 -+ entity->ioprio_changed = 0;
4899 -+
4900 -+ /*
4901 -+ * NOTE: here we may be changing the weight too early,
4902 -+ * this will cause unfairness. The correct approach
4903 -+ * would have required additional complexity to defer
4904 -+ * weight changes to the proper time instants (i.e.,
4905 -+ * when entity->finish <= old_st->vtime).
4906 -+ */
4907 -+ new_st = bfq_entity_service_tree(entity);
4908 -+ entity->weight = entity->orig_weight *
4909 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
4910 -+ new_st->wsum += entity->weight;
4911 -+
4912 -+ if (new_st != old_st)
4913 -+ entity->start = new_st->vtime;
4914 -+ }
4915 -+
4916 -+ return new_st;
4917 -+}
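The conversion helpers bfq_weight_to_ioprio() and bfq_ioprio_to_weight() used above are defined earlier in the patch and do not appear in this hunk. A minimal sketch of the kind of linear mapping they implement, with an assumed name prefix and an assumed number of priority levels, purely for illustration:

/* Illustrative only: a linear ioprio <-> weight mapping of the BFQ kind. */
#define SKETCH_IOPRIO_LEVELS 8	/* assumed: number of best-effort ioprio levels */

static inline unsigned short sketch_ioprio_to_weight(unsigned short ioprio)
{
	/* a lower ioprio value means a larger weight, hence more service */
	return SKETCH_IOPRIO_LEVELS - ioprio;
}

static inline unsigned short sketch_weight_to_ioprio(unsigned short weight)
{
	/* inverse mapping, clamped so the result remains a valid ioprio */
	return weight >= SKETCH_IOPRIO_LEVELS ? 0 : SKETCH_IOPRIO_LEVELS - weight;
}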
4918 -+
4919 -+/**
4920 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
4921 -+ * @bfqq: the queue being served.
4922 -+ * @served: bytes to transfer.
4923 -+ *
4924 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
4925 -+ * are synchronized every time a new bfqq is selected for service. For now,
4926 -+ * we keep it this way to better check consistency.
4927 -+ */
4928 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
4929 -+{
4930 -+ struct bfq_entity *entity = &bfqq->entity;
4931 -+ struct bfq_service_tree *st;
4932 -+
4933 -+ for_each_entity(entity) {
4934 -+ st = bfq_entity_service_tree(entity);
4935 -+
4936 -+ entity->service += served;
4937 -+ BUG_ON(entity->service > entity->budget);
4938 -+ BUG_ON(st->wsum == 0);
4939 -+
4940 -+ st->vtime += bfq_delta(served, st->wsum);
4941 -+ bfq_forget_idle(st);
4942 -+ }
4943 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
4944 -+}
4945 -+
4946 -+/**
4947 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
4948 -+ * @bfqq: the queue that needs a service update.
4949 -+ *
4950 -+ * When it's not possible to be fair in the service domain, because
4951 -+ * a queue is not consuming its budget fast enough (the meaning of
4952 -+ * fast depends on the timeout parameter), we charge it a full
4953 -+ * budget. In this way we should obtain a sort of time-domain
4954 -+ * fairness among all the seeky/slow queues.
4955 -+ */
4956 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
4957 -+{
4958 -+ struct bfq_entity *entity = &bfqq->entity;
4959 -+
4960 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
4961 -+
4962 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
4963 -+}
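A short worked example of the charge above: with entity->budget = 8192, entity->service = 2048 and st->wsum = 64, the remaining 6144 service units are fed to bfq_bfqq_served(), which advances st->vtime in proportion to served/wsum (bfq_delta() is defined earlier in the patch and not shown in this hunk; plain division is used below only as an approximation of its scaling):

/* Hedged sketch of the vtime bookkeeping done when a full budget is charged. */
static unsigned long sketch_charge_full_budget(unsigned long budget,
					       unsigned long service,
					       unsigned long wsum,
					       unsigned long *vtime)
{
	unsigned long remaining = budget - service;	/* service still charged */

	*vtime += remaining / wsum;	/* with the numbers above: 6144 / 64 = 96 */
	return remaining;
}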
4964 -+
4965 -+/**
4966 -+ * __bfq_activate_entity - activate an entity.
4967 -+ * @entity: the entity being activated.
4968 -+ *
4969 -+ * Called whenever an entity is activated, i.e., it is not active and one
4970 -+ * of its children receives a new request, or has to be reactivated due to
4971 -+ * budget exhaustion. It uses the current budget of the entity (and the
4972 -+ * service received if @entity is active) of the queue to calculate its
4973 -+ * timestamps.
4974 -+ */
4975 -+static void __bfq_activate_entity(struct bfq_entity *entity)
4976 -+{
4977 -+ struct bfq_sched_data *sd = entity->sched_data;
4978 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
4979 -+
4980 -+ if (entity == sd->active_entity) {
4981 -+ BUG_ON(entity->tree != NULL);
4982 -+ /*
4983 -+ * If we are requeueing the current entity we have
4984 -+ * to take care of not charging to it service it has
4985 -+ * not received.
4986 -+ */
4987 -+ bfq_calc_finish(entity, entity->service);
4988 -+ entity->start = entity->finish;
4989 -+ sd->active_entity = NULL;
4990 -+ } else if (entity->tree == &st->active) {
4991 -+ /*
4992 -+ * Requeueing an entity due to a change of some
4993 -+ * next_active entity below it. We reuse the old
4994 -+ * start time.
4995 -+ */
4996 -+ bfq_active_extract(st, entity);
4997 -+ } else if (entity->tree == &st->idle) {
4998 -+ /*
4999 -+ * Must be on the idle tree, bfq_idle_extract() will
5000 -+ * check for that.
5001 -+ */
5002 -+ bfq_idle_extract(st, entity);
5003 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
5004 -+ st->vtime : entity->finish;
5005 -+ } else {
5006 -+ /*
5007 -+ * The finish time of the entity may be invalid, and
5008 -+ * it is in the past for sure, otherwise the queue
5009 -+ * would have been on the idle tree.
5010 -+ */
5011 -+ entity->start = st->vtime;
5012 -+ st->wsum += entity->weight;
5013 -+ bfq_get_entity(entity);
5014 -+
5015 -+ BUG_ON(entity->on_st);
5016 -+ entity->on_st = 1;
5017 -+ }
5018 -+
5019 -+ st = __bfq_entity_update_weight_prio(st, entity);
5020 -+ bfq_calc_finish(entity, entity->budget);
5021 -+ bfq_active_insert(st, entity);
5022 -+}
5023 -+
5024 -+/**
5025 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5026 -+ * @entity: the entity to activate.
5027 -+ *
5028 -+ * Activate @entity and all the entities on the path from it to the root.
5029 -+ */
5030 -+static void bfq_activate_entity(struct bfq_entity *entity)
5031 -+{
5032 -+ struct bfq_sched_data *sd;
5033 -+
5034 -+ for_each_entity(entity) {
5035 -+ __bfq_activate_entity(entity);
5036 -+
5037 -+ sd = entity->sched_data;
5038 -+ if (!bfq_update_next_active(sd))
5039 -+ /*
5040 -+ * No need to propagate the activation to the
5041 -+ * upper entities, as they will be updated when
5042 -+ * the active entity is rescheduled.
5043 -+ */
5044 -+ break;
5045 -+ }
5046 -+}
5047 -+
5048 -+/**
5049 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5050 -+ * @entity: the entity to deactivate.
5051 -+ * @requeue: if false, the entity will not be put into the idle tree.
5052 -+ *
5053 -+ * Deactivate an entity, independently from its previous state. If the
5054 -+ * entity was not on a service tree just return, otherwise if it is on
5055 -+ * any scheduler tree, extract it from that tree, and if necessary
5056 -+ * and if the caller did not specify @requeue, put it on the idle tree.
5057 -+ *
5058 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
5059 -+ * if the entity was under service or if it was the next_active for
5060 -+ * its sched_data; return %0 otherwise.
5061 -+ */
5062 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5063 -+{
5064 -+ struct bfq_sched_data *sd = entity->sched_data;
5065 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5066 -+ int was_active = entity == sd->active_entity;
5067 -+ int ret = 0;
5068 -+
5069 -+ if (!entity->on_st)
5070 -+ return 0;
5071 -+
5072 -+ BUG_ON(was_active && entity->tree != NULL);
5073 -+
5074 -+ if (was_active) {
5075 -+ bfq_calc_finish(entity, entity->service);
5076 -+ sd->active_entity = NULL;
5077 -+ } else if (entity->tree == &st->active)
5078 -+ bfq_active_extract(st, entity);
5079 -+ else if (entity->tree == &st->idle)
5080 -+ bfq_idle_extract(st, entity);
5081 -+ else if (entity->tree != NULL)
5082 -+ BUG();
5083 -+
5084 -+ if (was_active || sd->next_active == entity)
5085 -+ ret = bfq_update_next_active(sd);
5086 -+
5087 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5088 -+ bfq_forget_entity(st, entity);
5089 -+ else
5090 -+ bfq_idle_insert(st, entity);
5091 -+
5092 -+ BUG_ON(sd->active_entity == entity);
5093 -+ BUG_ON(sd->next_active == entity);
5094 -+
5095 -+ return ret;
5096 -+}
5097 -+
5098 -+/**
5099 -+ * bfq_deactivate_entity - deactivate an entity.
5100 -+ * @entity: the entity to deactivate.
5101 -+ * @requeue: true if the entity can be put on the idle tree
5102 -+ */
5103 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5104 -+{
5105 -+ struct bfq_sched_data *sd;
5106 -+ struct bfq_entity *parent;
5107 -+
5108 -+ for_each_entity_safe(entity, parent) {
5109 -+ sd = entity->sched_data;
5110 -+
5111 -+ if (!__bfq_deactivate_entity(entity, requeue))
5112 -+ /*
5113 -+ * The parent entity is still backlogged, and
5114 -+ * we don't need to update it as it is still
5115 -+ * under service.
5116 -+ */
5117 -+ break;
5118 -+
5119 -+ if (sd->next_active != NULL)
5120 -+ /*
5121 -+ * The parent entity is still backlogged and
5122 -+ * the budgets on the path towards the root
5123 -+ * need to be updated.
5124 -+ */
5125 -+ goto update;
5126 -+
5127 -+ /*
5128 -+ * If we reach here, the parent is no longer backlogged and
5129 -+ * we want to propagate the dequeue upwards.
5130 -+ */
5131 -+ requeue = 1;
5132 -+ }
5133 -+
5134 -+ return;
5135 -+
5136 -+update:
5137 -+ entity = parent;
5138 -+ for_each_entity(entity) {
5139 -+ __bfq_activate_entity(entity);
5140 -+
5141 -+ sd = entity->sched_data;
5142 -+ if (!bfq_update_next_active(sd))
5143 -+ break;
5144 -+ }
5145 -+}
5146 -+
5147 -+/**
5148 -+ * bfq_update_vtime - update vtime if necessary.
5149 -+ * @st: the service tree to act upon.
5150 -+ *
5151 -+ * If necessary update the service tree vtime to have at least one
5152 -+ * eligible entity, skipping to its start time. Assumes that the
5153 -+ * active tree of the device is not empty.
5154 -+ *
5155 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
5156 -+ * we may end up with reactivated tasks getting timestamps after a
5157 -+ * vtime skip done because we needed a ->first_active entity on some
5158 -+ * intermediate node.
5159 -+ */
5160 -+static void bfq_update_vtime(struct bfq_service_tree *st)
5161 -+{
5162 -+ struct bfq_entity *entry;
5163 -+ struct rb_node *node = st->active.rb_node;
5164 -+
5165 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5166 -+ if (bfq_gt(entry->min_start, st->vtime)) {
5167 -+ st->vtime = entry->min_start;
5168 -+ bfq_forget_idle(st);
5169 -+ }
5170 -+}
5171 -+
5172 -+/**
5173 -+ * bfq_first_active - find the eligible entity with the smallest finish time
5174 -+ * @st: the service tree to select from.
5175 -+ *
5176 -+ * This function searches the first schedulable entity, starting from the
5177 -+ * root of the tree and going on the left every time on this side there is
5178 -+ * a subtree with at least one eligible (start <= vtime) entity. The path
5179 -+ * on the right is followed only if a) the left subtree contains no eligible
5180 -+ * entities and b) no eligible entity has been found yet.
5181 -+ */
5182 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5183 -+{
5184 -+ struct bfq_entity *entry, *first = NULL;
5185 -+ struct rb_node *node = st->active.rb_node;
5186 -+
5187 -+ while (node != NULL) {
5188 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5189 -+left:
5190 -+ if (!bfq_gt(entry->start, st->vtime))
5191 -+ first = entry;
5192 -+
5193 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5194 -+
5195 -+ if (node->rb_left != NULL) {
5196 -+ entry = rb_entry(node->rb_left,
5197 -+ struct bfq_entity, rb_node);
5198 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
5199 -+ node = node->rb_left;
5200 -+ goto left;
5201 -+ }
5202 -+ }
5203 -+ if (first != NULL)
5204 -+ break;
5205 -+ node = node->rb_right;
5206 -+ }
5207 -+
5208 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5209 -+ return first;
5210 -+}
5211 -+
5212 -+/**
5213 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5214 -+ * @st: the service tree.
5215 -+ *
5216 -+ * Update the virtual time in @st and return the first eligible entity
5217 -+ * it contains.
5218 -+ */
5219 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5220 -+ bool force)
5221 -+{
5222 -+ struct bfq_entity *entity, *new_next_active = NULL;
5223 -+
5224 -+ if (RB_EMPTY_ROOT(&st->active))
5225 -+ return NULL;
5226 -+
5227 -+ bfq_update_vtime(st);
5228 -+ entity = bfq_first_active_entity(st);
5229 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
5230 -+
5231 -+ /*
5232 -+ * If the chosen entity does not match with the sched_data's
5233 -+ * next_active and we are forcedly serving the IDLE priority
5234 -+ * class tree, bubble up budget update.
5235 -+ */
5236 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
5237 -+ new_next_active = entity;
5238 -+ for_each_entity(new_next_active)
5239 -+ bfq_update_budget(new_next_active);
5240 -+ }
5241 -+
5242 -+ return entity;
5243 -+}
5244 -+
5245 -+/**
5246 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5247 -+ * @sd: the sched_data.
5248 -+ * @extract: if true the returned entity will be also extracted from @sd.
5249 -+ *
5250 -+ * NOTE: since we cache the next_active entity at each level of the
5251 -+ * hierarchy, the complexity of the lookup can be decreased with
5252 -+ * absolutely no effort just returning the cached next_active value;
5253 -+ * we prefer to do full lookups to test the consistency of the data
5254 -+ * structures.
5255 -+ */
5256 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5257 -+ int extract,
5258 -+ struct bfq_data *bfqd)
5259 -+{
5260 -+ struct bfq_service_tree *st = sd->service_tree;
5261 -+ struct bfq_entity *entity;
5262 -+ int i = 0;
5263 -+
5264 -+ BUG_ON(sd->active_entity != NULL);
5265 -+
5266 -+ if (bfqd != NULL &&
5267 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5268 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true);
5269 -+ if (entity != NULL) {
5270 -+ i = BFQ_IOPRIO_CLASSES - 1;
5271 -+ bfqd->bfq_class_idle_last_service = jiffies;
5272 -+ sd->next_active = entity;
5273 -+ }
5274 -+ }
5275 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5276 -+ entity = __bfq_lookup_next_entity(st + i, false);
5277 -+ if (entity != NULL) {
5278 -+ if (extract) {
5279 -+ bfq_check_next_active(sd, entity);
5280 -+ bfq_active_extract(st + i, entity);
5281 -+ sd->active_entity = entity;
5282 -+ sd->next_active = NULL;
5283 -+ }
5284 -+ break;
5285 -+ }
5286 -+ }
5287 -+
5288 -+ return entity;
5289 -+}
5290 -+
5291 -+/*
5292 -+ * Get next queue for service.
5293 -+ */
5294 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5295 -+{
5296 -+ struct bfq_entity *entity = NULL;
5297 -+ struct bfq_sched_data *sd;
5298 -+ struct bfq_queue *bfqq;
5299 -+
5300 -+ BUG_ON(bfqd->active_queue != NULL);
5301 -+
5302 -+ if (bfqd->busy_queues == 0)
5303 -+ return NULL;
5304 -+
5305 -+ sd = &bfqd->root_group->sched_data;
5306 -+ for (; sd != NULL; sd = entity->my_sched_data) {
5307 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5308 -+ BUG_ON(entity == NULL);
5309 -+ entity->service = 0;
5310 -+ }
5311 -+
5312 -+ bfqq = bfq_entity_to_bfqq(entity);
5313 -+ BUG_ON(bfqq == NULL);
5314 -+
5315 -+ return bfqq;
5316 -+}
5317 -+
5318 -+/*
5319 -+ * Forced extraction of the given queue.
5320 -+ */
5321 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5322 -+ struct bfq_queue *bfqq)
5323 -+{
5324 -+ struct bfq_entity *entity;
5325 -+ struct bfq_sched_data *sd;
5326 -+
5327 -+ BUG_ON(bfqd->active_queue != NULL);
5328 -+
5329 -+ entity = &bfqq->entity;
5330 -+ /*
5331 -+ * Bubble up extraction/update from the leaf to the root.
5332 -+ */
5333 -+ for_each_entity(entity) {
5334 -+ sd = entity->sched_data;
5335 -+ bfq_update_budget(entity);
5336 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
5337 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5338 -+ sd->active_entity = entity;
5339 -+ sd->next_active = NULL;
5340 -+ entity->service = 0;
5341 -+ }
5342 -+
5343 -+ return;
5344 -+}
5345 -+
5346 -+static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
5347 -+{
5348 -+ if (bfqd->active_bic != NULL) {
5349 -+ put_io_context(bfqd->active_bic->icq.ioc);
5350 -+ bfqd->active_bic = NULL;
5351 -+ }
5352 -+
5353 -+ bfqd->active_queue = NULL;
5354 -+ del_timer(&bfqd->idle_slice_timer);
5355 -+}
5356 -+
5357 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5358 -+ int requeue)
5359 -+{
5360 -+ struct bfq_entity *entity = &bfqq->entity;
5361 -+
5362 -+ if (bfqq == bfqd->active_queue)
5363 -+ __bfq_bfqd_reset_active(bfqd);
5364 -+
5365 -+ bfq_deactivate_entity(entity, requeue);
5366 -+}
5367 -+
5368 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5369 -+{
5370 -+ struct bfq_entity *entity = &bfqq->entity;
5371 -+
5372 -+ bfq_activate_entity(entity);
5373 -+}
5374 -+
5375 -+/*
5376 -+ * Called when the bfqq no longer has requests pending, remove it from
5377 -+ * the service tree.
5378 -+ */
5379 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5380 -+ int requeue)
5381 -+{
5382 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
5383 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5384 -+
5385 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5386 -+
5387 -+ bfq_clear_bfqq_busy(bfqq);
5388 -+
5389 -+ BUG_ON(bfqd->busy_queues == 0);
5390 -+ bfqd->busy_queues--;
5391 -+
5392 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5393 -+}
5394 -+
5395 -+/*
5396 -+ * Called when an inactive queue receives a new request.
5397 -+ */
5398 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5399 -+{
5400 -+ BUG_ON(bfq_bfqq_busy(bfqq));
5401 -+ BUG_ON(bfqq == bfqd->active_queue);
5402 -+
5403 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5404 -+
5405 -+ bfq_activate_bfqq(bfqd, bfqq);
5406 -+
5407 -+ bfq_mark_bfqq_busy(bfqq);
5408 -+ bfqd->busy_queues++;
5409 -+}
5410 -diff --git a/block/bfq.h b/block/bfq.h
5411 -new file mode 100644
5412 -index 0000000..48ecde9
5413 ---- /dev/null
5414 -+++ b/block/bfq.h
5415 -@@ -0,0 +1,603 @@
5416 -+/*
5417 -+ * BFQ-v6r2 for 3.10.0: data structures and common functions prototypes.
5418 -+ *
5419 -+ * Based on ideas and code from CFQ:
5420 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5421 -+ *
5422 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5423 -+ * Paolo Valente <paolo.valente@×××××××.it>
5424 -+ *
5425 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5426 -+ */
5427 -+
5428 -+#ifndef _BFQ_H
5429 -+#define _BFQ_H
5430 -+
5431 -+#include <linux/blktrace_api.h>
5432 -+#include <linux/hrtimer.h>
5433 -+#include <linux/ioprio.h>
5434 -+#include <linux/rbtree.h>
5435 -+
5436 -+#define BFQ_IOPRIO_CLASSES 3
5437 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5438 -+
5439 -+#define BFQ_MIN_WEIGHT 1
5440 -+#define BFQ_MAX_WEIGHT 1000
5441 -+
5442 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
5443 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
5444 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5445 -+
5446 -+struct bfq_entity;
5447 -+
5448 -+/**
5449 -+ * struct bfq_service_tree - per ioprio_class service tree.
5450 -+ * @active: tree for active entities (i.e., those backlogged).
5451 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5452 -+ * @first_idle: idle entity with minimum F_i.
5453 -+ * @last_idle: idle entity with maximum F_i.
5454 -+ * @vtime: scheduler virtual time.
5455 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5456 -+ *
5457 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5458 -+ * ioprio_class has its own independent scheduler, and so its own
5459 -+ * bfq_service_tree. All the fields are protected by the queue lock
5460 -+ * of the containing bfqd.
5461 -+ */
5462 -+struct bfq_service_tree {
5463 -+ struct rb_root active;
5464 -+ struct rb_root idle;
5465 -+
5466 -+ struct bfq_entity *first_idle;
5467 -+ struct bfq_entity *last_idle;
5468 -+
5469 -+ u64 vtime;
5470 -+ unsigned long wsum;
5471 -+};
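To make the timestamps concrete: as documented below for struct bfq_entity, F_i = S_i + budget/weight. Two backlogged entities that both start at S = 0 with a budget of 4096 service units, one with weight 100 and one with weight 10, obtain finish times of roughly 41 and 410 respectively, so B-WF2Q+ selects the heavier entity about ten times as often before the lighter one becomes the minimum-finish choice again.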
5472 -+
5473 -+/**
5474 -+ * struct bfq_sched_data - multi-class scheduler.
5475 -+ * @active_entity: entity under service.
5476 -+ * @next_active: head-of-the-line entity in the scheduler.
5477 -+ * @service_tree: array of service trees, one per ioprio_class.
5478 -+ *
5479 -+ * bfq_sched_data is the basic scheduler queue. It supports three
5480 -+ * ioprio_classes, and can be used either as a toplevel queue or as
5481 -+ * an intermediate queue on a hierarchical setup.
5482 -+ * @next_active points to the active entity of the sched_data service
5483 -+ * trees that will be scheduled next.
5484 -+ *
5485 -+ * The supported ioprio_classes are the same as in CFQ, in descending
5486 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5487 -+ * Requests from higher priority queues are served before all the
5488 -+ * requests from lower priority queues; among requests of the same
5489 -+ * queue requests are served according to B-WF2Q+.
5490 -+ * All the fields are protected by the queue lock of the containing bfqd.
5491 -+ */
5492 -+struct bfq_sched_data {
5493 -+ struct bfq_entity *active_entity;
5494 -+ struct bfq_entity *next_active;
5495 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5496 -+};
5497 -+
5498 -+/**
5499 -+ * struct bfq_entity - schedulable entity.
5500 -+ * @rb_node: service_tree member.
5501 -+ * @on_st: flag, true if the entity is on a tree (either the active or
5502 -+ * the idle one of its service_tree).
5503 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5504 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
5505 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5506 -+ * @min_start: minimum start time of the (active) subtree rooted at
5507 -+ * this entity; used for O(log N) lookups into active trees.
5508 -+ * @service: service received during the last round of service.
5509 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5510 -+ * @weight: weight of the queue
5511 -+ * @parent: parent entity, for hierarchical scheduling.
5512 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5513 -+ * associated scheduler queue, %NULL on leaf nodes.
5514 -+ * @sched_data: the scheduler queue this entity belongs to.
5515 -+ * @ioprio: the ioprio in use.
5516 -+ * @new_weight: when a weight change is requested, the new weight value.
5517 -+ * @orig_weight: original weight, used to implement weight boosting
5518 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5519 -+ * @ioprio_class: the ioprio_class in use.
5520 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
5521 -+ * ioprio_class value.
5522 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5523 -+ * ioprio_class change.
5524 -+ *
5525 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5526 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5527 -+ * entity belongs to the sched_data of the parent group in the cgroup
5528 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
5529 -+ * in @my_sched_data.
5530 -+ *
5531 -+ * Each entity stores independently its priority values; this would
5532 -+ * allow different weights on different devices, but this
5533 -+ * functionality is not exported to userspace by now. Priorities and
5534 -+ * weights are updated lazily, first storing the new values into the
5535 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
5536 -+ * there is a transition in the entity state that allows the priority
5537 -+ * update to take place the effective and the requested priority
5538 -+ * values are synchronized.
5539 -+ *
5540 -+ * Unless cgroups are used, the weight value is calculated from the
5541 -+ * ioprio to export the same interface as CFQ. When dealing with
5542 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
5543 -+ * time consuming their budget and have true sequential behavior, and
5544 -+ * when there are no external factors breaking anticipation) the
5545 -+ * relative weights at each level of the cgroups hierarchy should be
5546 -+ * guaranteed. All the fields are protected by the queue lock of the
5547 -+ * containing bfqd.
5548 -+ */
5549 -+struct bfq_entity {
5550 -+ struct rb_node rb_node;
5551 -+
5552 -+ int on_st;
5553 -+
5554 -+ u64 finish;
5555 -+ u64 start;
5556 -+
5557 -+ struct rb_root *tree;
5558 -+
5559 -+ u64 min_start;
5560 -+
5561 -+ unsigned long service, budget;
5562 -+ unsigned short weight, new_weight;
5563 -+ unsigned short orig_weight;
5564 -+
5565 -+ struct bfq_entity *parent;
5566 -+
5567 -+ struct bfq_sched_data *my_sched_data;
5568 -+ struct bfq_sched_data *sched_data;
5569 -+
5570 -+ unsigned short ioprio, new_ioprio;
5571 -+ unsigned short ioprio_class, new_ioprio_class;
5572 -+
5573 -+ int ioprio_changed;
5574 -+};
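The lazy-update protocol described above boils down to writing the new_* fields and raising the flag; __bfq_entity_update_weight_prio() then applies the change on the next (re)activation. A minimal sketch, with an illustrative helper name that is not part of the patch:

/* Illustrative sketch: stage an ioprio change, to be applied lazily. */
static void sketch_request_ioprio_change(struct bfq_entity *entity,
					 unsigned short ioprio,
					 unsigned short ioprio_class)
{
	entity->new_ioprio = ioprio;
	entity->new_ioprio_class = ioprio_class;
	entity->ioprio_changed = 1;	/* consumed at the next activation */
}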
5575 -+
5576 -+struct bfq_group;
5577 -+
5578 -+/**
5579 -+ * struct bfq_queue - leaf schedulable entity.
5580 -+ * @ref: reference counter.
5581 -+ * @bfqd: parent bfq_data.
5582 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
5583 -+ * one or more other queues.
5584 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5585 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5586 -+ * @sort_list: sorted list of pending requests.
5587 -+ * @next_rq: if fifo isn't expired, next request to serve.
5588 -+ * @queued: nr of requests queued in @sort_list.
5589 -+ * @allocated: currently allocated requests.
5590 -+ * @meta_pending: pending metadata requests.
5591 -+ * @fifo: fifo list of requests in sort_list.
5592 -+ * @entity: entity representing this queue in the scheduler.
5593 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
5594 -+ * @budget_timeout: budget expiration (in jiffies).
5595 -+ * @dispatched: number of requests on the dispatch list or inside driver.
5596 -+ * @org_ioprio: saved ioprio during boosted periods.
5597 -+ * @flags: status flags.
5598 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5599 -+ * @seek_samples: number of seeks sampled
5600 -+ * @seek_total: sum of the distances of the seeks sampled
5601 -+ * @seek_mean: mean seek distance
5602 -+ * @last_request_pos: position of the last request enqueued
5603 -+ * @pid: pid of the process owning the queue, used for logging purposes.
5604 -+ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
5605 -+ * @raising_cur_max_time: current max raising time for this queue
5606 -+ *
5607 -+ * A bfq_queue is a leaf request queue; it can be associated with one or
5608 -+ * more io_contexts (if it is an async one). @cgroup holds a reference to the
5609 -+ * cgroup, to be sure that it does not disappear while a bfqq still
5610 -+ * references it (mostly to avoid races between request issuing and task
5611 -+ * migration followed by cgroup destruction).
5612 -+ * All the fields are protected by the queue lock of the containing bfqd.
5613 -+ */
5614 -+struct bfq_queue {
5615 -+ atomic_t ref;
5616 -+ struct bfq_data *bfqd;
5617 -+
5618 -+ /* fields for cooperating queues handling */
5619 -+ struct bfq_queue *new_bfqq;
5620 -+ struct rb_node pos_node;
5621 -+ struct rb_root *pos_root;
5622 -+
5623 -+ struct rb_root sort_list;
5624 -+ struct request *next_rq;
5625 -+ int queued[2];
5626 -+ int allocated[2];
5627 -+ int meta_pending;
5628 -+ struct list_head fifo;
5629 -+
5630 -+ struct bfq_entity entity;
5631 -+
5632 -+ unsigned long max_budget;
5633 -+ unsigned long budget_timeout;
5634 -+
5635 -+ int dispatched;
5636 -+
5637 -+ unsigned short org_ioprio;
5638 -+
5639 -+ unsigned int flags;
5640 -+
5641 -+ struct list_head bfqq_list;
5642 -+
5643 -+ unsigned int seek_samples;
5644 -+ u64 seek_total;
5645 -+ sector_t seek_mean;
5646 -+ sector_t last_request_pos;
5647 -+
5648 -+ pid_t pid;
5649 -+
5650 -+ /* weight-raising fields */
5651 -+ unsigned int raising_cur_max_time;
5652 -+ u64 last_rais_start_finish, soft_rt_next_start;
5653 -+ unsigned int raising_coeff;
5654 -+};
5655 -+
5656 -+/**
5657 -+ * struct bfq_ttime - per process thinktime stats.
5658 -+ * @ttime_total: total process thinktime
5659 -+ * @ttime_samples: number of thinktime samples
5660 -+ * @ttime_mean: average process thinktime
5661 -+ */
5662 -+struct bfq_ttime {
5663 -+ unsigned long last_end_request;
5664 -+
5665 -+ unsigned long ttime_total;
5666 -+ unsigned long ttime_samples;
5667 -+ unsigned long ttime_mean;
5668 -+};
5669 -+
5670 -+/**
5671 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
5672 -+ * @icq: associated io_cq structure
5673 -+ * @bfqq: array of two process queues, the sync and the async
5674 -+ * @ttime: associated @bfq_ttime struct
5675 -+ */
5676 -+struct bfq_io_cq {
5677 -+ struct io_cq icq; /* must be the first member */
5678 -+ struct bfq_queue *bfqq[2];
5679 -+ struct bfq_ttime ttime;
5680 -+ int ioprio;
5681 -+};
5682 -+
5683 -+/**
5684 -+ * struct bfq_data - per device data structure.
5685 -+ * @queue: request queue for the managed device.
5686 -+ * @root_group: root bfq_group for the device.
5687 -+ * @rq_pos_tree: rbtree sorted by next_request position,
5688 -+ * used when determining if two or more queues
5689 -+ * have interleaving requests (see bfq_close_cooperator).
5690 -+ * @busy_queues: number of bfq_queues containing requests (including the
5691 -+ * queue under service, even if it is idling).
5692 -+ * @queued: number of queued requests.
5693 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
5694 -+ * @sync_flight: number of sync requests in the driver.
5695 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5696 -+ * completed requests.
5697 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5698 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5699 -+ * @budgets_assigned: number of budgets assigned.
5700 -+ * @idle_slice_timer: timer set when idling for the next sequential request
5701 -+ * from the queue under service.
5702 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
5703 -+ * @active_queue: bfq_queue under service.
5704 -+ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
5705 -+ * @last_position: on-disk position of the last served request.
5706 -+ * @last_budget_start: beginning of the last budget.
5707 -+ * @last_idling_start: beginning of the last idle slice.
5708 -+ * @peak_rate: peak transfer rate observed for a budget.
5709 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5710 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5711 -+ * @group_list: list of all the bfq_groups active on the device.
5712 -+ * @active_list: list of all the bfq_queues active on the device.
5713 -+ * @idle_list: list of all the bfq_queues idle on the device.
5714 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
5715 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5716 -+ * requests are served in fifo order.
5717 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5718 -+ * @bfq_back_max: maximum allowed backward seek.
5719 -+ * @bfq_slice_idle: maximum idling time.
5720 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5721 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5722 -+ * async queues.
5723 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5724 -+ * prevent seeky queues from imposing long latencies on
5725 -+ * well-behaved ones (this also implies that seeky queues cannot
5726 -+ * receive guarantees in the service domain; after a timeout
5727 -+ * they are charged for the whole allocated budget, to try
5728 -+ * to preserve a behavior reasonably fair among them, but
5729 -+ * without service-domain guarantees).
5730 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5731 -+ * queue is multiplied
5732 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5733 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5734 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5735 -+ * may be reactivated for a queue (in jiffies)
5736 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5737 -+ * after which weight-raising may be
5738 -+ * reactivated for an already busy queue
5739 -+ * (in jiffies)
5740 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5741 -+ * in sectors per second
5742 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
5743 -+ * duration of the weight raising automatically
5744 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5745 -+ *
5746 -+ * All the fields are protected by the @queue lock.
5747 -+ */
5748 -+struct bfq_data {
5749 -+ struct request_queue *queue;
5750 -+
5751 -+ struct bfq_group *root_group;
5752 -+
5753 -+ struct rb_root rq_pos_tree;
5754 -+
5755 -+ int busy_queues;
5756 -+ int queued;
5757 -+ int rq_in_driver;
5758 -+ int sync_flight;
5759 -+
5760 -+ int max_rq_in_driver;
5761 -+ int hw_tag_samples;
5762 -+ int hw_tag;
5763 -+
5764 -+ int budgets_assigned;
5765 -+
5766 -+ struct timer_list idle_slice_timer;
5767 -+ struct work_struct unplug_work;
5768 -+
5769 -+ struct bfq_queue *active_queue;
5770 -+ struct bfq_io_cq *active_bic;
5771 -+
5772 -+ sector_t last_position;
5773 -+
5774 -+ ktime_t last_budget_start;
5775 -+ ktime_t last_idling_start;
5776 -+ int peak_rate_samples;
5777 -+ u64 peak_rate;
5778 -+ unsigned long bfq_max_budget;
5779 -+
5780 -+ struct hlist_head group_list;
5781 -+ struct list_head active_list;
5782 -+ struct list_head idle_list;
5783 -+
5784 -+ unsigned int bfq_quantum;
5785 -+ unsigned int bfq_fifo_expire[2];
5786 -+ unsigned int bfq_back_penalty;
5787 -+ unsigned int bfq_back_max;
5788 -+ unsigned int bfq_slice_idle;
5789 -+ u64 bfq_class_idle_last_service;
5790 -+
5791 -+ unsigned int bfq_user_max_budget;
5792 -+ unsigned int bfq_max_budget_async_rq;
5793 -+ unsigned int bfq_timeout[2];
5794 -+
5795 -+ bool low_latency;
5796 -+
5797 -+ /* parameters of the low_latency heuristics */
5798 -+ unsigned int bfq_raising_coeff;
5799 -+ unsigned int bfq_raising_max_time;
5800 -+ unsigned int bfq_raising_rt_max_time;
5801 -+ unsigned int bfq_raising_min_idle_time;
5802 -+ unsigned int bfq_raising_min_inter_arr_async;
5803 -+ unsigned int bfq_raising_max_softrt_rate;
5804 -+ u64 RT_prod;
5805 -+
5806 -+ struct bfq_queue oom_bfqq;
5807 -+};
5808 -+
5809 -+enum bfqq_state_flags {
5810 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
5811 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
5812 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
5813 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
5814 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
5815 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
5816 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
5817 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
5818 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
5819 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
5820 -+ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
5821 -+};
5822 -+
5823 -+#define BFQ_BFQQ_FNS(name) \
5824 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
5825 -+{ \
5826 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
5827 -+} \
5828 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
5829 -+{ \
5830 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
5831 -+} \
5832 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
5833 -+{ \
5834 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
5835 -+}
5836 -+
5837 -+BFQ_BFQQ_FNS(busy);
5838 -+BFQ_BFQQ_FNS(wait_request);
5839 -+BFQ_BFQQ_FNS(must_alloc);
5840 -+BFQ_BFQQ_FNS(fifo_expire);
5841 -+BFQ_BFQQ_FNS(idle_window);
5842 -+BFQ_BFQQ_FNS(prio_changed);
5843 -+BFQ_BFQQ_FNS(sync);
5844 -+BFQ_BFQQ_FNS(budget_new);
5845 -+BFQ_BFQQ_FNS(coop);
5846 -+BFQ_BFQQ_FNS(split_coop);
5847 -+BFQ_BFQQ_FNS(some_coop_idle);
5848 -+#undef BFQ_BFQQ_FNS
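For readers unfamiliar with the idiom, each BFQ_BFQQ_FNS(name) invocation above expands to three small helpers; BFQ_BFQQ_FNS(busy), for instance, yields (up to whitespace):

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}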
5849 -+
5850 -+/* Logging facilities. */
5851 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
5852 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
5853 -+
5854 -+#define bfq_log(bfqd, fmt, args...) \
5855 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
5856 -+
5857 -+/* Expiration reasons. */
5858 -+enum bfqq_expiration {
5859 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
5860 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
5861 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
5862 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
5863 -+};
5864 -+
5865 -+#ifdef CONFIG_CGROUP_BFQIO
5866 -+/**
5867 -+ * struct bfq_group - per (device, cgroup) data structure.
5868 -+ * @entity: schedulable entity to insert into the parent group sched_data.
5869 -+ * @sched_data: own sched_data, to contain child entities (they may be
5870 -+ * both bfq_queues and bfq_groups).
5871 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
5872 -+ * list of the containing cgroup's bfqio_cgroup.
5873 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
5874 -+ * of the groups active on the same device; used for cleanup.
5875 -+ * @bfqd: the bfq_data for the device this group acts upon.
5876 -+ * @async_bfqq: array of async queues for all the tasks belonging to
5877 -+ * the group, one queue per ioprio value per ioprio_class,
5878 -+ * except for the idle class that has only one queue.
5879 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
5880 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
5881 -+ * to avoid too many special cases during group creation/migration.
5882 -+ *
5883 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
5884 -+ * there is a set of bfq_groups, each one collecting the lower-level
5885 -+ * entities belonging to the group that are acting on the same device.
5886 -+ *
5887 -+ * Locking works as follows:
5888 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
5889 -+ * via RCU from its readers.
5890 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
5891 -+ * from the readers.
5892 -+ * o All the other fields are protected by the @bfqd queue lock.
5893 -+ */
5894 -+struct bfq_group {
5895 -+ struct bfq_entity entity;
5896 -+ struct bfq_sched_data sched_data;
5897 -+
5898 -+ struct hlist_node group_node;
5899 -+ struct hlist_node bfqd_node;
5900 -+
5901 -+ void *bfqd;
5902 -+
5903 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5904 -+ struct bfq_queue *async_idle_bfqq;
5905 -+
5906 -+ struct bfq_entity *my_entity;
5907 -+};
5908 -+
5909 -+/**
5910 -+ * struct bfqio_cgroup - bfq cgroup data structure.
5911 -+ * @css: subsystem state for bfq in the containing cgroup.
5912 -+ * @weight: cgroup weight.
5913 -+ * @ioprio: cgroup ioprio.
5914 -+ * @ioprio_class: cgroup ioprio_class.
5915 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
5916 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
5917 -+ *
5918 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
5919 -+ * @ioprio and @ioprio_class are protected by @lock.
5920 -+ */
5921 -+struct bfqio_cgroup {
5922 -+ struct cgroup_subsys_state css;
5923 -+
5924 -+ unsigned short weight, ioprio, ioprio_class;
5925 -+
5926 -+ spinlock_t lock;
5927 -+ struct hlist_head group_data;
5928 -+};
5929 -+#else
5930 -+struct bfq_group {
5931 -+ struct bfq_sched_data sched_data;
5932 -+
5933 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
5934 -+ struct bfq_queue *async_idle_bfqq;
5935 -+};
5936 -+#endif
5937 -+
5938 -+static inline struct bfq_service_tree *
5939 -+bfq_entity_service_tree(struct bfq_entity *entity)
5940 -+{
5941 -+ struct bfq_sched_data *sched_data = entity->sched_data;
5942 -+ unsigned int idx = entity->ioprio_class - 1;
5943 -+
5944 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
5945 -+ BUG_ON(sched_data == NULL);
5946 -+
5947 -+ return sched_data->service_tree + idx;
5948 -+}
5949 -+
5950 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
5951 -+ int is_sync)
5952 -+{
5953 -+ return bic->bfqq[!!is_sync];
5954 -+}
5955 -+
5956 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
5957 -+ struct bfq_queue *bfqq, int is_sync)
5958 -+{
5959 -+ bic->bfqq[!!is_sync] = bfqq;
5960 -+}
5961 -+
5962 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
5963 -+{
5964 -+ return bic->icq.q->elevator->elevator_data;
5965 -+}
5966 -+
5967 -+/**
5968 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
5969 -+ * @ptr: a pointer to a bfqd.
5970 -+ * @flags: storage for the flags to be saved.
5971 -+ *
5972 -+ * This function allows bfqg->bfqd to be protected by the
5973 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
5974 -+ * under RCU, so the storage for bfqd is assured to be safe as long
5975 -+ * as the RCU read side critical section does not end. After the
5976 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
5977 -+ * sure that no other writer accessed it. If we raced with a writer,
5978 -+ * the function returns NULL, with the queue unlocked, otherwise it
5979 -+ * returns the dereferenced pointer, with the queue locked.
5980 -+ */
5981 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
5982 -+ unsigned long *flags)
5983 -+{
5984 -+ struct bfq_data *bfqd;
5985 -+
5986 -+ rcu_read_lock();
5987 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
5988 -+
5989 -+ if (bfqd != NULL) {
5990 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
5991 -+ if (*ptr == bfqd)
5992 -+ goto out;
5993 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
5994 -+ }
5995 -+
5996 -+ bfqd = NULL;
5997 -+out:
5998 -+ rcu_read_unlock();
5999 -+ return bfqd;
6000 -+}
6001 -+
6002 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6003 -+ unsigned long *flags)
6004 -+{
6005 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6006 -+}
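The intended calling pattern for the two helpers above is, in a hedged sketch (the caller name and the work done under the lock are illustrative):

/* Illustrative caller: look up the bfqd behind an RCU-protected pointer. */
static void sketch_with_bfqd_locked(void **bfqd_ptr)
{
	unsigned long flags;
	struct bfq_data *bfqd = bfq_get_bfqd_locked(bfqd_ptr, &flags);

	if (bfqd == NULL)
		return;		/* raced with a writer; queue left unlocked */

	/* ... operate on bfqd with bfqd->queue->queue_lock held ... */

	bfq_put_bfqd_unlock(bfqd, &flags);
}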
6007 -+
6008 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6009 -+static void bfq_put_queue(struct bfq_queue *bfqq);
6010 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6011 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6012 -+ struct bfq_group *bfqg, int is_sync,
6013 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
6014 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6015 -+ struct bfq_group *bfqg);
6016 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6017 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6018 -+#endif
6019 ---
6020 -1.8.1.4
6021 -
6022
6023 Deleted: genpatches-2.6/trunk/3.12/5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
6024 ===================================================================
6025 --- genpatches-2.6/trunk/3.12/5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2013-11-04 00:52:35 UTC (rev 2564)
6026 +++ genpatches-2.6/trunk/3.12/5000-BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2013-11-04 10:09:31 UTC (rev 2565)
6027 @@ -1,1049 +0,0 @@
6028 -From 9acaa783ecab69925d38c6aca7252ff565a093d0 Mon Sep 17 00:00:00 2001
6029 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
6030 -Date: Fri, 14 Jun 2013 13:46:47 +0200
6031 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for
6032 - 3.11.0
6033 -
6034 -A set of processes may happen to perform interleaved reads, i.e., requests
6035 -whose union would give rise to a sequential read pattern. There are two
6036 -typical cases: in the first case, processes read fixed-size chunks of
6037 -data at a fixed distance from each other, while in the second case processes
6038 -may read variable-size chunks at variable distances. The latter case occurs
6039 -for example with KVM, which splits the I/O generated by the guest into
6040 -multiple chunks, and lets these chunks be served by a pool of cooperating
6041 -processes, iteratively assigning the next chunk of I/O to the first
6042 -available process. CFQ uses actual queue merging for the first type of
6043 -processes, whereas it uses preemption to get a sequential read pattern out
6044 -of the read requests performed by the second type of processes. In the end
6045 -it uses two different mechanisms to achieve the same goal: boosting the
6046 -throughput with interleaved I/O.
6047 -
6048 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
6049 -sequential read pattern with both types of processes. The main idea is
6050 -checking newly arrived requests against the next request of the active queue
6051 -both in case of actual request insert and in case of request merge. By doing
6052 -so, both the types of processes can be handled by just merging their queues.
6053 -EQM is then simpler and more compact than the pair of mechanisms used in
6054 -CFQ.
6055 -
6056 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
6057 -properly restoring the weight-raising state of a queue when it gets back to
6058 -a non-merged state.
6059 -
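The first case described above, fixed-size chunks read at a fixed distance, can be pictured with a small userspace sketch, purely illustrative and not part of the patch: N readers each read every N-th chunk of the same file, so the union of their requests forms one sequential stream.

/* Illustrative only: readers whose requests interleave into a sequential
 * pattern, the kind of workload EQM is meant to detect and merge. */
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

#define CHUNK	(128 * 1024)

static void interleaved_reader(const char *path, int reader, int nr_readers)
{
	static char buf[CHUNK];
	int fd = open(path, O_RDONLY);
	off_t off = (off_t)reader * CHUNK;

	if (fd < 0)
		return;
	/* reader 0 reads chunks 0, N, 2N, ...; reader 1 reads 1, N+1, ... */
	while (pread(fd, buf, CHUNK, off) == CHUNK)
		off += (off_t)nr_readers * CHUNK;
	close(fd);
}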
6060 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
6061 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6062 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
6063 ----
6064 - block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++----------------
6065 - block/bfq-sched.c | 28 ---
6066 - block/bfq.h | 16 ++
6067 - 3 files changed, 466 insertions(+), 231 deletions(-)
6068 -
6069 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
6070 -index 0ed2746..bbe79fb 100644
6071 ---- a/block/bfq-iosched.c
6072 -+++ b/block/bfq-iosched.c
6073 -@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
6074 - return dur;
6075 - }
6076 -
6077 -+static inline void
6078 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
6079 -+{
6080 -+ if (bic->saved_idle_window)
6081 -+ bfq_mark_bfqq_idle_window(bfqq);
6082 -+ else
6083 -+ bfq_clear_bfqq_idle_window(bfqq);
6084 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
6085 -+ /*
6086 -+ * Start a weight raising period with the duration given by
6087 -+ * the raising_time_left snapshot.
6088 -+ */
6089 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
6090 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
6091 -+ bfqq->last_rais_start_finish = jiffies;
6092 -+ }
6093 -+ /*
6094 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
6095 -+ * getting confused about the queue's need for a weight-raising
6096 -+ * period.
6097 -+ */
6098 -+ bic->raising_time_left = 0;
6099 -+}
6100 -+
6101 -+/*
6102 -+ * Must be called with the queue_lock held.
6103 -+ */
6104 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
6105 -+{
6106 -+ int process_refs, io_refs;
6107 -+
6108 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
6109 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
6110 -+ BUG_ON(process_refs < 0);
6111 -+ return process_refs;
6112 -+}
6113 -+
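As a quick check of the arithmetic above: with atomic ref = 5, allocated[READ] + allocated[WRITE] = 3 and entity.on_st = 1, the queue retains 5 - 3 - 1 = 1 process reference, i.e. exactly one process still owns it.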
6114 - static void bfq_add_rq_rb(struct request *rq)
6115 - {
6116 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
6117 -@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq)
6118 - if (! bfqd->low_latency)
6119 - goto add_bfqq_busy;
6120 -
6121 -+ if (bfq_bfqq_just_split(bfqq))
6122 -+ goto set_ioprio_changed;
6123 -+
6124 - /*
6125 -- * If the queue is not being boosted and has been idle
6126 -- * for enough time, start a weight-raising period
6127 -+ * If the queue:
6128 -+ * - is not being boosted,
6129 -+ * - has been idle for enough time,
6130 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
6131 -+ * shared by nature, or it is not shared and its
6132 -+ * requests have not been redirected to a shared queue)
6133 -+ * start a weight-raising period.
6134 - */
6135 -- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
6136 -+ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
6137 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
6138 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
6139 - if (idle_for_long_time)
6140 - bfqq->raising_cur_max_time =
6141 -@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq)
6142 - raising_cur_max_time));
6143 - }
6144 - }
6145 -+set_ioprio_changed:
6146 - if (old_raising_coeff != bfqq->raising_coeff)
6147 - entity->ioprio_changed = 1;
6148 - add_bfqq_busy:
6149 -@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
6150 - spin_unlock_irq(bfqd->queue->queue_lock);
6151 - }
6152 -
6153 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
6154 -- struct bio *bio)
6155 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
6156 - {
6157 -- struct bfq_data *bfqd = q->elevator->elevator_data;
6158 -- struct bfq_io_cq *bic;
6159 -- struct bfq_queue *bfqq;
6160 --
6161 -- /*
6162 -- * Disallow merge of a sync bio into an async request.
6163 -- */
6164 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
6165 -- return 0;
6166 --
6167 -- /*
6168 -- * Lookup the bfqq that this bio will be queued with. Allow
6169 -- * merge only if rq is queued there.
6170 -- * Queue lock is held here.
6171 -- */
6172 -- bic = bfq_bic_lookup(bfqd, current->io_context);
6173 -- if (bic == NULL)
6174 -- return 0;
6175 --
6176 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
6177 -- return bfqq == RQ_BFQQ(rq);
6178 --}
6179 --
6180 --static void __bfq_set_active_queue(struct bfq_data *bfqd,
6181 -- struct bfq_queue *bfqq)
6182 --{
6183 -- if (bfqq != NULL) {
6184 -- bfq_mark_bfqq_must_alloc(bfqq);
6185 -- bfq_mark_bfqq_budget_new(bfqq);
6186 -- bfq_clear_bfqq_fifo_expire(bfqq);
6187 --
6188 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
6189 --
6190 -- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
6191 -- bfqq->entity.budget);
6192 -- }
6193 --
6194 -- bfqd->active_queue = bfqq;
6195 --}
6196 --
6197 --/*
6198 -- * Get and set a new active queue for service.
6199 -- */
6200 --static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
6201 -- struct bfq_queue *bfqq)
6202 --{
6203 -- if (!bfqq)
6204 -- bfqq = bfq_get_next_queue(bfqd);
6205 -+ if (request)
6206 -+ return blk_rq_pos(io_struct);
6207 - else
6208 -- bfq_get_next_queue_forced(bfqd, bfqq);
6209 --
6210 -- __bfq_set_active_queue(bfqd, bfqq);
6211 -- return bfqq;
6212 -+ return ((struct bio *)io_struct)->bi_sector;
6213 - }
6214 -
6215 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
6216 -- struct request *rq)
6217 -+static inline sector_t bfq_dist_from(sector_t pos1,
6218 -+ sector_t pos2)
6219 - {
6220 -- if (blk_rq_pos(rq) >= bfqd->last_position)
6221 -- return blk_rq_pos(rq) - bfqd->last_position;
6222 -+ if (pos1 >= pos2)
6223 -+ return pos1 - pos2;
6224 - else
6225 -- return bfqd->last_position - blk_rq_pos(rq);
6226 -+ return pos2 - pos1;
6227 - }
6228 -
6229 --/*
6230 -- * Return true if bfqq has no request pending and rq is close enough to
6231 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
6232 -- * bfqq->next_rq
6233 -- */
6234 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
6235 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
6236 -+ sector_t sector)
6237 - {
6238 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
6239 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
6240 -+ BFQQ_SEEK_THR;
6241 - }
6242 -
6243 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6244 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
6245 - {
6246 - struct rb_root *root = &bfqd->rq_pos_tree;
6247 - struct rb_node *parent, *node;
6248 - struct bfq_queue *__bfqq;
6249 -- sector_t sector = bfqd->last_position;
6250 -
6251 - if (RB_EMPTY_ROOT(root))
6252 - return NULL;
6253 -@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6254 - * position).
6255 - */
6256 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
6257 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
6258 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
6259 - return __bfqq;
6260 -
6261 - if (blk_rq_pos(__bfqq->next_rq) < sector)
6262 -@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6263 - return NULL;
6264 -
6265 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
6266 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
6267 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
6268 - return __bfqq;
6269 -
6270 - return NULL;
6271 -@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
6272 - /*
6273 - * bfqd - obvious
6274 - * cur_bfqq - passed in so that we don't decide that the current queue
6275 -- * is closely cooperating with itself.
6276 -- *
6277 -- * We are assuming that cur_bfqq has dispatched at least one request,
6278 -- * and that bfqd->last_position reflects a position on the disk associated
6279 -- * with the I/O issued by cur_bfqq.
6280 -+ * is closely cooperating with itself
6281 -+ * sector - used as a reference point to search for a close queue
6282 - */
6283 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6284 -- struct bfq_queue *cur_bfqq)
6285 -+ struct bfq_queue *cur_bfqq,
6286 -+ sector_t sector)
6287 - {
6288 - struct bfq_queue *bfqq;
6289 -
6290 -@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6291 - * working closely on the same area of the disk. In that case,
6292 - * we can group them together and don't waste time idling.
6293 - */
6294 -- bfqq = bfqq_close(bfqd);
6295 -+ bfqq = bfqq_close(bfqd, sector);
6296 - if (bfqq == NULL || bfqq == cur_bfqq)
6297 - return NULL;
6298 -
6299 -@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
6300 - return bfqq;
6301 - }
6302 -
6303 -+static struct bfq_queue *
6304 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6305 -+{
6306 -+ int process_refs, new_process_refs;
6307 -+ struct bfq_queue *__bfqq;
6308 -+
6309 -+ /*
6310 -+ * If there are no process references on the new_bfqq, then it is
6311 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
6312 -+ * may have dropped their last reference (not just their last process
6313 -+ * reference).
6314 -+ */
6315 -+ if (!bfqq_process_refs(new_bfqq))
6316 -+ return NULL;
6317 -+
6318 -+ /* Avoid a circular list and skip interim queue merges. */
6319 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
6320 -+ if (__bfqq == bfqq)
6321 -+ return NULL;
6322 -+ new_bfqq = __bfqq;
6323 -+ }
6324 -+
6325 -+ process_refs = bfqq_process_refs(bfqq);
6326 -+ new_process_refs = bfqq_process_refs(new_bfqq);
6327 -+ /*
6328 -+ * If the process for the bfqq has gone away, there is no
6329 -+ * sense in merging the queues.
6330 -+ */
6331 -+ if (process_refs == 0 || new_process_refs == 0)
6332 -+ return NULL;
6333 -+
6334 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
6335 -+ new_bfqq->pid);
6336 -+
6337 -+ /*
6338 -+ * Merging is just a redirection: the requests of the process owning
6339 -+ * one of the two queues are redirected to the other queue. The latter
6340 -+ * queue, in its turn, is set as shared if this is the first time that
6341 -+ * the requests of some process are redirected to it.
6342 -+ *
6343 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
6344 -+ * are in the context of the process owning bfqq, hence we have the
6345 -+ * io_cq of this process. So we can immediately configure this io_cq
6346 -+ * to redirect the requests of the process to new_bfqq.
6347 -+ *
6348 -+ * NOTE: even if new_bfqq coincides with the active queue, the io_cq of
6349 -+ * new_bfqq is not available, because, if the active queue is shared,
6350 -+ * bfqd->active_bic may not point to the io_cq of the active queue.
6351 -+ * Redirecting the requests of the process owning bfqq to the currently
6352 -+ * active queue is in any case the best option, as we feed the active queue
6353 -+ * with new requests close to the last request served and, by doing so,
6354 -+ * hopefully increase the throughput.
6355 -+ */
6356 -+ bfqq->new_bfqq = new_bfqq;
6357 -+ atomic_add(process_refs, &new_bfqq->ref);
6358 -+ return new_bfqq;
6359 -+}
6360 -+
6361 -+/*
6362 -+ * Attempt to schedule a merge of bfqq with the currently active queue or
6363 -+ * with a close queue among the scheduled queues.
6364 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
6365 -+ * structure otherwise.
6366 -+ */
6367 -+static struct bfq_queue *
6368 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6369 -+ void *io_struct, bool request)
6370 -+{
6371 -+ struct bfq_queue *active_bfqq, *new_bfqq;
6372 -+
6373 -+ if (bfqq->new_bfqq)
6374 -+ return bfqq->new_bfqq;
6375 -+
6376 -+ if (!io_struct)
6377 -+ return NULL;
6378 -+
6379 -+ active_bfqq = bfqd->active_queue;
6380 -+
6381 -+ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic)
6382 -+ goto check_scheduled;
6383 -+
6384 -+ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq))
6385 -+ goto check_scheduled;
6386 -+
6387 -+ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq))
6388 -+ goto check_scheduled;
6389 -+
6390 -+ if (active_bfqq->entity.parent != bfqq->entity.parent)
6391 -+ goto check_scheduled;
6392 -+
6393 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
6394 -+ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq))
6395 -+ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq)))
6396 -+ return new_bfqq; /* Merge with the active queue */
6397 -+
6398 -+ /*
6399 -+ * Check whether there is a cooperator among currently scheduled
6400 -+ * queues. The only thing we need is that the bio/request is not
6401 -+ * NULL, as we need it to establish whether a cooperator exists.
6402 -+ */
6403 -+check_scheduled:
6404 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
6405 -+ bfq_io_struct_pos(io_struct, request));
6406 -+ if (new_bfqq)
6407 -+ return bfq_setup_merge(bfqq, new_bfqq);
6408 -+
6409 -+ return NULL;
6410 -+}
6411 -+
6412 -+static inline void
6413 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
6414 -+{
6415 -+ /*
6416 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
6417 -+ * have already been redirected to a shared queue; both idle window
6418 -+ * and weight raising state have already been saved. Do nothing.
6419 -+ */
6420 -+ if (bfqq->bic == NULL)
6421 -+ return;
6422 -+ if (bfqq->bic->raising_time_left)
6423 -+ /*
6424 -+ * This is the queue of a just-started process, and would
6425 -+ * deserve weight raising: we set raising_time_left to the full
6426 -+ * weight-raising duration to trigger weight-raising when and
6427 -+ * if the queue is split and the first request of the queue
6428 -+ * is enqueued.
6429 -+ */
6430 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
6431 -+ else if (bfqq->raising_coeff > 1) {
6432 -+ unsigned long wrais_duration =
6433 -+ jiffies - bfqq->last_rais_start_finish;
6434 -+ /*
6435 -+ * It may happen that a queue's weight raising period lasts
6436 -+ * longer than its raising_cur_max_time, as weight raising is
6437 -+ * handled only when a request is enqueued or dispatched (it
6438 -+ * does not use any timer). If the weight raising period is
6439 -+ * about to end, don't save it.
6440 -+ */
6441 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
6442 -+ bfqq->bic->raising_time_left = 0;
6443 -+ else
6444 -+ bfqq->bic->raising_time_left =
6445 -+ bfqq->raising_cur_max_time - wrais_duration;
6446 -+ /*
6447 -+ * The bfq_queue is becoming shared or the requests of the
6448 -+ * process owning the queue are being redirected to a shared
6449 -+ * queue. Stop the weight raising period of the queue, as in
6450 -+ * both cases it should not be owned by an interactive or soft
6451 -+ * real-time application.
6452 -+ */
6453 -+ bfq_bfqq_end_raising(bfqq);
6454 -+ } else
6455 -+ bfqq->bic->raising_time_left = 0;
6456 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
6457 -+}
6458 -+
6459 -+static inline void
6460 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
6461 -+{
6462 -+ /*
6463 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
6464 -+ * is about to begin using a shared bfq_queue.
6465 -+ */
6466 -+ if (bfqq->bic)
6467 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
6468 -+}
6469 -+
6470 -+static void
6471 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
6472 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6473 -+{
6474 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
6475 -+ (long unsigned)new_bfqq->pid);
6476 -+ /* Save weight raising and idle window of the merged queues */
6477 -+ bfq_bfqq_save_state(bfqq);
6478 -+ bfq_bfqq_save_state(new_bfqq);
6479 -+ /*
6480 -+ * Grab a reference to the bic, to prevent it from being destroyed
6481 -+ * before being possibly touched by a bfq_split_bfqq().
6482 -+ */
6483 -+ bfq_get_bic_reference(bfqq);
6484 -+ bfq_get_bic_reference(new_bfqq);
6485 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
6486 -+ bic_set_bfqq(bic, new_bfqq, 1);
6487 -+ bfq_mark_bfqq_coop(new_bfqq);
6488 -+ /*
6489 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
6490 -+ * new_bfqq->bic to NULL. bfqq either:
6491 -+ * - does not belong to any bic any more, and hence bfqq->bic must
6492 -+ * be set to NULL, or
6493 -+ * - is a queue whose owning bics have already been redirected to a
6494 -+ * different queue, hence the queue is destined to not belong to any
6495 -+ * bic soon and bfqq->bic is already NULL (therefore the next
6496 -+ * assignment causes no harm).
6497 -+ */
6498 -+ new_bfqq->bic = NULL;
6499 -+ bfqq->bic = NULL;
6500 -+ bfq_put_queue(bfqq);
6501 -+}
6502 -+
6503 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
6504 -+ struct bio *bio)
6505 -+{
6506 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
6507 -+ struct bfq_io_cq *bic;
6508 -+ struct bfq_queue *bfqq, *new_bfqq;
6509 -+
6510 -+ /*
6511 -+ * Disallow merge of a sync bio into an async request.
6512 -+ */
6513 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
6514 -+ return 0;
6515 -+
6516 -+ /*
6517 -+ * Lookup the bfqq that this bio will be queued with. Allow
6518 -+ * merge only if rq is queued there.
6519 -+ * Queue lock is held here.
6520 -+ */
6521 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
6522 -+ if (bic == NULL)
6523 -+ return 0;
6524 -+
6525 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
6526 -+ /*
6527 -+ * We take advantage of this function to perform an early merge
6528 -+ * of the queues of possible cooperating processes.
6529 -+ */
6530 -+ if (bfqq != NULL &&
6531 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) {
6532 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
6533 -+ /*
6534 -+ * If we get here, the bio will be queued in the shared queue,
6535 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
6536 -+ * rq can be merged.
6537 -+ */
6538 -+ bfqq = new_bfqq;
6539 -+ }
6540 -+
6541 -+ return bfqq == RQ_BFQQ(rq);
6542 -+}
6543 -+
6544 -+static void __bfq_set_active_queue(struct bfq_data *bfqd,
6545 -+ struct bfq_queue *bfqq)
6546 -+{
6547 -+ if (bfqq != NULL) {
6548 -+ bfq_mark_bfqq_must_alloc(bfqq);
6549 -+ bfq_mark_bfqq_budget_new(bfqq);
6550 -+ bfq_clear_bfqq_fifo_expire(bfqq);
6551 -+
6552 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
6553 -+
6554 -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
6555 -+ bfqq->entity.budget);
6556 -+ }
6557 -+
6558 -+ bfqd->active_queue = bfqq;
6559 -+}
6560 -+
6561 -+/*
6562 -+ * Get and set a new active queue for service.
6563 -+ */
6564 -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
6565 -+{
6566 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
6567 -+
6568 -+ __bfq_set_active_queue(bfqd, bfqq);
6569 -+ return bfqq;
6570 -+}
6571 -+
6572 - /*
6573 - * If enough samples have been computed, return the current max budget
6574 - * stored in bfqd, which is dynamically updated according to the
6575 -@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
6576 - return rq;
6577 - }
6578 -
6579 --/*
6580 -- * Must be called with the queue_lock held.
6581 -- */
6582 --static int bfqq_process_refs(struct bfq_queue *bfqq)
6583 --{
6584 -- int process_refs, io_refs;
6585 --
6586 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
6587 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
6588 -- BUG_ON(process_refs < 0);
6589 -- return process_refs;
6590 --}
6591 --
6592 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
6593 --{
6594 -- int process_refs, new_process_refs;
6595 -- struct bfq_queue *__bfqq;
6596 --
6597 -- /*
6598 -- * If there are no process references on the new_bfqq, then it is
6599 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
6600 -- * may have dropped their last reference (not just their last process
6601 -- * reference).
6602 -- */
6603 -- if (!bfqq_process_refs(new_bfqq))
6604 -- return;
6605 --
6606 -- /* Avoid a circular list and skip interim queue merges. */
6607 -- while ((__bfqq = new_bfqq->new_bfqq)) {
6608 -- if (__bfqq == bfqq)
6609 -- return;
6610 -- new_bfqq = __bfqq;
6611 -- }
6612 --
6613 -- process_refs = bfqq_process_refs(bfqq);
6614 -- new_process_refs = bfqq_process_refs(new_bfqq);
6615 -- /*
6616 -- * If the process for the bfqq has gone away, there is no
6617 -- * sense in merging the queues.
6618 -- */
6619 -- if (process_refs == 0 || new_process_refs == 0)
6620 -- return;
6621 --
6622 -- /*
6623 -- * Merge in the direction of the lesser amount of work.
6624 -- */
6625 -- if (new_process_refs >= process_refs) {
6626 -- bfqq->new_bfqq = new_bfqq;
6627 -- atomic_add(process_refs, &new_bfqq->ref);
6628 -- } else {
6629 -- new_bfqq->new_bfqq = bfqq;
6630 -- atomic_add(new_process_refs, &bfqq->ref);
6631 -- }
6632 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
6633 -- new_bfqq->pid);
6634 --}
6635 --
6636 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
6637 - {
6638 - struct bfq_entity *entity = &bfqq->entity;
6639 -@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
6640 - * is likely to boost the disk throughput);
6641 - * - the queue is weight-raised (waiting for the request is necessary for
6642 - * providing the queue with fairness and latency guarantees).
6643 -+ *
6644 -+ * In any case, idling can be disabled for cooperation issues, if
6645 -+ * 1) there is a close cooperator for the queue, or
6646 -+ * 2) the queue is shared and some cooperator is likely to be idle (in this
6647 -+ * case, by not arming the idle timer, we try to slow down the queue, to
6648 -+ * prevent the zones of the disk accessed by the active cooperators to
6649 -+ * become too distant from the zone that will be accessed by the currently
6650 -+ * idle cooperators).
6651 - */
6652 - static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6653 - int budg_timeout)
6654 -@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6655 - (bfqd->rq_in_driver == 0 ||
6656 - budg_timeout ||
6657 - bfqq->raising_coeff > 1) &&
6658 -- !bfq_close_cooperator(bfqd, bfqq) &&
6659 -+ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) &&
6660 - (!bfq_bfqq_coop(bfqq) ||
6661 - !bfq_bfqq_some_coop_idle(bfqq)) &&
6662 - !bfq_queue_nonrot_noidle(bfqd, bfqq));
6663 -@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
6664 - */
6665 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6666 - {
6667 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
6668 -+ struct bfq_queue *bfqq;
6669 - struct request *next_rq;
6670 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
6671 - int budg_timeout;
6672 -@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6673 -
6674 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
6675 -
6676 -- /*
6677 -- * If another queue has a request waiting within our mean seek
6678 -- * distance, let it run. The expire code will check for close
6679 -- * cooperators and put the close queue at the front of the
6680 -- * service tree. If possible, merge the expiring queue with the
6681 -- * new bfqq.
6682 -- */
6683 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
6684 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
6685 -- bfq_setup_merge(bfqq, new_bfqq);
6686 --
6687 - budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
6688 - if (budg_timeout &&
6689 - !bfq_bfqq_must_idle(bfqq, budg_timeout))
6690 -@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6691 - bfq_clear_bfqq_wait_request(bfqq);
6692 - del_timer(&bfqd->idle_slice_timer);
6693 - }
6694 -- if (new_bfqq == NULL)
6695 -- goto keep_queue;
6696 -- else
6697 -- goto expire;
6698 -+ goto keep_queue;
6699 - }
6700 - }
6701 -
6702 -@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
6703 - * queue still has requests in flight or is idling for a new request,
6704 - * then keep it.
6705 - */
6706 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
6707 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
6708 - (bfqq->dispatched != 0 &&
6709 - (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
6710 -- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
6711 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
6712 - bfqq = NULL;
6713 - goto keep_queue;
6714 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
6715 -- /*
6716 -- * Expiring the queue because there is a close cooperator,
6717 -- * cancel timer.
6718 -- */
6719 -- bfq_clear_bfqq_wait_request(bfqq);
6720 -- del_timer(&bfqd->idle_slice_timer);
6721 - }
6722 -
6723 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
6724 - expire:
6725 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
6726 - new_queue:
6727 -- bfqq = bfq_set_active_queue(bfqd, new_bfqq);
6728 -+ bfqq = bfq_set_active_queue(bfqd);
6729 - bfq_log(bfqd, "select_queue: new queue %d returned",
6730 - bfqq != NULL ? bfqq->pid : 0);
6731 - keep_queue:
6732 -@@ -1617,9 +1807,8 @@ keep_queue:
6733 -
6734 - static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6735 - {
6736 -+ struct bfq_entity *entity = &bfqq->entity;
6737 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
6738 -- struct bfq_entity *entity = &bfqq->entity;
6739 --
6740 - bfq_log_bfqq(bfqd, bfqq,
6741 - "raising period dur %u/%u msec, "
6742 - "old raising coeff %u, w %d(%d)",
6743 -@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
6744 - jiffies_to_msecs(bfqq->
6745 - raising_cur_max_time));
6746 - bfq_bfqq_end_raising(bfqq);
6747 -- __bfq_entity_update_weight_prio(
6748 -- bfq_entity_service_tree(entity),
6749 -- entity);
6750 - }
6751 - }
6752 - }
6753 -+ /* Update weight both if it must be raised and if it must be lowered */
6754 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
6755 -+ __bfq_entity_update_weight_prio(
6756 -+ bfq_entity_service_tree(entity),
6757 -+ entity);
6758 - }
6759 -
6760 - /*
6761 -@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq)
6762 - struct bfq_io_cq *bic = icq_to_bic(icq);
6763 -
6764 - bic->ttime.last_end_request = jiffies;
6765 -+ /*
6766 -+ * A newly created bic indicates that the process has just
6767 -+ * started doing I/O, and is probably mapping into memory its
6768 -+ * executable and libraries: it definitely needs weight raising.
6769 -+ * There is however the possibility that the process performs,
6770 -+ * for a while, I/O close to some other process. EQM intercepts
6771 -+ * this behavior and may merge the queue corresponding to the
6772 -+ * process with some other queue, BEFORE the weight of the queue
6773 -+ * is raised. Merged queues are not weight-raised (they are assumed
6774 -+ * to belong to processes that benefit only from high throughput).
6775 -+ * If the merge is basically the consequence of an accident, then
6776 -+ * the queue will be split soon and will get back its old weight.
6777 -+ * It is then important to write down somewhere that this queue
6778 -+ * does need weight raising, even if it did not make it to get its
6779 -+ * weight raised before being merged. To this purpose, we overload
6780 -+ * the field raising_time_left and assign 1 to it, to mark the queue
6781 -+ * as needing weight raising.
6782 -+ */
6783 -+ bic->raising_time_left = 1;
6784 - }
6785 -
6786 - static void bfq_exit_icq(struct io_cq *icq)
6787 -@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq)
6788 - }
6789 -
6790 - if (bic->bfqq[BLK_RW_SYNC]) {
6791 -+ /*
6792 -+ * If the bic is using a shared queue, put the reference
6793 -+ * taken on the io_context when the bic started using a
6794 -+ * shared bfq_queue.
6795 -+ */
6796 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
6797 -+ put_io_context(icq->ioc);
6798 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
6799 - bic->bfqq[BLK_RW_SYNC] = NULL;
6800 - }
6801 -@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
6802 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
6803 - return;
6804 -
6805 -+ /* Idle window just restored, statistics are meaningless. */
6806 -+ if (bfq_bfqq_just_split(bfqq))
6807 -+ return;
6808 -+
6809 - enable_idle = bfq_bfqq_idle_window(bfqq);
6810 -
6811 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
6812 -@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6813 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
6814 - !BFQQ_SEEKY(bfqq))
6815 - bfq_update_idle_window(bfqd, bfqq, bic);
6816 -+ bfq_clear_bfqq_just_split(bfqq);
6817 -
6818 - bfq_log_bfqq(bfqd, bfqq,
6819 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
6820 -@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6821 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
6822 - {
6823 - struct bfq_data *bfqd = q->elevator->elevator_data;
6824 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
6825 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
6826 -
6827 - assert_spin_locked(bfqd->queue->queue_lock);
6828 -+
6829 -+ /*
6830 -+ * An unplug may trigger a requeue of a request from the device
6831 -+ * driver: make sure we are in process context while trying to
6832 -+ * merge two bfq_queues.
6833 -+ */
6834 -+ if (!in_interrupt() &&
6835 -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) {
6836 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
6837 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
6838 -+ /*
6839 -+ * Release the request's reference to the old bfqq
6840 -+ * and make sure one is taken to the shared queue.
6841 -+ */
6842 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
6843 -+ bfqq->allocated[rq_data_dir(rq)]--;
6844 -+ atomic_inc(&new_bfqq->ref);
6845 -+ bfq_put_queue(bfqq);
6846 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
6847 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq);
6848 -+ rq->elv.priv[1] = new_bfqq;
6849 -+ bfqq = new_bfqq;
6850 -+ }
6851 -+
6852 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
6853 -
6854 - bfq_add_rq_rb(rq);
6855 -
6856 -+ /*
6857 -+ * Here a newly-created bfq_queue has already started a weight-raising
6858 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
6859 -+ * from assigning it a full weight-raising period. See the detailed
6860 -+ * comments about this field in bfq_init_icq().
6861 -+ */
6862 -+ if (bfqq->bic != NULL)
6863 -+ bfqq->bic->raising_time_left = 0;
6864 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
6865 - list_add_tail(&rq->queuelist, &bfqq->fifo);
6866 -
6867 -@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
6868 - if (bfq_bfqq_budget_new(bfqq))
6869 - bfq_set_budget_timeout(bfqd);
6870 -
6871 -- /* Idling is disabled also for cooperation issues:
6872 -- * 1) there is a close cooperator for the queue, or
6873 -- * 2) the queue is shared and some cooperator is likely
6874 -- * to be idle (in this case, by not arming the idle timer,
6875 -- * we try to slow down the queue, to prevent the zones
6876 -- * of the disk accessed by the active cooperators to become
6877 -- * too distant from the zone that will be accessed by the
6878 -- * currently idle cooperators)
6879 -- */
6880 - if (bfq_bfqq_must_idle(bfqq, budg_timeout))
6881 - bfq_arm_slice_timer(bfqd);
6882 - else if (budg_timeout)
6883 -@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq)
6884 - }
6885 - }
6886 -
6887 --static struct bfq_queue *
6888 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
6889 -- struct bfq_queue *bfqq)
6890 --{
6891 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
6892 -- (long unsigned)bfqq->new_bfqq->pid);
6893 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
6894 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
6895 -- bfq_put_queue(bfqq);
6896 -- return bic_to_bfqq(bic, 1);
6897 --}
6898 --
6899 - /*
6900 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
6901 - * was the last process referring to said bfqq.
6902 -@@ -2469,6 +2702,9 @@ static struct bfq_queue *
6903 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
6904 - {
6905 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
6906 -+
6907 -+ put_io_context(bic->icq.ioc);
6908 -+
6909 - if (bfqq_process_refs(bfqq) == 1) {
6910 - bfqq->pid = current->pid;
6911 - bfq_clear_bfqq_some_coop_idle(bfqq);
6912 -@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
6913 - struct bfq_queue *bfqq;
6914 - struct bfq_group *bfqg;
6915 - unsigned long flags;
6916 -+ bool split = false;
6917 -
6918 - might_sleep_if(gfp_mask & __GFP_WAIT);
6919 -
6920 -@@ -2516,24 +2753,14 @@ new_queue:
6921 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
6922 - bic_set_bfqq(bic, bfqq, is_sync);
6923 - } else {
6924 -- /*
6925 -- * If the queue was seeky for too long, break it apart.
6926 -- */
6927 -+ /* If the queue was seeky for too long, break it apart. */
6928 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
6929 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
6930 - bfqq = bfq_split_bfqq(bic, bfqq);
6931 -+ split = true;
6932 - if (!bfqq)
6933 - goto new_queue;
6934 - }
6935 --
6936 -- /*
6937 -- * Check to see if this queue is scheduled to merge with
6938 -- * another closely cooperating queue. The merging of queues
6939 -- * happens here as it must be done in process context.
6940 -- * The reference on new_bfqq was taken in merge_bfqqs.
6941 -- */
6942 -- if (bfqq->new_bfqq != NULL)
6943 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
6944 - }
6945 -
6946 - bfqq->allocated[rw]++;
6947 -@@ -2544,6 +2771,26 @@ new_queue:
6948 - rq->elv.priv[0] = bic;
6949 - rq->elv.priv[1] = bfqq;
6950 -
6951 -+ /*
6952 -+ * If a bfq_queue has only one process reference, it is owned
6953 -+ * by only one bfq_io_cq: we can set the bic field of the
6954 -+ * bfq_queue to the address of that structure. Also, if the
6955 -+ * queue has just been split, mark a flag so that the
6956 -+ * information is available to the other scheduler hooks.
6957 -+ */
6958 -+ if (bfqq_process_refs(bfqq) == 1) {
6959 -+ bfqq->bic = bic;
6960 -+ if (split) {
6961 -+ bfq_mark_bfqq_just_split(bfqq);
6962 -+ /*
6963 -+ * If the queue has just been split from a shared queue,
6964 -+ * restore the idle window and the possible weight
6965 -+ * raising period.
6966 -+ */
6967 -+ bfq_bfqq_resume_state(bfqq, bic);
6968 -+ }
6969 -+ }
6970 -+
6971 - spin_unlock_irqrestore(q->queue_lock, flags);
6972 -
6973 - return 0;
6974 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
6975 -index 03f8061..a0edaa2 100644
6976 ---- a/block/bfq-sched.c
6977 -+++ b/block/bfq-sched.c
6978 -@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
6979 - return bfqq;
6980 - }
6981 -
6982 --/*
6983 -- * Forced extraction of the given queue.
6984 -- */
6985 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
6986 -- struct bfq_queue *bfqq)
6987 --{
6988 -- struct bfq_entity *entity;
6989 -- struct bfq_sched_data *sd;
6990 --
6991 -- BUG_ON(bfqd->active_queue != NULL);
6992 --
6993 -- entity = &bfqq->entity;
6994 -- /*
6995 -- * Bubble up extraction/update from the leaf to the root.
6996 -- */
6997 -- for_each_entity(entity) {
6998 -- sd = entity->sched_data;
6999 -- bfq_update_budget(entity);
7000 -- bfq_update_vtime(bfq_entity_service_tree(entity));
7001 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
7002 -- sd->active_entity = entity;
7003 -- sd->next_active = NULL;
7004 -- entity->service = 0;
7005 -- }
7006 --
7007 -- return;
7008 --}
7009 --
7010 - static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
7011 - {
7012 - if (bfqd->active_bic != NULL) {
7013 -diff --git a/block/bfq.h b/block/bfq.h
7014 -index 48ecde9..bb52975 100644
7015 ---- a/block/bfq.h
7016 -+++ b/block/bfq.h
7017 -@@ -188,6 +188,8 @@ struct bfq_group;
7018 - * @pid: pid of the process owning the queue, used for logging purposes.
7019 - * @last_rais_start_time: last (idle -> weight-raised) transition attempt
7020 - * @raising_cur_max_time: current max raising time for this queue
7021 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
7022 -+ * queue is shared
7023 - *
7024 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
7025 - * or more (if it is an async one). @cgroup holds a reference to the
7026 -@@ -231,6 +233,7 @@ struct bfq_queue {
7027 - sector_t last_request_pos;
7028 -
7029 - pid_t pid;
7030 -+ struct bfq_io_cq *bic;
7031 -
7032 - /* weight-raising fields */
7033 - unsigned int raising_cur_max_time;
7034 -@@ -257,12 +260,23 @@ struct bfq_ttime {
7035 - * @icq: associated io_cq structure
7036 - * @bfqq: array of two process queues, the sync and the async
7037 - * @ttime: associated @bfq_ttime struct
7038 -+ * @raising_time_left: snapshot of the time left before weight raising ends
7039 -+ * for the sync queue associated to this process; this
7040 -+ * snapshot is taken to remember this value while the weight
7041 -+ * raising is suspended because the queue is merged with a
7042 -+ * shared queue, and is used to set @raising_cur_max_time
7043 -+ * when the queue is split from the shared queue and its
7044 -+ * weight is raised again
7045 -+ * @saved_idle_window: same purpose as the previous field for the idle window
7046 - */
7047 - struct bfq_io_cq {
7048 - struct io_cq icq; /* must be the first member */
7049 - struct bfq_queue *bfqq[2];
7050 - struct bfq_ttime ttime;
7051 - int ioprio;
7052 -+
7053 -+ unsigned int raising_time_left;
7054 -+ unsigned int saved_idle_window;
7055 - };
7056 -
7057 - /**
7058 -@@ -403,6 +417,7 @@ enum bfqq_state_flags {
7059 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
7060 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
7061 - BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
7062 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
7063 - };
7064 -
7065 - #define BFQ_BFQQ_FNS(name) \
7066 -@@ -430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new);
7067 - BFQ_BFQQ_FNS(coop);
7068 - BFQ_BFQQ_FNS(split_coop);
7069 - BFQ_BFQQ_FNS(some_coop_idle);
7070 -+BFQ_BFQQ_FNS(just_split);
7071 - #undef BFQ_BFQQ_FNS
7072 -
7073 - /* Logging facilities. */
7074 ---
7075 -1.8.1.4
7076 -
7077
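A note for readers skimming the queue-merging (EQM) hunks above, which belong to the old dash-named BFQ patch file being replaced by the underscore-named copies added below: the new bfq_setup_merge() always redirects bfqq towards the final target of the ->new_bfqq chain, instead of merging in the direction of the lesser amount of work as the deleted helper did. The following is a standalone, simplified C sketch of that chain walk; the struct and field names are illustrative stand-ins, not the kernel's bfq_queue.

/*
 * Simplified sketch of the merge scheduling performed by bfq_setup_merge()
 * in the hunks above: follow the ->new_bfqq chain to its final target,
 * refusing to merge on a cycle or when either queue has lost all of its
 * process references.
 */
#include <stddef.h>

struct toy_queue {
        int refs;                    /* total references held on the queue */
        int process_refs;            /* references held by processes doing I/O */
        struct toy_queue *new_bfqq;  /* queue this one is scheduled to merge into */
};

static struct toy_queue *schedule_merge(struct toy_queue *bfqq,
                                        struct toy_queue *new_bfqq)
{
        struct toy_queue *next;

        /* A target without process references may already be going away. */
        if (new_bfqq->process_refs == 0)
                return NULL;

        /* Skip interim merges; give up if the chain loops back to bfqq. */
        while ((next = new_bfqq->new_bfqq) != NULL) {
                if (next == bfqq)
                        return NULL;
                new_bfqq = next;
        }

        /* If either process has gone away, merging makes no sense. */
        if (bfqq->process_refs == 0 || new_bfqq->process_refs == 0)
                return NULL;

        /* Redirect bfqq and take one reference per redirected process. */
        bfqq->new_bfqq = new_bfqq;
        new_bfqq->refs += bfqq->process_refs;
        return new_bfqq;
}

The actual redirection of the process's bic to the shared queue still happens separately, in bfq_merge_bfqqs(), as shown earlier in the same hunks.
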
7078 Added: genpatches-2.6/trunk/3.12/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch
7079 ===================================================================
7080 --- genpatches-2.6/trunk/3.12/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch (rev 0)
7081 +++ genpatches-2.6/trunk/3.12/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.11.patch 2013-11-04 10:09:31 UTC (rev 2565)
7082 @@ -0,0 +1,97 @@
7083 +From 3728677b4d3cd39d83be87f9939328201b871c48 Mon Sep 17 00:00:00 2001
7084 +From: Arianna Avanzini <avanzini.arianna@×××××.com>
7085 +Date: Tue, 3 Sep 2013 16:50:42 +0200
7086 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.11
7087 +
7088 +Update Kconfig.iosched and do the related Makefile changes to include
7089 +kernel configuration options for BFQ. Also add the bfqio controller
7090 +to the cgroups subsystem.
7091 +
7092 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
7093 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
7094 +---
7095 + block/Kconfig.iosched | 25 +++++++++++++++++++++++++
7096 + block/Makefile | 1 +
7097 + include/linux/cgroup_subsys.h | 4 ++++
7098 + 3 files changed, 30 insertions(+)
7099 +
7100 +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
7101 +index 421bef9..695e064 100644
7102 +--- a/block/Kconfig.iosched
7103 ++++ b/block/Kconfig.iosched
7104 +@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
7105 + ---help---
7106 + Enable group IO scheduling in CFQ.
7107 +
7108 ++config IOSCHED_BFQ
7109 ++ tristate "BFQ I/O scheduler"
7110 ++ default n
7111 ++ ---help---
7112 ++ The BFQ I/O scheduler tries to distribute bandwidth among
7113 ++ all processes according to their weights.
7114 ++ It aims at distributing the bandwidth as desired, independently of
7115 ++ the disk parameters and with any workload. It also tries to
7116 ++ guarantee low latency to interactive and soft real-time
7117 ++ applications. If compiled built-in (saying Y here), BFQ can
7118 ++ be configured to support hierarchical scheduling.
7119 ++
7120 ++config CGROUP_BFQIO
7121 ++ bool "BFQ hierarchical scheduling support"
7122 ++ depends on CGROUPS && IOSCHED_BFQ=y
7123 ++ default n
7124 ++ ---help---
7125 ++ Enable hierarchical scheduling in BFQ, using the cgroups
7126 ++ filesystem interface. The name of the subsystem will be
7127 ++ bfqio.
7128 ++
7129 + choice
7130 + prompt "Default I/O scheduler"
7131 + default DEFAULT_CFQ
7132 +@@ -52,6 +73,9 @@ choice
7133 + config DEFAULT_CFQ
7134 + bool "CFQ" if IOSCHED_CFQ=y
7135 +
7136 ++ config DEFAULT_BFQ
7137 ++ bool "BFQ" if IOSCHED_BFQ=y
7138 ++
7139 + config DEFAULT_NOOP
7140 + bool "No-op"
7141 +
7142 +@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED
7143 + string
7144 + default "deadline" if DEFAULT_DEADLINE
7145 + default "cfq" if DEFAULT_CFQ
7146 ++ default "bfq" if DEFAULT_BFQ
7147 + default "noop" if DEFAULT_NOOP
7148 +
7149 + endmenu
7150 +diff --git a/block/Makefile b/block/Makefile
7151 +index 39b76ba..c0d20fa 100644
7152 +--- a/block/Makefile
7153 ++++ b/block/Makefile
7154 +@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
7155 + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
7156 + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
7157 + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
7158 ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
7159 +
7160 + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
7161 + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
7162 +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
7163 +index b613ffd..43c5dc9 100644
7164 +--- a/include/linux/cgroup_subsys.h
7165 ++++ b/include/linux/cgroup_subsys.h
7166 +@@ -39,6 +39,10 @@ SUBSYS(net_cls)
7167 + SUBSYS(blkio)
7168 + #endif
7169 +
7170 ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
7171 ++SUBSYS(bfqio)
7172 ++#endif
7173 ++
7174 + #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
7175 + SUBSYS(perf)
7176 + #endif
7177 +--
7178 +1.8.1.4
7179 +
7180
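The patch above only wires BFQ into the build: the Kconfig options, a Makefile rule, and a SUBSYS(bfqio) entry in cgroup_subsys.h. That entry is an x-macro: the kernel includes cgroup_subsys.h several times with different definitions of SUBSYS() to generate, among other things, the subsystem-id enum that the bfq-cgroup.c code added by the next patch looks up as bfqio_subsys_id. Below is a rough, self-contained C sketch of that pattern; the subsystem list and the surrounding names are illustrative only, not the real kernel headers.

#include <stdio.h>

/* Stand-in for the SUBSYS(...) entries of include/linux/cgroup_subsys.h. */
#define SUBSYS_LIST \
        SUBSYS(blkio)  \
        SUBSYS(bfqio)  /* the entry added above, under CONFIG_CGROUP_BFQIO */ \
        SUBSYS(perf)

/* First expansion: build the subsystem id enum. */
#define SUBSYS(name) name ## _subsys_id,
enum cgroup_subsys_id { SUBSYS_LIST CGROUP_SUBSYS_COUNT };
#undef SUBSYS

/* Second expansion: build a matching table of subsystem names. */
#define SUBSYS(name) #name,
static const char *subsys_names[] = { SUBSYS_LIST };
#undef SUBSYS

int main(void)
{
        printf("%s -> id %d (of %d subsystems)\n",
               subsys_names[bfqio_subsys_id], (int)bfqio_subsys_id,
               (int)CGROUP_SUBSYS_COUNT);
        return 0;
}

As the Kconfig hunk above shows, selecting CONFIG_IOSCHED_BFQ=y together with CONFIG_DEFAULT_BFQ=y makes CONFIG_DEFAULT_IOSCHED resolve to "bfq", while CONFIG_CGROUP_BFQIO additionally exposes the bfqio controller through the cgroups filesystem interface.
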
7181 Added: genpatches-2.6/trunk/3.12/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1
7182 ===================================================================
7183 --- genpatches-2.6/trunk/3.12/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 (rev 0)
7184 +++ genpatches-2.6/trunk/3.12/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.11.patch1 2013-11-04 10:09:31 UTC (rev 2565)
7185 @@ -0,0 +1,5773 @@
7186 +From 009b78bafe1763f71e6bdbb4f536b564a73b7db5 Mon Sep 17 00:00:00 2001
7187 +From: Arianna Avanzini <avanzini.arianna@×××××.com>
7188 +Date: Thu, 9 May 2013 19:10:02 +0200
7189 +Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.11
7190 +
7191 +Add the BFQ-v6r2 I/O scheduler to 3.11.
7192 +The general structure is borrowed from CFQ, as is much of the code. A (bfq_)queue
7193 +is associated with each task doing I/O on a device, and each time a
7194 +scheduling decision has to be made a queue is selected and served until
7195 +it expires.
7196 +
7197 + - Slices are given in the service domain: tasks are assigned
7198 + budgets, measured in number of sectors. Once granted the disk, a task
7199 + must however consume its assigned budget within a configurable
7200 + maximum time (by default, the maximum possible value of the
7201 + budgets is automatically computed to comply with this timeout).
7202 + This allows the desired latency vs "throughput boosting" tradeoff
7203 + to be set.
7204 +
7205 + - Budgets are scheduled according to a variant of WF2Q+, implemented
7206 + using an augmented rb-tree to take eligibility into account while
7207 + preserving an O(log N) overall complexity.
7208 +
7209 + - A low-latency tunable is provided; if enabled, both interactive
7210 + and soft real-time applications are guaranteed very low latency.
7211 +
7212 + - Latency guarantees are preserved also in the presence of NCQ.
7213 +
7214 + - Also with flash-based devices, a high throughput is achieved while
7215 + still preserving latency guarantees.
7216 +
7217 + - Useful features borrowed from CFQ: cooperating-queues merging (with
7218 + some additional optimizations with respect to the original CFQ version),
7219 + static fallback queue for OOM.
7220 +
7221 + - BFQ supports full hierarchical scheduling, exporting a cgroups
7222 + interface. Each node has a full scheduler, so each group can
7223 + be assigned its own ioprio (mapped to a weight, see next point)
7224 + and an ioprio_class.
7225 +
7226 + - If the cgroups interface is used, weights can be explicitly
7227 + assigned, otherwise ioprio values are mapped to weights using the
7228 + relation weight = IOPRIO_BE_NR - ioprio.
7229 +
7230 + - ioprio classes are served in strict priority order, i.e., lower
7231 + priority queues are not served as long as there are higher
7232 + priority queues. Among queues in the same class the bandwidth is
7233 + distributed in proportion to the weight of each queue. A very
7234 + thin extra bandwidth is however guaranteed to the Idle class, to
7235 + prevent it from starving.
7236 +
7237 +Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
7238 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
7239 +---
7240 + block/bfq-cgroup.c | 881 +++++++++++++++
7241 + block/bfq-ioc.c | 36 +
7242 + block/bfq-iosched.c | 3082 +++++++++++++++++++++++++++++++++++++++++++++++++++
7243 + block/bfq-sched.c | 1072 ++++++++++++++++++
7244 + block/bfq.h | 603 ++++++++++
7245 + 5 files changed, 5674 insertions(+)
7246 + create mode 100644 block/bfq-cgroup.c
7247 + create mode 100644 block/bfq-ioc.c
7248 + create mode 100644 block/bfq-iosched.c
7249 + create mode 100644 block/bfq-sched.c
7250 + create mode 100644 block/bfq.h
7251 +
7252 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
7253 +new file mode 100644
7254 +index 0000000..bb9b851
7255 +--- /dev/null
7256 ++++ b/block/bfq-cgroup.c
7257 +@@ -0,0 +1,881 @@
7258 ++/*
7259 ++ * BFQ: CGROUPS support.
7260 ++ *
7261 ++ * Based on ideas and code from CFQ:
7262 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7263 ++ *
7264 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7265 ++ * Paolo Valente <paolo.valente@×××××××.it>
7266 ++ *
7267 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7268 ++ *
7269 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7270 ++ */
7271 ++
7272 ++#ifdef CONFIG_CGROUP_BFQIO
7273 ++
7274 ++static DEFINE_MUTEX(bfqio_mutex);
7275 ++
7276 ++static bool bfqio_is_removed(struct cgroup *cgroup)
7277 ++{
7278 ++ return test_bit(CGRP_DEAD, &cgroup->flags);
7279 ++}
7280 ++
7281 ++static struct bfqio_cgroup bfqio_root_cgroup = {
7282 ++ .weight = BFQ_DEFAULT_GRP_WEIGHT,
7283 ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
7284 ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
7285 ++};
7286 ++
7287 ++static inline void bfq_init_entity(struct bfq_entity *entity,
7288 ++ struct bfq_group *bfqg)
7289 ++{
7290 ++ entity->weight = entity->new_weight;
7291 ++ entity->orig_weight = entity->new_weight;
7292 ++ entity->ioprio = entity->new_ioprio;
7293 ++ entity->ioprio_class = entity->new_ioprio_class;
7294 ++ entity->parent = bfqg->my_entity;
7295 ++ entity->sched_data = &bfqg->sched_data;
7296 ++}
7297 ++
7298 ++static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
7299 ++{
7300 ++ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
7301 ++ struct bfqio_cgroup, css);
7302 ++}
7303 ++
7304 ++/*
7305 ++ * Search the bfq_group for bfqd into the hash table (by now only a list)
7306 ++ * of bgrp. Must be called under rcu_read_lock().
7307 ++ */
7308 ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
7309 ++ struct bfq_data *bfqd)
7310 ++{
7311 ++ struct bfq_group *bfqg;
7312 ++ void *key;
7313 ++
7314 ++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
7315 ++ key = rcu_dereference(bfqg->bfqd);
7316 ++ if (key == bfqd)
7317 ++ return bfqg;
7318 ++ }
7319 ++
7320 ++ return NULL;
7321 ++}
7322 ++
7323 ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
7324 ++ struct bfq_group *bfqg)
7325 ++{
7326 ++ struct bfq_entity *entity = &bfqg->entity;
7327 ++
7328 ++ /*
7329 ++ * If the weight of the entity has never been set via the sysfs
7330 ++ * interface, then bgrp->weight == 0. In this case we initialize
7331 ++ * the weight from the current ioprio value. Otherwise, the group
7332 ++ * weight, if set, has priority over the ioprio value.
7333 ++ */
7334 ++ if (bgrp->weight == 0) {
7335 ++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
7336 ++ entity->new_ioprio = bgrp->ioprio;
7337 ++ } else {
7338 ++ entity->new_weight = bgrp->weight;
7339 ++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
7340 ++ }
7341 ++ entity->orig_weight = entity->weight = entity->new_weight;
7342 ++ entity->ioprio = entity->new_ioprio;
7343 ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
7344 ++ entity->my_sched_data = &bfqg->sched_data;
7345 ++}
7346 ++
7347 ++static inline void bfq_group_set_parent(struct bfq_group *bfqg,
7348 ++ struct bfq_group *parent)
7349 ++{
7350 ++ struct bfq_entity *entity;
7351 ++
7352 ++ BUG_ON(parent == NULL);
7353 ++ BUG_ON(bfqg == NULL);
7354 ++
7355 ++ entity = &bfqg->entity;
7356 ++ entity->parent = parent->my_entity;
7357 ++ entity->sched_data = &parent->sched_data;
7358 ++}
7359 ++
7360 ++/**
7361 ++ * bfq_group_chain_alloc - allocate a chain of groups.
7362 ++ * @bfqd: queue descriptor.
7363 ++ * @cgroup: the leaf cgroup this chain starts from.
7364 ++ *
7365 ++ * Allocate a chain of groups starting from the one belonging to
7366 ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
7367 ++ * to the root has already an allocated group on @bfqd.
7368 ++ */
7369 ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
7370 ++ struct cgroup *cgroup)
7371 ++{
7372 ++ struct bfqio_cgroup *bgrp;
7373 ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
7374 ++
7375 ++ for (; cgroup != NULL; cgroup = cgroup->parent) {
7376 ++ bgrp = cgroup_to_bfqio(cgroup);
7377 ++
7378 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
7379 ++ if (bfqg != NULL) {
7380 ++ /*
7381 ++ * All the cgroups in the path from there to the
7382 ++ * root must have a bfq_group for bfqd, so we don't
7383 ++ * need any more allocations.
7384 ++ */
7385 ++ break;
7386 ++ }
7387 ++
7388 ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
7389 ++ if (bfqg == NULL)
7390 ++ goto cleanup;
7391 ++
7392 ++ bfq_group_init_entity(bgrp, bfqg);
7393 ++ bfqg->my_entity = &bfqg->entity;
7394 ++
7395 ++ if (leaf == NULL) {
7396 ++ leaf = bfqg;
7397 ++ prev = leaf;
7398 ++ } else {
7399 ++ bfq_group_set_parent(prev, bfqg);
7400 ++ /*
7401 ++ * Build a list of allocated nodes using the bfqd
7402 ++ * field, which is still unused and will be initialized
7403 ++ * only after the node is connected.
7404 ++ */
7405 ++ prev->bfqd = bfqg;
7406 ++ prev = bfqg;
7407 ++ }
7408 ++ }
7409 ++
7410 ++ return leaf;
7411 ++
7412 ++cleanup:
7413 ++ while (leaf != NULL) {
7414 ++ prev = leaf;
7415 ++ leaf = leaf->bfqd;
7416 ++ kfree(prev);
7417 ++ }
7418 ++
7419 ++ return NULL;
7420 ++}
7421 ++
7422 ++/**
7423 ++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
7424 ++ * @bfqd: the queue descriptor.
7425 ++ * @cgroup: the leaf cgroup to start from.
7426 ++ * @leaf: the leaf group (to be associated to @cgroup).
7427 ++ *
7428 ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the
7429 ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the
7430 ++ * hierarchy that already as a group associated to @bfqd all the nodes
7431 ++ * in the path to the root cgroup have one too.
7432 ++ *
7433 ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy
7434 ++ * per device) while the bfqio_cgroup lock protects the list of groups
7435 ++ * belonging to the same cgroup.
7436 ++ */
7437 ++static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
7438 ++ struct bfq_group *leaf)
7439 ++{
7440 ++ struct bfqio_cgroup *bgrp;
7441 ++ struct bfq_group *bfqg, *next, *prev = NULL;
7442 ++ unsigned long flags;
7443 ++
7444 ++ assert_spin_locked(bfqd->queue->queue_lock);
7445 ++
7446 ++ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
7447 ++ bgrp = cgroup_to_bfqio(cgroup);
7448 ++ next = leaf->bfqd;
7449 ++
7450 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
7451 ++ BUG_ON(bfqg != NULL);
7452 ++
7453 ++ spin_lock_irqsave(&bgrp->lock, flags);
7454 ++
7455 ++ rcu_assign_pointer(leaf->bfqd, bfqd);
7456 ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
7457 ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
7458 ++
7459 ++ spin_unlock_irqrestore(&bgrp->lock, flags);
7460 ++
7461 ++ prev = leaf;
7462 ++ leaf = next;
7463 ++ }
7464 ++
7465 ++ BUG_ON(cgroup == NULL && leaf != NULL);
7466 ++ if (cgroup != NULL && prev != NULL) {
7467 ++ bgrp = cgroup_to_bfqio(cgroup);
7468 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
7469 ++ bfq_group_set_parent(prev, bfqg);
7470 ++ }
7471 ++}
7472 ++
7473 ++/**
7474 ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
7475 ++ * @bfqd: queue descriptor.
7476 ++ * @cgroup: cgroup being searched for.
7477 ++ *
7478 ++ * Return a group associated to @bfqd in @cgroup, allocating one if
7479 ++ * necessary. When a group is returned all the cgroups in the path
7480 ++ * to the root have a group associated to @bfqd.
7481 ++ *
7482 ++ * If the allocation fails, return the root group: this breaks guarantees
7483 ++ * but is a safe fallback. If this loss becomes a problem it can be
7484 ++ * mitigated using the equivalent weight (given by the product of the
7485 ++ * weights of the groups in the path from @group to the root) in the
7486 ++ * root scheduler.
7487 ++ *
7488 ++ * We allocate all the missing nodes in the path from the leaf cgroup
7489 ++ * to the root and we connect the nodes only after all the allocations
7490 ++ * have been successful.
7491 ++ */
7492 ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
7493 ++ struct cgroup *cgroup)
7494 ++{
7495 ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
7496 ++ struct bfq_group *bfqg;
7497 ++
7498 ++ bfqg = bfqio_lookup_group(bgrp, bfqd);
7499 ++ if (bfqg != NULL)
7500 ++ return bfqg;
7501 ++
7502 ++ bfqg = bfq_group_chain_alloc(bfqd, cgroup);
7503 ++ if (bfqg != NULL)
7504 ++ bfq_group_chain_link(bfqd, cgroup, bfqg);
7505 ++ else
7506 ++ bfqg = bfqd->root_group;
7507 ++
7508 ++ return bfqg;
7509 ++}
7510 ++
7511 ++/**
7512 ++ * bfq_bfqq_move - migrate @bfqq to @bfqg.
7513 ++ * @bfqd: queue descriptor.
7514 ++ * @bfqq: the queue to move.
7515 ++ * @entity: @bfqq's entity.
7516 ++ * @bfqg: the group to move to.
7517 ++ *
7518 ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
7519 ++ * it on the new one. Avoid putting the entity on the old group idle tree.
7520 ++ *
7521 ++ * Must be called under the queue lock; the cgroup owning @bfqg must
7522 ++ * not disappear (by now this just means that we are called under
7523 ++ * rcu_read_lock()).
7524 ++ */
7525 ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
7526 ++ struct bfq_entity *entity, struct bfq_group *bfqg)
7527 ++{
7528 ++ int busy, resume;
7529 ++
7530 ++ busy = bfq_bfqq_busy(bfqq);
7531 ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
7532 ++
7533 ++ BUG_ON(resume && !entity->on_st);
7534 ++ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue);
7535 ++
7536 ++ if (busy) {
7537 ++ BUG_ON(atomic_read(&bfqq->ref) < 2);
7538 ++
7539 ++ if (!resume)
7540 ++ bfq_del_bfqq_busy(bfqd, bfqq, 0);
7541 ++ else
7542 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
7543 ++ } else if (entity->on_st)
7544 ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
7545 ++
7546 ++ /*
7547 ++ * Here we use a reference to bfqg. We don't need a refcounter
7548 ++ * as the cgroup reference will not be dropped, so that its
7549 ++ * destroy() callback will not be invoked.
7550 ++ */
7551 ++ entity->parent = bfqg->my_entity;
7552 ++ entity->sched_data = &bfqg->sched_data;
7553 ++
7554 ++ if (busy && resume)
7555 ++ bfq_activate_bfqq(bfqd, bfqq);
7556 ++
7557 ++ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver)
7558 ++ bfq_schedule_dispatch(bfqd);
7559 ++}
7560 ++
7561 ++/**
7562 ++ * __bfq_bic_change_cgroup - move @bic to @cgroup.
7563 ++ * @bfqd: the queue descriptor.
7564 ++ * @bic: the bic to move.
7565 ++ * @cgroup: the cgroup to move to.
7566 ++ *
7567 ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
7568 ++ * has to make sure that the reference to cgroup is valid across the call.
7569 ++ *
7570 ++ * NOTE: an alternative approach might have been to store the current
7571 ++ * cgroup in bfqq and getting a reference to it, reducing the lookup
7572 ++ * time here, at the price of slightly more complex code.
7573 ++ */
7574 ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
7575 ++ struct bfq_io_cq *bic,
7576 ++ struct cgroup *cgroup)
7577 ++{
7578 ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
7579 ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
7580 ++ struct bfq_entity *entity;
7581 ++ struct bfq_group *bfqg;
7582 ++ struct bfqio_cgroup *bgrp;
7583 ++
7584 ++ bgrp = cgroup_to_bfqio(cgroup);
7585 ++
7586 ++ bfqg = bfq_find_alloc_group(bfqd, cgroup);
7587 ++ if (async_bfqq != NULL) {
7588 ++ entity = &async_bfqq->entity;
7589 ++
7590 ++ if (entity->sched_data != &bfqg->sched_data) {
7591 ++ bic_set_bfqq(bic, NULL, 0);
7592 ++ bfq_log_bfqq(bfqd, async_bfqq,
7593 ++ "bic_change_group: %p %d",
7594 ++ async_bfqq, atomic_read(&async_bfqq->ref));
7595 ++ bfq_put_queue(async_bfqq);
7596 ++ }
7597 ++ }
7598 ++
7599 ++ if (sync_bfqq != NULL) {
7600 ++ entity = &sync_bfqq->entity;
7601 ++ if (entity->sched_data != &bfqg->sched_data)
7602 ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
7603 ++ }
7604 ++
7605 ++ return bfqg;
7606 ++}
7607 ++
7608 ++/**
7609 ++ * bfq_bic_change_cgroup - move @bic to @cgroup.
7610 ++ * @bic: the bic being migrated.
7611 ++ * @cgroup: the destination cgroup.
7612 ++ *
7613 ++ * When the task owning @bic is moved to @cgroup, @bic is immediately
7614 ++ * moved into its new parent group.
7615 ++ */
7616 ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
7617 ++ struct cgroup *cgroup)
7618 ++{
7619 ++ struct bfq_data *bfqd;
7620 ++ unsigned long uninitialized_var(flags);
7621 ++
7622 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
7623 ++ if (bfqd != NULL) {
7624 ++ __bfq_bic_change_cgroup(bfqd, bic, cgroup);
7625 ++ bfq_put_bfqd_unlock(bfqd, &flags);
7626 ++ }
7627 ++}
7628 ++
7629 ++/**
7630 ++ * bfq_bic_update_cgroup - update the cgroup of @bic.
7631 ++ * @bic: the @bic to update.
7632 ++ *
7633 ++ * Make sure that @bic is enqueued in the cgroup of the current task.
7634 ++ * We need this in addition to moving bics during the cgroup attach
7635 ++ * phase because the task owning @bic could be at its first disk
7636 ++ * access or we may end up in the root cgroup as the result of a
7637 ++ * memory allocation failure and here we try to move to the right
7638 ++ * group.
7639 ++ *
7640 ++ * Must be called under the queue lock. It is safe to use the returned
7641 ++ * value even after the rcu_read_unlock() as the migration/destruction
7642 ++ * paths act under the queue lock too. IOW it is impossible to race with
7643 ++ * group migration/destruction and end up with an invalid group as:
7644 ++ * a) here cgroup has not yet been destroyed, nor its destroy callback
7645 ++ * has started execution, as current holds a reference to it,
7646 ++ * b) if it is destroyed after rcu_read_unlock() [after current is
7647 ++ * migrated to a different cgroup] its attach() callback will have
7648 ++ * taken care of remove all the references to the old cgroup data.
7649 ++ */
7650 ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7651 ++{
7652 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
7653 ++ struct bfq_group *bfqg;
7654 ++ struct cgroup *cgroup;
7655 ++
7656 ++ BUG_ON(bfqd == NULL);
7657 ++
7658 ++ rcu_read_lock();
7659 ++ cgroup = task_cgroup(current, bfqio_subsys_id);
7660 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
7661 ++ rcu_read_unlock();
7662 ++
7663 ++ return bfqg;
7664 ++}
7665 ++
7666 ++/**
7667 ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
7668 ++ * @st: the service tree being flushed.
7669 ++ */
7670 ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
7671 ++{
7672 ++ struct bfq_entity *entity = st->first_idle;
7673 ++
7674 ++ for (; entity != NULL; entity = st->first_idle)
7675 ++ __bfq_deactivate_entity(entity, 0);
7676 ++}
7677 ++
7678 ++/**
7679 ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
7680 ++ * @bfqd: the device data structure with the root group.
7681 ++ * @entity: the entity to move.
7682 ++ */
7683 ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
7684 ++ struct bfq_entity *entity)
7685 ++{
7686 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
7687 ++
7688 ++ BUG_ON(bfqq == NULL);
7689 ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
7690 ++ return;
7691 ++}
7692 ++
7693 ++/**
7694 ++ * bfq_reparent_active_entities - move to the root group all active entities.
7695 ++ * @bfqd: the device data structure with the root group.
7696 ++ * @bfqg: the group to move from.
7697 ++ * @st: the service tree with the entities.
7698 ++ *
7699 ++ * Needs queue_lock to be taken and reference to be valid over the call.
7700 ++ */
7701 ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
7702 ++ struct bfq_group *bfqg,
7703 ++ struct bfq_service_tree *st)
7704 ++{
7705 ++ struct rb_root *active = &st->active;
7706 ++ struct bfq_entity *entity = NULL;
7707 ++
7708 ++ if (!RB_EMPTY_ROOT(&st->active))
7709 ++ entity = bfq_entity_of(rb_first(active));
7710 ++
7711 ++ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active)))
7712 ++ bfq_reparent_leaf_entity(bfqd, entity);
7713 ++
7714 ++ if (bfqg->sched_data.active_entity != NULL)
7715 ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
7716 ++
7717 ++ return;
7718 ++}
7719 ++
7720 ++/**
7721 ++ * bfq_destroy_group - destroy @bfqg.
7722 ++ * @bgrp: the bfqio_cgroup containing @bfqg.
7723 ++ * @bfqg: the group being destroyed.
7724 ++ *
7725 ++ * Destroy @bfqg, making sure that it is not referenced from its parent.
7726 ++ */
7727 ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
7728 ++{
7729 ++ struct bfq_data *bfqd;
7730 ++ struct bfq_service_tree *st;
7731 ++ struct bfq_entity *entity = bfqg->my_entity;
7732 ++ unsigned long uninitialized_var(flags);
7733 ++ int i;
7734 ++
7735 ++ hlist_del(&bfqg->group_node);
7736 ++
7737 ++ /*
7738 ++ * Empty all service_trees belonging to this group before deactivating
7739 ++ * the group itself.
7740 ++ */
7741 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
7742 ++ st = bfqg->sched_data.service_tree + i;
7743 ++
7744 ++ /*
7745 ++ * The idle tree may still contain bfq_queues belonging
7746 ++ * to exited tasks because they never migrated to a different
7747 ++ * cgroup from the one being destroyed now. No one else
7748 ++ * can access them so it's safe to act without any lock.
7749 ++ */
7750 ++ bfq_flush_idle_tree(st);
7751 ++
7752 ++ /*
7753 ++ * It may happen that some queues are still active
7754 ++ * (busy) upon group destruction (if the corresponding
7755 ++ * processes have been forced to terminate). We move
7756 ++ * all the leaf entities corresponding to these queues
7757 ++ * to the root_group.
7758 ++ * Also, it may happen that the group has an entity
7759 ++ * under service, which is disconnected from the active
7760 ++ * tree: it must be moved, too.
7761 ++ * There is no need to put the sync queues, as the
7762 ++ * scheduler has taken no reference.
7763 ++ */
7764 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
7765 ++ if (bfqd != NULL) {
7766 ++ bfq_reparent_active_entities(bfqd, bfqg, st);
7767 ++ bfq_put_bfqd_unlock(bfqd, &flags);
7768 ++ }
7769 ++ BUG_ON(!RB_EMPTY_ROOT(&st->active));
7770 ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
7771 ++ }
7772 ++ BUG_ON(bfqg->sched_data.next_active != NULL);
7773 ++ BUG_ON(bfqg->sched_data.active_entity != NULL);
7774 ++
7775 ++ /*
7776 ++ * We may race with device destruction, take extra care when
7777 ++ * dereferencing bfqg->bfqd.
7778 ++ */
7779 ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
7780 ++ if (bfqd != NULL) {
7781 ++ hlist_del(&bfqg->bfqd_node);
7782 ++ __bfq_deactivate_entity(entity, 0);
7783 ++ bfq_put_async_queues(bfqd, bfqg);
7784 ++ bfq_put_bfqd_unlock(bfqd, &flags);
7785 ++ }
7786 ++ BUG_ON(entity->tree != NULL);
7787 ++
7788 ++ /*
7789 ++ * No need to defer the kfree() to the end of the RCU grace
7790 ++ * period: we are called from the destroy() callback of our
7791 ++ * cgroup, so we can be sure that no one is a) still using
7792 ++ * this cgroup or b) doing lookups in it.
7793 ++ */
7794 ++ kfree(bfqg);
7795 ++}
7796 ++
7797 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
7798 ++{
7799 ++ struct hlist_node *tmp;
7800 ++ struct bfq_group *bfqg;
7801 ++
7802 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
7803 ++ bfq_end_raising_async_queues(bfqd, bfqg);
7804 ++}
7805 ++
7806 ++/**
7807 ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
7808 ++ * @bfqd: the device descriptor being exited.
7809 ++ *
7810 ++ * When the device exits we just make sure that no lookup can return
7811 ++ * the now unused group structures. They will be deallocated on cgroup
7812 ++ * destruction.
7813 ++ */
7814 ++static void bfq_disconnect_groups(struct bfq_data *bfqd)
7815 ++{
7816 ++ struct hlist_node *tmp;
7817 ++ struct bfq_group *bfqg;
7818 ++
7819 ++ bfq_log(bfqd, "disconnect_groups beginning") ;
7820 ++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
7821 ++ hlist_del(&bfqg->bfqd_node);
7822 ++
7823 ++ __bfq_deactivate_entity(bfqg->my_entity, 0);
7824 ++
7825 ++ /*
7826 ++ * Don't remove from the group hash, just set an
7827 ++ * invalid key. No lookups can race with the
7828 ++ * assignment as bfqd is being destroyed; this
7829 ++ * implies also that new elements cannot be added
7830 ++ * to the list.
7831 ++ */
7832 ++ rcu_assign_pointer(bfqg->bfqd, NULL);
7833 ++
7834 ++ bfq_log(bfqd, "disconnect_groups: put async for group %p",
7835 ++ bfqg) ;
7836 ++ bfq_put_async_queues(bfqd, bfqg);
7837 ++ }
7838 ++}
7839 ++
7840 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
7841 ++{
7842 ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
7843 ++ struct bfq_group *bfqg = bfqd->root_group;
7844 ++
7845 ++ bfq_put_async_queues(bfqd, bfqg);
7846 ++
7847 ++ spin_lock_irq(&bgrp->lock);
7848 ++ hlist_del_rcu(&bfqg->group_node);
7849 ++ spin_unlock_irq(&bgrp->lock);
7850 ++
7851 ++ /*
7852 ++ * No need to synchronize_rcu() here: since the device is gone
7853 ++ * there cannot be any read-side access to its root_group.
7854 ++ */
7855 ++ kfree(bfqg);
7856 ++}
7857 ++
7858 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7859 ++{
7860 ++ struct bfq_group *bfqg;
7861 ++ struct bfqio_cgroup *bgrp;
7862 ++ int i;
7863 ++
7864 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7865 ++ if (bfqg == NULL)
7866 ++ return NULL;
7867 ++
7868 ++ bfqg->entity.parent = NULL;
7869 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7870 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7871 ++
7872 ++ bgrp = &bfqio_root_cgroup;
7873 ++ spin_lock_irq(&bgrp->lock);
7874 ++ rcu_assign_pointer(bfqg->bfqd, bfqd);
7875 ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
7876 ++ spin_unlock_irq(&bgrp->lock);
7877 ++
7878 ++ return bfqg;
7879 ++}
7880 ++
7881 ++#define SHOW_FUNCTION(__VAR) \
7882 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
7883 ++ struct cftype *cftype) \
7884 ++{ \
7885 ++ struct bfqio_cgroup *bgrp; \
7886 ++ u64 ret = -ENODEV; \
7887 ++ \
7888 ++ mutex_lock(&bfqio_mutex); \
7889 ++ if (bfqio_is_removed(cgroup)) \
7890 ++ goto out_unlock; \
7891 ++ \
7892 ++ bgrp = cgroup_to_bfqio(cgroup); \
7893 ++ spin_lock_irq(&bgrp->lock); \
7894 ++ ret = bgrp->__VAR; \
7895 ++ spin_unlock_irq(&bgrp->lock); \
7896 ++ \
7897 ++out_unlock: \
7898 ++ mutex_unlock(&bfqio_mutex); \
7899 ++ return ret; \
7900 ++}
7901 ++
7902 ++SHOW_FUNCTION(weight);
7903 ++SHOW_FUNCTION(ioprio);
7904 ++SHOW_FUNCTION(ioprio_class);
7905 ++#undef SHOW_FUNCTION
7906 ++
7907 ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
7908 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
7909 ++ struct cftype *cftype, \
7910 ++ u64 val) \
7911 ++{ \
7912 ++ struct bfqio_cgroup *bgrp; \
7913 ++ struct bfq_group *bfqg; \
7914 ++ int ret = -EINVAL; \
7915 ++ \
7916 ++ if (val < (__MIN) || val > (__MAX)) \
7917 ++ return ret; \
7918 ++ \
7919 ++ ret = -ENODEV; \
7920 ++ mutex_lock(&bfqio_mutex); \
7921 ++ if (bfqio_is_removed(cgroup)) \
7922 ++ goto out_unlock; \
7923 ++ ret = 0; \
7924 ++ \
7925 ++ bgrp = cgroup_to_bfqio(cgroup); \
7926 ++ \
7927 ++ spin_lock_irq(&bgrp->lock); \
7928 ++ bgrp->__VAR = (unsigned short)val; \
7929 ++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
7930 ++ /* \
7931 ++ * Setting the ioprio_changed flag of the entity \
7932 ++ * to 1 with new_##__VAR == ##__VAR would re-set \
7933 ++ * the value of the weight to its ioprio mapping. \
7934 ++ * Set the flag only if necessary. \
7935 ++ */ \
7936 ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
7937 ++ bfqg->entity.new_##__VAR = (unsigned short)val; \
7938 ++ smp_wmb(); \
7939 ++ bfqg->entity.ioprio_changed = 1; \
7940 ++ } \
7941 ++ } \
7942 ++ spin_unlock_irq(&bgrp->lock); \
7943 ++ \
7944 ++out_unlock: \
7945 ++ mutex_unlock(&bfqio_mutex); \
7946 ++ return ret; \
7947 ++}
7948 ++
7949 ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
7950 ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
7951 ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
7952 ++#undef STORE_FUNCTION
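++/*
++ * Reading aid (illustrative, not functional code): for __VAR == weight the
++ * macro above expands to bfqio_cgroup_weight_write(), which rejects values
++ * outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT] with -EINVAL, stores the new
++ * value in the bfqio_cgroup under bgrp->lock, and marks ioprio_changed only
++ * on entities whose new_weight actually changes, so the update takes effect
++ * on the next (re)activation of each entity.
++ */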
7953 ++
7954 ++static struct cftype bfqio_files[] = {
7955 ++ {
7956 ++ .name = "weight",
7957 ++ .read_u64 = bfqio_cgroup_weight_read,
7958 ++ .write_u64 = bfqio_cgroup_weight_write,
7959 ++ },
7960 ++ {
7961 ++ .name = "ioprio",
7962 ++ .read_u64 = bfqio_cgroup_ioprio_read,
7963 ++ .write_u64 = bfqio_cgroup_ioprio_write,
7964 ++ },
7965 ++ {
7966 ++ .name = "ioprio_class",
7967 ++ .read_u64 = bfqio_cgroup_ioprio_class_read,
7968 ++ .write_u64 = bfqio_cgroup_ioprio_class_write,
7969 ++ },
7970 ++ { }, /* terminate */
7971 ++};
7972 ++
7973 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
7974 ++{
7975 ++ struct bfqio_cgroup *bgrp;
7976 ++
7977 ++ if (cgroup->parent != NULL) {
7978 ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
7979 ++ if (bgrp == NULL)
7980 ++ return ERR_PTR(-ENOMEM);
7981 ++ } else
7982 ++ bgrp = &bfqio_root_cgroup;
7983 ++
7984 ++ spin_lock_init(&bgrp->lock);
7985 ++ INIT_HLIST_HEAD(&bgrp->group_data);
7986 ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
7987 ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
7988 ++
7989 ++ return &bgrp->css;
7990 ++}
7991 ++
7992 ++/*
7993 ++ * We cannot support shared io contexts, as we have no means to support
7994 ++ * two tasks with the same ioc in two different groups without major rework
7995 ++ * of the main bic/bfqq data structures. For now we allow a task to change
7996 ++ * its cgroup only if it's the only owner of its ioc; the drawback of this
7997 ++ * behavior is that a group containing a task that forked using CLONE_IO
7998 ++ * will not be destroyed until the tasks sharing the ioc die.
7999 ++ */
8000 ++static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
8001 ++{
8002 ++ struct task_struct *task;
8003 ++ struct io_context *ioc;
8004 ++ int ret = 0;
8005 ++
8006 ++ cgroup_taskset_for_each(task, cgroup, tset) {
8007 ++ /* task_lock() is needed to avoid races with exit_io_context() */
8008 ++ task_lock(task);
8009 ++ ioc = task->io_context;
8010 ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
8011 ++ /*
8012 ++ * ioc == NULL means that the task is either too young or
8013 ++			 * exiting: if it still has no ioc the ioc can't be shared,
8014 ++ * if the task is exiting the attach will fail anyway, no
8015 ++ * matter what we return here.
8016 ++ */
8017 ++ ret = -EINVAL;
8018 ++ task_unlock(task);
8019 ++ if (ret)
8020 ++ break;
8021 ++ }
8022 ++
8023 ++ return ret;
8024 ++}
8025 ++
8026 ++static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
8027 ++{
8028 ++ struct task_struct *task;
8029 ++ struct io_context *ioc;
8030 ++ struct io_cq *icq;
8031 ++
8032 ++ /*
8033 ++ * IMPORTANT NOTE: The move of more than one process at a time to a
8034 ++ * new group has not yet been tested.
8035 ++ */
8036 ++ cgroup_taskset_for_each(task, cgroup, tset) {
8037 ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
8038 ++ if (ioc) {
8039 ++ /*
8040 ++ * Handle cgroup change here.
8041 ++ */
8042 ++ rcu_read_lock();
8043 ++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
8044 ++ if (!strncmp(icq->q->elevator->type->elevator_name,
8045 ++ "bfq", ELV_NAME_MAX))
8046 ++ bfq_bic_change_cgroup(icq_to_bic(icq),
8047 ++ cgroup);
8048 ++ rcu_read_unlock();
8049 ++ put_io_context(ioc);
8050 ++ }
8051 ++ }
8052 ++}
8053 ++
8054 ++static void bfqio_destroy(struct cgroup *cgroup)
8055 ++{
8056 ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
8057 ++ struct hlist_node *tmp;
8058 ++ struct bfq_group *bfqg;
8059 ++
8060 ++ /*
8061 ++ * Since we are destroying the cgroup, there are no more tasks
8062 ++ * referencing it, and all the RCU grace periods that may have
8063 ++ * referenced it are ended (as the destruction of the parent
8064 ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
8065 ++ * anything else and we don't need any synchronization.
8066 ++ */
8067 ++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
8068 ++ bfq_destroy_group(bgrp, bfqg);
8069 ++
8070 ++ BUG_ON(!hlist_empty(&bgrp->group_data));
8071 ++
8072 ++ kfree(bgrp);
8073 ++}
8074 ++
8075 ++struct cgroup_subsys bfqio_subsys = {
8076 ++ .name = "bfqio",
8077 ++ .css_alloc = bfqio_create,
8078 ++ .can_attach = bfqio_can_attach,
8079 ++ .attach = bfqio_attach,
8080 ++ .css_free = bfqio_destroy,
8081 ++ .subsys_id = bfqio_subsys_id,
8082 ++ .base_cftypes = bfqio_files,
8083 ++};
8084 ++#else
8085 ++static inline void bfq_init_entity(struct bfq_entity *entity,
8086 ++ struct bfq_group *bfqg)
8087 ++{
8088 ++ entity->weight = entity->new_weight;
8089 ++ entity->orig_weight = entity->new_weight;
8090 ++ entity->ioprio = entity->new_ioprio;
8091 ++ entity->ioprio_class = entity->new_ioprio_class;
8092 ++ entity->sched_data = &bfqg->sched_data;
8093 ++}
8094 ++
8095 ++static inline struct bfq_group *
8096 ++bfq_bic_update_cgroup(struct bfq_io_cq *bic)
8097 ++{
8098 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
8099 ++ return bfqd->root_group;
8100 ++}
8101 ++
8102 ++static inline void bfq_bfqq_move(struct bfq_data *bfqd,
8103 ++ struct bfq_queue *bfqq,
8104 ++ struct bfq_entity *entity,
8105 ++ struct bfq_group *bfqg)
8106 ++{
8107 ++}
8108 ++
8109 ++static void bfq_end_raising_async(struct bfq_data *bfqd)
8110 ++{
8111 ++ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
8112 ++}
8113 ++
8114 ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
8115 ++{
8116 ++ bfq_put_async_queues(bfqd, bfqd->root_group);
8117 ++}
8118 ++
8119 ++static inline void bfq_free_root_group(struct bfq_data *bfqd)
8120 ++{
8121 ++ kfree(bfqd->root_group);
8122 ++}
8123 ++
8124 ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
8125 ++{
8126 ++ struct bfq_group *bfqg;
8127 ++ int i;
8128 ++
8129 ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
8130 ++ if (bfqg == NULL)
8131 ++ return NULL;
8132 ++
8133 ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
8134 ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
8135 ++
8136 ++ return bfqg;
8137 ++}
8138 ++#endif
8139 +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
8140 +new file mode 100644
8141 +index 0000000..326e3ec
8142 +--- /dev/null
8143 ++++ b/block/bfq-ioc.c
8144 +@@ -0,0 +1,36 @@
8145 ++/*
8146 ++ * BFQ: I/O context handling.
8147 ++ *
8148 ++ * Based on ideas and code from CFQ:
8149 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
8150 ++ *
8151 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
8152 ++ * Paolo Valente <paolo.valente@×××××××.it>
8153 ++ *
8154 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
8155 ++ */
8156 ++
8157 ++/**
8158 ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
8159 ++ * @icq: the iocontext queue.
8160 ++ */
8161 ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
8162 ++{
8163 ++ /* bic->icq is the first member, %NULL will convert to %NULL */
8164 ++ return container_of(icq, struct bfq_io_cq, icq);
8165 ++}
8166 ++
8167 ++/**
8168 ++ * bfq_bic_lookup - search @ioc for a bic associated with @bfqd.
8169 ++ * @bfqd: the lookup key.
8170 ++ * @ioc: the io_context of the process doing I/O.
8171 ++ *
8172 ++ * Queue lock must be held.
8173 ++ */
8174 ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
8175 ++ struct io_context *ioc)
8176 ++{
8177 ++	if (ioc)
8178 ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
8179 ++ return NULL;
8180 ++}
8181 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
8182 +new file mode 100644
8183 +index 0000000..0ed2746
8184 +--- /dev/null
8185 ++++ b/block/bfq-iosched.c
8186 +@@ -0,0 +1,3082 @@
8187 ++/*
8188 ++ * BFQ, or Budget Fair Queueing, disk scheduler.
8189 ++ *
8190 ++ * Based on ideas and code from CFQ:
8191 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
8192 ++ *
8193 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
8194 ++ * Paolo Valente <paolo.valente@×××××××.it>
8195 ++ *
8196 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
8197 ++ *
8198 ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
8199 ++ *
8200 ++ * BFQ is a proportional share disk scheduling algorithm based on the
8201 ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
8202 ++ * measured in number of sectors, to tasks instead of time slices.
8203 ++ * The disk is not granted to the active task for a given time slice,
8204 ++ * but until it has exhausted its assigned budget. This change from
8205 ++ * the time to the service domain allows BFQ to distribute the disk
8206 ++ * bandwidth among tasks as desired, without any distortion due to
8207 ++ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc
8208 ++ * internal scheduler, called B-WF2Q+, to schedule tasks according to
8209 ++ * their budgets. Thanks to this accurate scheduler, BFQ can afford
8210 ++ * to assign high budgets to disk-bound non-seeky tasks (to boost the
8211 ++ * throughput), and yet guarantee low latencies to interactive and
8212 ++ * soft real-time applications.
8213 ++ *
8214 ++ * BFQ has been introduced in [1], where the interested reader can
8215 ++ * find an accurate description of the algorithm, the bandwidth
8216 ++ * distribution and latency guarantees it provides, plus formal proofs
8217 ++ * of all the properties. With respect to the algorithm presented in
8218 ++ * the paper, this implementation adds several little heuristics, and
8219 ++ * a hierarchical extension, based on H-WF2Q+.
8220 ++ *
8221 ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
8222 ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
8223 ++ * complexity derives from the one introduced with EEVDF in [3].
8224 ++ *
8225 ++ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling
8226 ++ * with Deterministic Guarantees on Bandwidth Distribution,'',
8227 ++ *     IEEE Transactions on Computers, May 2010.
8228 ++ *
8229 ++ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf
8230 ++ *
8231 ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
8232 ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
8233 ++ * Oct 1997.
8234 ++ *
8235 ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
8236 ++ *
8237 ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
8238 ++ * First: A Flexible and Accurate Mechanism for Proportional Share
8239 ++ * Resource Allocation,'' technical report.
8240 ++ *
8241 ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
8242 ++ */
8243 ++#include <linux/module.h>
8244 ++#include <linux/slab.h>
8245 ++#include <linux/blkdev.h>
8246 ++#include <linux/cgroup.h>
8247 ++#include <linux/elevator.h>
8248 ++#include <linux/jiffies.h>
8249 ++#include <linux/rbtree.h>
8250 ++#include <linux/ioprio.h>
8251 ++#include "bfq.h"
8252 ++#include "blk.h"
8253 ++
8254 ++/* Max number of dispatches in one round of service. */
8255 ++static const int bfq_quantum = 4;
8256 ++
8257 ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */
8258 ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
8259 ++
8260 ++/* Maximum backwards seek, in KiB. */
8261 ++static const int bfq_back_max = 16 * 1024;
8262 ++
8263 ++/* Penalty of a backwards seek, in number of sectors. */
8264 ++static const int bfq_back_penalty = 2;
8265 ++
8266 ++/* Idling period duration, in jiffies. */
8267 ++static int bfq_slice_idle = HZ / 125;
8268 ++
8269 ++/* Default maximum budget values, in sectors and number of requests. */
8270 ++static const int bfq_default_max_budget = 16 * 1024;
8271 ++static const int bfq_max_budget_async_rq = 4;
8272 ++
8273 ++/*
8274 ++ * Async to sync throughput distribution is controlled as follows:
8275 ++ * when an async request is served, the entity is charged the number
8276 ++ * of sectors of the request, multiplied by the factor below.
8277 ++ */
8278 ++static const int bfq_async_charge_factor = 10;
8279 ++
8280 ++/* Default timeout values, in jiffies, approximating CFQ defaults. */
8281 ++static const int bfq_timeout_sync = HZ / 8;
8282 ++static int bfq_timeout_async = HZ / 25;
8283 ++
8284 ++struct kmem_cache *bfq_pool;
8285 ++
8286 ++/* Below this threshold (in ms), we consider thinktime immediate. */
8287 ++#define BFQ_MIN_TT 2
8288 ++
8289 ++/* hw_tag detection: parallel requests threshold and min samples needed. */
8290 ++#define BFQ_HW_QUEUE_THRESHOLD 4
8291 ++#define BFQ_HW_QUEUE_SAMPLES 32
8292 ++
8293 ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
8294 ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
8295 ++
8296 ++/* Min samples used for peak rate estimation (for autotuning). */
8297 ++#define BFQ_PEAK_RATE_SAMPLES 32
8298 ++
8299 ++/* Shift used for peak rate fixed precision calculations. */
8300 ++#define BFQ_RATE_SHIFT 16
8301 ++
8302 ++/*
8303 ++ * The duration of the weight raising for interactive applications is
8304 ++ * computed automatically (as default behaviour), using the following
8305 ++ * formula: duration = (R / r) * T, where r is the peak rate of the
8306 ++ * disk, and R and T are two reference parameters. In particular, R is
8307 ++ * the peak rate of a reference disk, and T is about the maximum time
8308 ++ * for starting popular large applications on that disk, under BFQ and
8309 ++ * while reading two files in parallel. Finally, BFQ uses two
8310 ++ * different pairs (R, T) depending on whether the disk is rotational
8311 ++ * or non-rotational.
8312 ++ */
8313 ++#define T_rot (msecs_to_jiffies(5500))
8314 ++#define T_nonrot (msecs_to_jiffies(2000))
8315 ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
8316 ++#define R_rot 17415
8317 ++#define R_nonrot 34791
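++/*
++ * Worked example (illustrative, not part of the algorithm): with
++ * duration = (R / r) * T as above, a rotational disk whose measured peak
++ * rate r equals the reference rate R_rot is granted exactly T_rot (5500 ms)
++ * of weight raising, while a disk twice as fast (r = 2 * R_rot) would get
++ * about half of that, i.e. roughly 2750 ms; non-rotational devices scale
++ * the same way around R_nonrot and T_nonrot.
++ */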
8318 ++
8319 ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
8320 ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
8321 ++
8322 ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
8323 ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
8324 ++
8325 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
8326 ++
8327 ++#include "bfq-ioc.c"
8328 ++#include "bfq-sched.c"
8329 ++#include "bfq-cgroup.c"
8330 ++
8331 ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
8332 ++ IOPRIO_CLASS_IDLE)
8333 ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
8334 ++ IOPRIO_CLASS_RT)
8335 ++
8336 ++#define bfq_sample_valid(samples) ((samples) > 80)
8337 ++
8338 ++/*
8339 ++ * We regard a request as SYNC if it is either a read or has the SYNC bit
8340 ++ * set (in which case it could also be a direct WRITE).
8341 ++ */
8342 ++static inline int bfq_bio_sync(struct bio *bio)
8343 ++{
8344 ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
8345 ++ return 1;
8346 ++
8347 ++ return 0;
8348 ++}
8349 ++
8350 ++/*
8351 ++ * Scheduler run of queue, if there are requests pending and no one in the
8352 ++ * driver that will restart queueing.
8353 ++ */
8354 ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
8355 ++{
8356 ++ if (bfqd->queued != 0) {
8357 ++ bfq_log(bfqd, "schedule dispatch");
8358 ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
8359 ++ }
8360 ++}
8361 ++
8362 ++/*
8363 ++ * Lifted from AS - choose which of rq1 and rq2 is best served now.
8364 ++ * We choose the request that is closest to the head right now. Distance
8365 ++ * behind the head is penalized and only allowed to a certain extent.
8366 ++ */
8367 ++static struct request *bfq_choose_req(struct bfq_data *bfqd,
8368 ++ struct request *rq1,
8369 ++ struct request *rq2,
8370 ++ sector_t last)
8371 ++{
8372 ++ sector_t s1, s2, d1 = 0, d2 = 0;
8373 ++ unsigned long back_max;
8374 ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
8375 ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
8376 ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
8377 ++
8378 ++ if (rq1 == NULL || rq1 == rq2)
8379 ++ return rq2;
8380 ++ if (rq2 == NULL)
8381 ++ return rq1;
8382 ++
8383 ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
8384 ++ return rq1;
8385 ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
8386 ++ return rq2;
8387 ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
8388 ++ return rq1;
8389 ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
8390 ++ return rq2;
8391 ++
8392 ++ s1 = blk_rq_pos(rq1);
8393 ++ s2 = blk_rq_pos(rq2);
8394 ++
8395 ++ /*
8396 ++ * By definition, 1KiB is 2 sectors.
8397 ++ */
8398 ++ back_max = bfqd->bfq_back_max * 2;
8399 ++
8400 ++ /*
8401 ++ * Strict one way elevator _except_ in the case where we allow
8402 ++ * short backward seeks which are biased as twice the cost of a
8403 ++ * similar forward seek.
8404 ++ */
8405 ++ if (s1 >= last)
8406 ++ d1 = s1 - last;
8407 ++ else if (s1 + back_max >= last)
8408 ++ d1 = (last - s1) * bfqd->bfq_back_penalty;
8409 ++ else
8410 ++ wrap |= BFQ_RQ1_WRAP;
8411 ++
8412 ++ if (s2 >= last)
8413 ++ d2 = s2 - last;
8414 ++ else if (s2 + back_max >= last)
8415 ++ d2 = (last - s2) * bfqd->bfq_back_penalty;
8416 ++ else
8417 ++ wrap |= BFQ_RQ2_WRAP;
8418 ++
8419 ++ /* Found required data */
8420 ++
8421 ++ /*
8422 ++ * By doing switch() on the bit mask "wrap" we avoid having to
8423 ++ * check two variables for all permutations: --> faster!
8424 ++ */
8425 ++ switch (wrap) {
8426 ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
8427 ++ if (d1 < d2)
8428 ++ return rq1;
8429 ++ else if (d2 < d1)
8430 ++ return rq2;
8431 ++ else {
8432 ++ if (s1 >= s2)
8433 ++ return rq1;
8434 ++ else
8435 ++ return rq2;
8436 ++ }
8437 ++
8438 ++ case BFQ_RQ2_WRAP:
8439 ++ return rq1;
8440 ++ case BFQ_RQ1_WRAP:
8441 ++ return rq2;
8442 ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
8443 ++ default:
8444 ++ /*
8445 ++ * Since both rqs are wrapped,
8446 ++ * start with the one that's further behind head
8447 ++ * (--> only *one* back seek required),
8448 ++ * since back seek takes more time than forward.
8449 ++ */
8450 ++ if (s1 <= s2)
8451 ++ return rq1;
8452 ++ else
8453 ++ return rq2;
8454 ++ }
8455 ++}
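++/*
++ * Numeric illustration (not functional code): with the default bfq_back_max
++ * of 16 * 1024 KiB, back_max is 32768 sectors; all else being equal, a
++ * request 1000 sectors behind the head is weighted as if it were 2000
++ * sectors away (bfq_back_penalty = 2), so a forward request less than 2000
++ * sectors ahead of the head still wins.
++ */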
8456 ++
8457 ++static struct bfq_queue *
8458 ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
8459 ++ sector_t sector, struct rb_node **ret_parent,
8460 ++ struct rb_node ***rb_link)
8461 ++{
8462 ++ struct rb_node **p, *parent;
8463 ++ struct bfq_queue *bfqq = NULL;
8464 ++
8465 ++ parent = NULL;
8466 ++ p = &root->rb_node;
8467 ++ while (*p) {
8468 ++ struct rb_node **n;
8469 ++
8470 ++ parent = *p;
8471 ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8472 ++
8473 ++ /*
8474 ++ * Sort strictly based on sector. Smallest to the left,
8475 ++ * largest to the right.
8476 ++ */
8477 ++ if (sector > blk_rq_pos(bfqq->next_rq))
8478 ++ n = &(*p)->rb_right;
8479 ++ else if (sector < blk_rq_pos(bfqq->next_rq))
8480 ++ n = &(*p)->rb_left;
8481 ++ else
8482 ++ break;
8483 ++ p = n;
8484 ++ bfqq = NULL;
8485 ++ }
8486 ++
8487 ++ *ret_parent = parent;
8488 ++ if (rb_link)
8489 ++ *rb_link = p;
8490 ++
8491 ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
8492 ++ (long long unsigned)sector,
8493 ++ bfqq != NULL ? bfqq->pid : 0);
8494 ++
8495 ++ return bfqq;
8496 ++}
8497 ++
8498 ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8499 ++{
8500 ++ struct rb_node **p, *parent;
8501 ++ struct bfq_queue *__bfqq;
8502 ++
8503 ++ if (bfqq->pos_root != NULL) {
8504 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
8505 ++ bfqq->pos_root = NULL;
8506 ++ }
8507 ++
8508 ++ if (bfq_class_idle(bfqq))
8509 ++ return;
8510 ++ if (!bfqq->next_rq)
8511 ++ return;
8512 ++
8513 ++ bfqq->pos_root = &bfqd->rq_pos_tree;
8514 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
8515 ++ blk_rq_pos(bfqq->next_rq), &parent, &p);
8516 ++ if (__bfqq == NULL) {
8517 ++ rb_link_node(&bfqq->pos_node, parent, p);
8518 ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
8519 ++ } else
8520 ++ bfqq->pos_root = NULL;
8521 ++}
8522 ++
8523 ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
8524 ++ struct bfq_queue *bfqq,
8525 ++ struct request *last)
8526 ++{
8527 ++ struct rb_node *rbnext = rb_next(&last->rb_node);
8528 ++ struct rb_node *rbprev = rb_prev(&last->rb_node);
8529 ++ struct request *next = NULL, *prev = NULL;
8530 ++
8531 ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
8532 ++
8533 ++ if (rbprev != NULL)
8534 ++ prev = rb_entry_rq(rbprev);
8535 ++
8536 ++ if (rbnext != NULL)
8537 ++ next = rb_entry_rq(rbnext);
8538 ++ else {
8539 ++ rbnext = rb_first(&bfqq->sort_list);
8540 ++ if (rbnext && rbnext != &last->rb_node)
8541 ++ next = rb_entry_rq(rbnext);
8542 ++ }
8543 ++
8544 ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
8545 ++}
8546 ++
8547 ++static void bfq_del_rq_rb(struct request *rq)
8548 ++{
8549 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8550 ++ struct bfq_data *bfqd = bfqq->bfqd;
8551 ++ const int sync = rq_is_sync(rq);
8552 ++
8553 ++ BUG_ON(bfqq->queued[sync] == 0);
8554 ++ bfqq->queued[sync]--;
8555 ++ bfqd->queued--;
8556 ++
8557 ++ elv_rb_del(&bfqq->sort_list, rq);
8558 ++
8559 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8560 ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue)
8561 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8562 ++ /*
8563 ++ * Remove queue from request-position tree as it is empty.
8564 ++ */
8565 ++ if (bfqq->pos_root != NULL) {
8566 ++ rb_erase(&bfqq->pos_node, bfqq->pos_root);
8567 ++ bfqq->pos_root = NULL;
8568 ++ }
8569 ++ }
8570 ++}
8571 ++
8572 ++/* see the definition of bfq_async_charge_factor for details */
8573 ++static inline unsigned long bfq_serv_to_charge(struct request *rq,
8574 ++ struct bfq_queue *bfqq)
8575 ++{
8576 ++ return blk_rq_sectors(rq) *
8577 ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
8578 ++ bfq_async_charge_factor));
8579 ++}
8580 ++
8581 ++/**
8582 ++ * bfq_updated_next_req - update the queue after a new next_rq selection.
8583 ++ * @bfqd: the device data the queue belongs to.
8584 ++ * @bfqq: the queue to update.
8585 ++ *
8586 ++ * If the first request of a queue changes we make sure that the queue
8587 ++ * has enough budget to serve at least its first request (if the
8588 ++ * request has grown). We do this because if the queue does not have enough
8589 ++ * budget for its first request, it has to go through two dispatch
8590 ++ * rounds to actually get it dispatched.
8591 ++ */
8592 ++static void bfq_updated_next_req(struct bfq_data *bfqd,
8593 ++ struct bfq_queue *bfqq)
8594 ++{
8595 ++ struct bfq_entity *entity = &bfqq->entity;
8596 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
8597 ++ struct request *next_rq = bfqq->next_rq;
8598 ++ unsigned long new_budget;
8599 ++
8600 ++ if (next_rq == NULL)
8601 ++ return;
8602 ++
8603 ++ if (bfqq == bfqd->active_queue)
8604 ++ /*
8605 ++ * In order not to break guarantees, budgets cannot be
8606 ++ * changed after an entity has been selected.
8607 ++ */
8608 ++ return;
8609 ++
8610 ++ BUG_ON(entity->tree != &st->active);
8611 ++ BUG_ON(entity == entity->sched_data->active_entity);
8612 ++
8613 ++ new_budget = max_t(unsigned long, bfqq->max_budget,
8614 ++ bfq_serv_to_charge(next_rq, bfqq));
8615 ++ entity->budget = new_budget;
8616 ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
8617 ++ bfq_activate_bfqq(bfqd, bfqq);
8618 ++}
8619 ++
8620 ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
8621 ++{
8622 ++ u64 dur;
8623 ++
8624 ++ if (bfqd->bfq_raising_max_time > 0)
8625 ++ return bfqd->bfq_raising_max_time;
8626 ++
8627 ++ dur = bfqd->RT_prod;
8628 ++ do_div(dur, bfqd->peak_rate);
8629 ++
8630 ++ return dur;
8631 ++}
8632 ++
8633 ++static void bfq_add_rq_rb(struct request *rq)
8634 ++{
8635 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8636 ++ struct bfq_entity *entity = &bfqq->entity;
8637 ++ struct bfq_data *bfqd = bfqq->bfqd;
8638 ++ struct request *next_rq, *prev;
8639 ++ unsigned long old_raising_coeff = bfqq->raising_coeff;
8640 ++ int idle_for_long_time = bfqq->budget_timeout +
8641 ++ bfqd->bfq_raising_min_idle_time < jiffies;
8642 ++
8643 ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
8644 ++ bfqq->queued[rq_is_sync(rq)]++;
8645 ++ bfqd->queued++;
8646 ++
8647 ++ elv_rb_add(&bfqq->sort_list, rq);
8648 ++
8649 ++ /*
8650 ++ * Check if this request is a better next-serve candidate.
8651 ++ */
8652 ++ prev = bfqq->next_rq;
8653 ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
8654 ++ BUG_ON(next_rq == NULL);
8655 ++ bfqq->next_rq = next_rq;
8656 ++
8657 ++ /*
8658 ++ * Adjust priority tree position, if next_rq changes.
8659 ++ */
8660 ++ if (prev != bfqq->next_rq)
8661 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
8662 ++
8663 ++ if (!bfq_bfqq_busy(bfqq)) {
8664 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
8665 ++ bfqq->soft_rt_next_start < jiffies;
8666 ++ entity->budget = max_t(unsigned long, bfqq->max_budget,
8667 ++ bfq_serv_to_charge(next_rq, bfqq));
8668 ++
8669 ++		if (!bfqd->low_latency)
8670 ++ goto add_bfqq_busy;
8671 ++
8672 ++ /*
8673 ++ * If the queue is not being boosted and has been idle
8674 ++ * for enough time, start a weight-raising period
8675 ++ */
8676 ++		if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
8677 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
8678 ++ if (idle_for_long_time)
8679 ++ bfqq->raising_cur_max_time =
8680 ++ bfq_wrais_duration(bfqd);
8681 ++ else
8682 ++ bfqq->raising_cur_max_time =
8683 ++ bfqd->bfq_raising_rt_max_time;
8684 ++ bfq_log_bfqq(bfqd, bfqq,
8685 ++				     "wrais starting at %llu msec, "
8686 ++ "rais_max_time %u",
8687 ++ bfqq->last_rais_start_finish,
8688 ++ jiffies_to_msecs(bfqq->
8689 ++ raising_cur_max_time));
8690 ++ } else if (old_raising_coeff > 1) {
8691 ++ if (idle_for_long_time)
8692 ++ bfqq->raising_cur_max_time =
8693 ++ bfq_wrais_duration(bfqd);
8694 ++ else if (bfqq->raising_cur_max_time ==
8695 ++ bfqd->bfq_raising_rt_max_time &&
8696 ++ !soft_rt) {
8697 ++ bfqq->raising_coeff = 1;
8698 ++ bfq_log_bfqq(bfqd, bfqq,
8699 ++					     "wrais ending at %llu msec, "
8700 ++ "rais_max_time %u",
8701 ++ bfqq->last_rais_start_finish,
8702 ++ jiffies_to_msecs(bfqq->
8703 ++ raising_cur_max_time));
8704 ++ }
8705 ++ }
8706 ++ if (old_raising_coeff != bfqq->raising_coeff)
8707 ++ entity->ioprio_changed = 1;
8708 ++add_bfqq_busy:
8709 ++ bfq_add_bfqq_busy(bfqd, bfqq);
8710 ++ } else {
8711 ++		if (bfqd->low_latency && old_raising_coeff == 1 &&
8712 ++ !rq_is_sync(rq) &&
8713 ++ bfqq->last_rais_start_finish +
8714 ++ bfqd->bfq_raising_min_inter_arr_async < jiffies) {
8715 ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
8716 ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
8717 ++
8718 ++ entity->ioprio_changed = 1;
8719 ++ bfq_log_bfqq(bfqd, bfqq,
8720 ++				     "non-idle wrais starting at %llu msec, "
8721 ++ "rais_max_time %u",
8722 ++ bfqq->last_rais_start_finish,
8723 ++ jiffies_to_msecs(bfqq->
8724 ++ raising_cur_max_time));
8725 ++ }
8726 ++ bfq_updated_next_req(bfqd, bfqq);
8727 ++ }
8728 ++
8729 ++	if (bfqd->low_latency &&
8730 ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
8731 ++ idle_for_long_time))
8732 ++ bfqq->last_rais_start_finish = jiffies;
8733 ++}
8734 ++
8735 ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
8736 ++{
8737 ++ elv_rb_del(&bfqq->sort_list, rq);
8738 ++ bfqq->queued[rq_is_sync(rq)]--;
8739 ++ bfqq->bfqd->queued--;
8740 ++ bfq_add_rq_rb(rq);
8741 ++}
8742 ++
8743 ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
8744 ++ struct bio *bio)
8745 ++{
8746 ++ struct task_struct *tsk = current;
8747 ++ struct bfq_io_cq *bic;
8748 ++ struct bfq_queue *bfqq;
8749 ++
8750 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
8751 ++ if (bic == NULL)
8752 ++ return NULL;
8753 ++
8754 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8755 ++ if (bfqq != NULL) {
8756 ++ sector_t sector = bio->bi_sector + bio_sectors(bio);
8757 ++
8758 ++ return elv_rb_find(&bfqq->sort_list, sector);
8759 ++ }
8760 ++
8761 ++ return NULL;
8762 ++}
8763 ++
8764 ++static void bfq_activate_request(struct request_queue *q, struct request *rq)
8765 ++{
8766 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8767 ++
8768 ++ bfqd->rq_in_driver++;
8769 ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
8770 ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
8771 ++ (long long unsigned)bfqd->last_position);
8772 ++}
8773 ++
8774 ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
8775 ++{
8776 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8777 ++
8778 ++ WARN_ON(bfqd->rq_in_driver == 0);
8779 ++ bfqd->rq_in_driver--;
8780 ++}
8781 ++
8782 ++static void bfq_remove_request(struct request *rq)
8783 ++{
8784 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8785 ++ struct bfq_data *bfqd = bfqq->bfqd;
8786 ++
8787 ++ if (bfqq->next_rq == rq) {
8788 ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
8789 ++ bfq_updated_next_req(bfqd, bfqq);
8790 ++ }
8791 ++
8792 ++ list_del_init(&rq->queuelist);
8793 ++ bfq_del_rq_rb(rq);
8794 ++
8795 ++ if (rq->cmd_flags & REQ_META) {
8796 ++ WARN_ON(bfqq->meta_pending == 0);
8797 ++ bfqq->meta_pending--;
8798 ++ }
8799 ++}
8800 ++
8801 ++static int bfq_merge(struct request_queue *q, struct request **req,
8802 ++ struct bio *bio)
8803 ++{
8804 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8805 ++ struct request *__rq;
8806 ++
8807 ++ __rq = bfq_find_rq_fmerge(bfqd, bio);
8808 ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
8809 ++ *req = __rq;
8810 ++ return ELEVATOR_FRONT_MERGE;
8811 ++ }
8812 ++
8813 ++ return ELEVATOR_NO_MERGE;
8814 ++}
8815 ++
8816 ++static void bfq_merged_request(struct request_queue *q, struct request *req,
8817 ++ int type)
8818 ++{
8819 ++ if (type == ELEVATOR_FRONT_MERGE) {
8820 ++ struct bfq_queue *bfqq = RQ_BFQQ(req);
8821 ++
8822 ++ bfq_reposition_rq_rb(bfqq, req);
8823 ++ }
8824 ++}
8825 ++
8826 ++static void bfq_merged_requests(struct request_queue *q, struct request *rq,
8827 ++ struct request *next)
8828 ++{
8829 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8830 ++
8831 ++ /*
8832 ++ * Reposition in fifo if next is older than rq.
8833 ++ */
8834 ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
8835 ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
8836 ++ list_move(&rq->queuelist, &next->queuelist);
8837 ++ rq_set_fifo_time(rq, rq_fifo_time(next));
8838 ++ }
8839 ++
8840 ++ if (bfqq->next_rq == next)
8841 ++ bfqq->next_rq = rq;
8842 ++
8843 ++ bfq_remove_request(next);
8844 ++}
8845 ++
8846 ++/* Must be called with bfqq != NULL */
8847 ++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
8848 ++{
8849 ++ BUG_ON(bfqq == NULL);
8850 ++ bfqq->raising_coeff = 1;
8851 ++ bfqq->raising_cur_max_time = 0;
8852 ++ /* Trigger a weight change on the next activation of the queue */
8853 ++ bfqq->entity.ioprio_changed = 1;
8854 ++}
8855 ++
8856 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
8857 ++ struct bfq_group *bfqg)
8858 ++{
8859 ++ int i, j;
8860 ++
8861 ++ for (i = 0; i < 2; i++)
8862 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
8863 ++ if (bfqg->async_bfqq[i][j] != NULL)
8864 ++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
8865 ++ if (bfqg->async_idle_bfqq != NULL)
8866 ++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
8867 ++}
8868 ++
8869 ++static void bfq_end_raising(struct bfq_data *bfqd)
8870 ++{
8871 ++ struct bfq_queue *bfqq;
8872 ++
8873 ++ spin_lock_irq(bfqd->queue->queue_lock);
8874 ++
8875 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
8876 ++ bfq_bfqq_end_raising(bfqq);
8877 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
8878 ++ bfq_bfqq_end_raising(bfqq);
8879 ++ bfq_end_raising_async(bfqd);
8880 ++
8881 ++ spin_unlock_irq(bfqd->queue->queue_lock);
8882 ++}
8883 ++
8884 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
8885 ++ struct bio *bio)
8886 ++{
8887 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
8888 ++ struct bfq_io_cq *bic;
8889 ++ struct bfq_queue *bfqq;
8890 ++
8891 ++ /*
8892 ++ * Disallow merge of a sync bio into an async request.
8893 ++ */
8894 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
8895 ++ return 0;
8896 ++
8897 ++ /*
8898 ++	 * Look up the bfqq that this bio will be queued with. Allow
8899 ++ * merge only if rq is queued there.
8900 ++ * Queue lock is held here.
8901 ++ */
8902 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
8903 ++ if (bic == NULL)
8904 ++ return 0;
8905 ++
8906 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8907 ++ return bfqq == RQ_BFQQ(rq);
8908 ++}
8909 ++
8910 ++static void __bfq_set_active_queue(struct bfq_data *bfqd,
8911 ++ struct bfq_queue *bfqq)
8912 ++{
8913 ++ if (bfqq != NULL) {
8914 ++ bfq_mark_bfqq_must_alloc(bfqq);
8915 ++ bfq_mark_bfqq_budget_new(bfqq);
8916 ++ bfq_clear_bfqq_fifo_expire(bfqq);
8917 ++
8918 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
8919 ++
8920 ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
8921 ++ bfqq->entity.budget);
8922 ++ }
8923 ++
8924 ++ bfqd->active_queue = bfqq;
8925 ++}
8926 ++
8927 ++/*
8928 ++ * Get and set a new active queue for service.
8929 ++ */
8930 ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
8931 ++ struct bfq_queue *bfqq)
8932 ++{
8933 ++ if (!bfqq)
8934 ++ bfqq = bfq_get_next_queue(bfqd);
8935 ++ else
8936 ++ bfq_get_next_queue_forced(bfqd, bfqq);
8937 ++
8938 ++ __bfq_set_active_queue(bfqd, bfqq);
8939 ++ return bfqq;
8940 ++}
8941 ++
8942 ++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
8943 ++ struct request *rq)
8944 ++{
8945 ++ if (blk_rq_pos(rq) >= bfqd->last_position)
8946 ++ return blk_rq_pos(rq) - bfqd->last_position;
8947 ++ else
8948 ++ return bfqd->last_position - blk_rq_pos(rq);
8949 ++}
8950 ++
8951 ++/*
8952 ++ * Return true if bfqq has no request pending and rq is close enough to
8953 ++ * bfqd->last_position, or if rq is closer to bfqd->last_position than
8954 ++ * bfqq->next_rq
8955 ++ */
8956 ++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
8957 ++{
8958 ++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
8959 ++}
8960 ++
8961 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
8962 ++{
8963 ++ struct rb_root *root = &bfqd->rq_pos_tree;
8964 ++ struct rb_node *parent, *node;
8965 ++ struct bfq_queue *__bfqq;
8966 ++ sector_t sector = bfqd->last_position;
8967 ++
8968 ++ if (RB_EMPTY_ROOT(root))
8969 ++ return NULL;
8970 ++
8971 ++ /*
8972 ++ * First, if we find a request starting at the end of the last
8973 ++ * request, choose it.
8974 ++ */
8975 ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8976 ++ if (__bfqq != NULL)
8977 ++ return __bfqq;
8978 ++
8979 ++ /*
8980 ++ * If the exact sector wasn't found, the parent of the NULL leaf
8981 ++ * will contain the closest sector (rq_pos_tree sorted by next_request
8982 ++ * position).
8983 ++ */
8984 ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8985 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8986 ++ return __bfqq;
8987 ++
8988 ++ if (blk_rq_pos(__bfqq->next_rq) < sector)
8989 ++ node = rb_next(&__bfqq->pos_node);
8990 ++ else
8991 ++ node = rb_prev(&__bfqq->pos_node);
8992 ++ if (node == NULL)
8993 ++ return NULL;
8994 ++
8995 ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8996 ++ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8997 ++ return __bfqq;
8998 ++
8999 ++ return NULL;
9000 ++}
9001 ++
9002 ++/*
9003 ++ * bfqd - obvious
9004 ++ * cur_bfqq - passed in so that we don't decide that the current queue
9005 ++ * is closely cooperating with itself.
9006 ++ *
9007 ++ * We are assuming that cur_bfqq has dispatched at least one request,
9008 ++ * and that bfqd->last_position reflects a position on the disk associated
9009 ++ * with the I/O issued by cur_bfqq.
9010 ++ */
9011 ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
9012 ++ struct bfq_queue *cur_bfqq)
9013 ++{
9014 ++ struct bfq_queue *bfqq;
9015 ++
9016 ++ if (bfq_class_idle(cur_bfqq))
9017 ++ return NULL;
9018 ++ if (!bfq_bfqq_sync(cur_bfqq))
9019 ++ return NULL;
9020 ++ if (BFQQ_SEEKY(cur_bfqq))
9021 ++ return NULL;
9022 ++
9023 ++ /* If device has only one backlogged bfq_queue, don't search. */
9024 ++ if (bfqd->busy_queues == 1)
9025 ++ return NULL;
9026 ++
9027 ++ /*
9028 ++ * We should notice if some of the queues are cooperating, e.g.
9029 ++ * working closely on the same area of the disk. In that case,
9030 ++	 * we can group them together and avoid wasting time idling.
9031 ++ */
9032 ++ bfqq = bfqq_close(bfqd);
9033 ++ if (bfqq == NULL || bfqq == cur_bfqq)
9034 ++ return NULL;
9035 ++
9036 ++ /*
9037 ++ * Do not merge queues from different bfq_groups.
9038 ++ */
9039 ++ if (bfqq->entity.parent != cur_bfqq->entity.parent)
9040 ++ return NULL;
9041 ++
9042 ++ /*
9043 ++ * It only makes sense to merge sync queues.
9044 ++ */
9045 ++ if (!bfq_bfqq_sync(bfqq))
9046 ++ return NULL;
9047 ++ if (BFQQ_SEEKY(bfqq))
9048 ++ return NULL;
9049 ++
9050 ++ /*
9051 ++ * Do not merge queues of different priority classes.
9052 ++ */
9053 ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
9054 ++ return NULL;
9055 ++
9056 ++ return bfqq;
9057 ++}
9058 ++
9059 ++/*
9060 ++ * If enough samples have been computed, return the current max budget
9061 ++ * stored in bfqd, which is dynamically updated according to the
9062 ++ * estimated disk peak rate; otherwise return the default max budget
9063 ++ */
9064 ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
9065 ++{
9066 ++ if (bfqd->budgets_assigned < 194)
9067 ++ return bfq_default_max_budget;
9068 ++ else
9069 ++ return bfqd->bfq_max_budget;
9070 ++}
9071 ++
9072 ++/*
9073 ++ * Return min budget, which is a fraction of the current or default
9074 ++ * max budget (trying with 1/32)
9075 ++ */
9076 ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
9077 ++{
9078 ++ if (bfqd->budgets_assigned < 194)
9079 ++ return bfq_default_max_budget / 32;
9080 ++ else
9081 ++ return bfqd->bfq_max_budget / 32;
9082 ++}
9083 ++
9084 ++/*
9085 ++ * Decides whether idling should be done for given device and
9086 ++ * given active queue.
9087 ++ */
9088 ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
9089 ++ struct bfq_queue *active_bfqq)
9090 ++{
9091 ++ if (active_bfqq == NULL)
9092 ++ return false;
9093 ++ /*
9094 ++	 * If the device is an SSD it has no seek penalty: disable idling, but
9095 ++ * do so only if:
9096 ++ * - device does not support queuing, otherwise we still have
9097 ++ * a problem with sync vs async workloads;
9098 ++ * - the queue is not weight-raised, to preserve guarantees.
9099 ++ */
9100 ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
9101 ++ active_bfqq->raising_coeff == 1);
9102 ++}
9103 ++
9104 ++static void bfq_arm_slice_timer(struct bfq_data *bfqd)
9105 ++{
9106 ++ struct bfq_queue *bfqq = bfqd->active_queue;
9107 ++ struct bfq_io_cq *bic;
9108 ++ unsigned long sl;
9109 ++
9110 ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
9111 ++
9112 ++ /* Tasks have exited, don't wait. */
9113 ++ bic = bfqd->active_bic;
9114 ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
9115 ++ return;
9116 ++
9117 ++ bfq_mark_bfqq_wait_request(bfqq);
9118 ++
9119 ++ /*
9120 ++ * We don't want to idle for seeks, but we do want to allow
9121 ++ * fair distribution of slice time for a process doing back-to-back
9122 ++	 * seeks. So allow a little bit of time for it to submit a new rq.
9123 ++ *
9124 ++ * To prevent processes with (partly) seeky workloads from
9125 ++ * being too ill-treated, grant them a small fraction of the
9126 ++ * assigned budget before reducing the waiting time to
9127 ++ * BFQ_MIN_TT. This happened to help reduce latency.
9128 ++ */
9129 ++ sl = bfqd->bfq_slice_idle;
9130 ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
9131 ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
9132 ++ bfqq->raising_coeff == 1)
9133 ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
9134 ++ else if (bfqq->raising_coeff > 1)
9135 ++ sl = sl * 3;
9136 ++ bfqd->last_idling_start = ktime_get();
9137 ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
9138 ++ bfq_log(bfqd, "arm idle: %u/%u ms",
9139 ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
9140 ++}
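++/*
++ * Worked example (illustrative): with the default bfq_slice_idle of
++ * HZ / 125 (about 8 ms), a weight-raised queue is allowed to idle for up to
++ * three times that, about 24 ms, while a non-raised seeky queue that has
++ * already received more than 1/8 of the max budget has its idle window
++ * clamped to BFQ_MIN_TT, i.e. 2 ms.
++ */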
9141 ++
9142 ++/*
9143 ++ * Set the maximum time for the active queue to consume its
9144 ++ * budget. This prevents seeky processes from lowering the disk
9145 ++ * throughput (always guaranteed with a time slice scheme as in CFQ).
9146 ++ */
9147 ++static void bfq_set_budget_timeout(struct bfq_data *bfqd)
9148 ++{
9149 ++ struct bfq_queue *bfqq = bfqd->active_queue;
9150 ++ unsigned int timeout_coeff;
9151 ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
9152 ++ timeout_coeff = 1;
9153 ++ else
9154 ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
9155 ++
9156 ++ bfqd->last_budget_start = ktime_get();
9157 ++
9158 ++ bfq_clear_bfqq_budget_new(bfqq);
9159 ++ bfqq->budget_timeout = jiffies +
9160 ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
9161 ++
9162 ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
9163 ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
9164 ++ timeout_coeff));
9165 ++}
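++/*
++ * Example of the scaling above (illustrative): with
++ * timeout_coeff = weight / orig_weight, a queue whose weight is currently
++ * raised to, say, 10 times its original weight gets a budget timeout 10
++ * times the base bfq_timeout value, whereas during the soft real-time
++ * raising period the coefficient stays at 1.
++ */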
9166 ++
9167 ++/*
9168 ++ * Move request from internal lists to the request queue dispatch list.
9169 ++ */
9170 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
9171 ++{
9172 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9173 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9174 ++
9175 ++ bfq_remove_request(rq);
9176 ++ bfqq->dispatched++;
9177 ++ elv_dispatch_sort(q, rq);
9178 ++
9179 ++ if (bfq_bfqq_sync(bfqq))
9180 ++ bfqd->sync_flight++;
9181 ++}
9182 ++
9183 ++/*
9184 ++ * Return expired entry, or NULL to just start from scratch in rbtree.
9185 ++ */
9186 ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
9187 ++{
9188 ++ struct request *rq = NULL;
9189 ++
9190 ++ if (bfq_bfqq_fifo_expire(bfqq))
9191 ++ return NULL;
9192 ++
9193 ++ bfq_mark_bfqq_fifo_expire(bfqq);
9194 ++
9195 ++ if (list_empty(&bfqq->fifo))
9196 ++ return NULL;
9197 ++
9198 ++ rq = rq_entry_fifo(bfqq->fifo.next);
9199 ++
9200 ++ if (time_before(jiffies, rq_fifo_time(rq)))
9201 ++ return NULL;
9202 ++
9203 ++ return rq;
9204 ++}
9205 ++
9206 ++/*
9207 ++ * Must be called with the queue_lock held.
9208 ++ */
9209 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
9210 ++{
9211 ++ int process_refs, io_refs;
9212 ++
9213 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
9214 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
9215 ++ BUG_ON(process_refs < 0);
9216 ++ return process_refs;
9217 ++}
9218 ++
9219 ++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
9220 ++{
9221 ++ int process_refs, new_process_refs;
9222 ++ struct bfq_queue *__bfqq;
9223 ++
9224 ++ /*
9225 ++ * If there are no process references on the new_bfqq, then it is
9226 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
9227 ++ * may have dropped their last reference (not just their last process
9228 ++ * reference).
9229 ++ */
9230 ++ if (!bfqq_process_refs(new_bfqq))
9231 ++ return;
9232 ++
9233 ++ /* Avoid a circular list and skip interim queue merges. */
9234 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
9235 ++ if (__bfqq == bfqq)
9236 ++ return;
9237 ++ new_bfqq = __bfqq;
9238 ++ }
9239 ++
9240 ++ process_refs = bfqq_process_refs(bfqq);
9241 ++ new_process_refs = bfqq_process_refs(new_bfqq);
9242 ++ /*
9243 ++ * If the process for the bfqq has gone away, there is no
9244 ++ * sense in merging the queues.
9245 ++ */
9246 ++ if (process_refs == 0 || new_process_refs == 0)
9247 ++ return;
9248 ++
9249 ++ /*
9250 ++ * Merge in the direction of the lesser amount of work.
9251 ++ */
9252 ++ if (new_process_refs >= process_refs) {
9253 ++ bfqq->new_bfqq = new_bfqq;
9254 ++ atomic_add(process_refs, &new_bfqq->ref);
9255 ++ } else {
9256 ++ new_bfqq->new_bfqq = bfqq;
9257 ++ atomic_add(new_process_refs, &bfqq->ref);
9258 ++ }
9259 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
9260 ++ new_bfqq->pid);
9261 ++}
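++/*
++ * Example of the bookkeeping above (illustrative): if bfqq currently holds
++ * one process reference and new_bfqq holds three, then new_process_refs >=
++ * process_refs, so bfqq->new_bfqq is set to new_bfqq and new_bfqq gains
++ * bfqq's single process reference; this function only records the intended
++ * merge by setting ->new_bfqq and adjusting reference counts.
++ */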
9262 ++
9263 ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
9264 ++{
9265 ++ struct bfq_entity *entity = &bfqq->entity;
9266 ++ return entity->budget - entity->service;
9267 ++}
9268 ++
9269 ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9270 ++{
9271 ++ BUG_ON(bfqq != bfqd->active_queue);
9272 ++
9273 ++ __bfq_bfqd_reset_active(bfqd);
9274 ++
9275 ++ /*
9276 ++ * If this bfqq is shared between multiple processes, check
9277 ++ * to make sure that those processes are still issuing I/Os
9278 ++ * within the mean seek distance. If not, it may be time to
9279 ++ * break the queues apart again.
9280 ++ */
9281 ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
9282 ++ bfq_mark_bfqq_split_coop(bfqq);
9283 ++
9284 ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
9285 ++ /*
9286 ++		 * overloading the budget_timeout field to store the time
9287 ++		 * at which the queue is left with no backlog; used by
9288 ++		 * the weight-raising mechanism
9289 ++ */
9290 ++		bfqq->budget_timeout = jiffies;
9291 ++ bfq_del_bfqq_busy(bfqd, bfqq, 1);
9292 ++ } else {
9293 ++ bfq_activate_bfqq(bfqd, bfqq);
9294 ++ /*
9295 ++ * Resort priority tree of potential close cooperators.
9296 ++ */
9297 ++ bfq_rq_pos_tree_add(bfqd, bfqq);
9298 ++ }
9299 ++}
9300 ++
9301 ++/**
9302 ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
9303 ++ * @bfqd: device data.
9304 ++ * @bfqq: queue to update.
9305 ++ * @reason: reason for expiration.
9306 ++ *
9307 ++ * Handle the feedback on @bfqq budget. See the body for detailed
9308 ++ * comments.
9309 ++ */
9310 ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
9311 ++ struct bfq_queue *bfqq,
9312 ++ enum bfqq_expiration reason)
9313 ++{
9314 ++ struct request *next_rq;
9315 ++ unsigned long budget, min_budget;
9316 ++
9317 ++ budget = bfqq->max_budget;
9318 ++ min_budget = bfq_min_budget(bfqd);
9319 ++
9320 ++ BUG_ON(bfqq != bfqd->active_queue);
9321 ++
9322 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
9323 ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
9324 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
9325 ++ budget, bfq_min_budget(bfqd));
9326 ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
9327 ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue));
9328 ++
9329 ++ if (bfq_bfqq_sync(bfqq)) {
9330 ++ switch (reason) {
9331 ++ /*
9332 ++ * Caveat: in all the following cases we trade latency
9333 ++ * for throughput.
9334 ++ */
9335 ++ case BFQ_BFQQ_TOO_IDLE:
9336 ++ /*
9337 ++ * This is the only case where we may reduce
9338 ++			 * the budget: if there are no requests of the
9339 ++ * process still waiting for completion, then
9340 ++ * we assume (tentatively) that the timer has
9341 ++ * expired because the batch of requests of
9342 ++ * the process could have been served with a
9343 ++			 * smaller budget. Hence, betting that the
9344 ++ * process will behave in the same way when it
9345 ++ * becomes backlogged again, we reduce its
9346 ++ * next budget. As long as we guess right,
9347 ++ * this budget cut reduces the latency
9348 ++ * experienced by the process.
9349 ++ *
9350 ++ * However, if there are still outstanding
9351 ++ * requests, then the process may have not yet
9352 ++ * issued its next request just because it is
9353 ++ * still waiting for the completion of some of
9354 ++			 * the still outstanding ones. So in this
9355 ++ * subcase we do not reduce its budget, on the
9356 ++ * contrary we increase it to possibly boost
9357 ++ * the throughput, as discussed in the
9358 ++ * comments to the BUDGET_TIMEOUT case.
9359 ++ */
9360 ++			if (bfqq->dispatched > 0) /* still outstanding reqs */
9361 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
9362 ++ else {
9363 ++ if (budget > 5 * min_budget)
9364 ++ budget -= 4 * min_budget;
9365 ++ else
9366 ++ budget = min_budget;
9367 ++ }
9368 ++ break;
9369 ++ case BFQ_BFQQ_BUDGET_TIMEOUT:
9370 ++ /*
9371 ++ * We double the budget here because: 1) it
9372 ++ * gives the chance to boost the throughput if
9373 ++ * this is not a seeky process (which may have
9374 ++ * bumped into this timeout because of, e.g.,
9375 ++ * ZBR), 2) together with charge_full_budget
9376 ++ * it helps give seeky processes higher
9377 ++ * timestamps, and hence be served less
9378 ++ * frequently.
9379 ++ */
9380 ++ budget = min(budget * 2, bfqd->bfq_max_budget);
9381 ++ break;
9382 ++ case BFQ_BFQQ_BUDGET_EXHAUSTED:
9383 ++ /*
9384 ++ * The process still has backlog, and did not
9385 ++ * let either the budget timeout or the disk
9386 ++ * idling timeout expire. Hence it is not
9387 ++ * seeky, has a short thinktime and may be
9388 ++ * happy with a higher budget too. So
9389 ++ * definitely increase the budget of this good
9390 ++ * candidate to boost the disk throughput.
9391 ++ */
9392 ++ budget = min(budget * 4, bfqd->bfq_max_budget);
9393 ++ break;
9394 ++ case BFQ_BFQQ_NO_MORE_REQUESTS:
9395 ++ /*
9396 ++ * Leave the budget unchanged.
9397 ++ */
9398 ++ default:
9399 ++ return;
9400 ++ }
9401 ++ } else /* async queue */
9402 ++		/* async queues always get the maximum possible budget
9403 ++ * (their ability to dispatch is limited by
9404 ++ * @bfqd->bfq_max_budget_async_rq).
9405 ++ */
9406 ++ budget = bfqd->bfq_max_budget;
9407 ++
9408 ++ bfqq->max_budget = budget;
9409 ++
9410 ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
9411 ++ bfqq->max_budget > bfqd->bfq_max_budget)
9412 ++ bfqq->max_budget = bfqd->bfq_max_budget;
9413 ++
9414 ++ /*
9415 ++ * Make sure that we have enough budget for the next request.
9416 ++ * Since the finish time of the bfqq must be kept in sync with
9417 ++ * the budget, be sure to call __bfq_bfqq_expire() after the
9418 ++ * update.
9419 ++ */
9420 ++ next_rq = bfqq->next_rq;
9421 ++ if (next_rq != NULL)
9422 ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
9423 ++ bfq_serv_to_charge(next_rq, bfqq));
9424 ++ else
9425 ++ bfqq->entity.budget = bfqq->max_budget;
9426 ++
9427 ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
9428 ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
9429 ++ bfqq->entity.budget);
9430 ++}
9431 ++
9432 ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
9433 ++{
9434 ++ unsigned long max_budget;
9435 ++
9436 ++ /*
9437 ++ * The max_budget calculated when autotuning is equal to the
9438 ++	 * number of sectors transferred in timeout_sync at the
9439 ++ * estimated peak rate.
9440 ++ */
9441 ++ max_budget = (unsigned long)(peak_rate * 1000 *
9442 ++ timeout >> BFQ_RATE_SHIFT);
9443 ++
9444 ++ return max_budget;
9445 ++}
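++/*
++ * Worked example (illustrative arithmetic): peak_rate is kept in
++ * sectors/usec left-shifted by BFQ_RATE_SHIFT and timeout is in ms, so the
++ * expression above yields plain sectors. Taking, hypothetically, the
++ * reference rate R_rot = 17415 as the estimated peak rate and the default
++ * sync timeout of about 125 ms (HZ / 8), the autotuned budget would be
++ * roughly (17415 * 1000 * 125) >> 16, i.e. about 33000 sectors (~16 MiB).
++ */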
9446 ++
9447 ++/*
9448 ++ * In addition to updating the peak rate, checks whether the process
9449 ++ * is "slow", and returns 1 if so. This slow flag is used, in addition
9450 ++ * to the budget timeout, to reduce the amount of service provided to
9451 ++ * seeky processes, and hence reduce their chances to lower the
9452 ++ * throughput. See the code for more details.
9453 ++ */
9454 ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9455 ++ int compensate, enum bfqq_expiration reason)
9456 ++{
9457 ++ u64 bw, usecs, expected, timeout;
9458 ++ ktime_t delta;
9459 ++ int update = 0;
9460 ++
9461 ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
9462 ++ return 0;
9463 ++
9464 ++ if (compensate)
9465 ++ delta = bfqd->last_idling_start;
9466 ++ else
9467 ++ delta = ktime_get();
9468 ++ delta = ktime_sub(delta, bfqd->last_budget_start);
9469 ++ usecs = ktime_to_us(delta);
9470 ++
9471 ++ /* Don't trust short/unrealistic values. */
9472 ++ if (usecs < 100 || usecs >= LONG_MAX)
9473 ++ return 0;
9474 ++
9475 ++ /*
9476 ++ * Calculate the bandwidth for the last slice. We use a 64 bit
9477 ++ * value to store the peak rate, in sectors per usec in fixed
9478 ++ * point math. We do so to have enough precision in the estimate
9479 ++ * and to avoid overflows.
9480 ++ */
9481 ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
9482 ++ do_div(bw, (unsigned long)usecs);
9483 ++
9484 ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
9485 ++
9486 ++ /*
9487 ++ * Use only long (> 20ms) intervals to filter out spikes for
9488 ++ * the peak rate estimation.
9489 ++ */
9490 ++ if (usecs > 20000) {
9491 ++ if (bw > bfqd->peak_rate ||
9492 ++ (!BFQQ_SEEKY(bfqq) &&
9493 ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
9494 ++ bfq_log(bfqd, "measured bw =%llu", bw);
9495 ++ /*
9496 ++ * To smooth oscillations use a low-pass filter with
9497 ++ * alpha=7/8, i.e.,
9498 ++ * new_rate = (7/8) * old_rate + (1/8) * bw
9499 ++ */
9500 ++ do_div(bw, 8);
9501 ++ if (bw == 0)
9502 ++ return 0;
9503 ++ bfqd->peak_rate *= 7;
9504 ++ do_div(bfqd->peak_rate, 8);
9505 ++ bfqd->peak_rate += bw;
9506 ++ update = 1;
9507 ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
9508 ++ }
9509 ++
9510 ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
9511 ++
9512 ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
9513 ++ bfqd->peak_rate_samples++;
9514 ++
9515 ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
9516 ++ update && bfqd->bfq_user_max_budget == 0) {
9517 ++ bfqd->bfq_max_budget =
9518 ++ bfq_calc_max_budget(bfqd->peak_rate, timeout);
9519 ++ bfq_log(bfqd, "new max_budget=%lu",
9520 ++ bfqd->bfq_max_budget);
9521 ++ }
9522 ++ }
9523 ++
9524 ++ /*
9525 ++	 * If the process has been served for too short a time
9526 ++ * interval to let its possible sequential accesses prevail on
9527 ++ * the initial seek time needed to move the disk head on the
9528 ++ * first sector it requested, then give the process a chance
9529 ++ * and for the moment return false.
9530 ++ */
9531 ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
9532 ++ return 0;
9533 ++
9534 ++ /*
9535 ++ * A process is considered ``slow'' (i.e., seeky, so that we
9536 ++ * cannot treat it fairly in the service domain, as it would
9537 ++ * slow down the other processes too much) if, when a slice
9538 ++ * ends for whatever reason, it has received service at a
9539 ++ * rate that would not be high enough to complete the budget
9540 ++ * before the budget timeout expiration.
9541 ++ */
9542 ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
9543 ++
9544 ++ /*
9545 ++ * Caveat: processes doing IO in the slower disk zones will
9546 ++ * tend to be slow(er) even if not seeky. And the estimated
9547 ++ * peak rate will actually be an average over the disk
9548 ++ * surface. Hence, to not be too harsh with unlucky processes,
9549 ++ * we keep a budget/3 margin of safety before declaring a
9550 ++ * process slow.
9551 ++ */
9552 ++ return expected > (4 * bfqq->entity.budget) / 3;
9553 ++}
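++
++/*
++ * Worked example of the low-pass filter above (illustrative numbers, not
++ * taken from the patch): with an old peak_rate of 800 and a measured bw
++ * of 1600, both in the same fixed-point units, the update computes
++ * 800 * 7 / 8 + 1600 / 8 = 700 + 200 = 900, so a single sample moves the
++ * estimate only 1/8 of the way towards the new measurement.
++ */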
9554 ++
9555 ++/**
9556 ++ * bfq_bfqq_expire - expire a queue.
9557 ++ * @bfqd: device owning the queue.
9558 ++ * @bfqq: the queue to expire.
9559 ++ * @compensate: if true, compensate for the time spent idling.
9560 ++ * @reason: the reason causing the expiration.
9561 ++ *
9563 ++ * If the process associated to the queue is slow (i.e., seeky), or in
9564 ++ * case of budget timeout, or, finally, if it is async, we
9565 ++ * artificially charge it an entire budget (independently of the
9566 ++ * actual service it received). As a consequence, the queue will get
9567 ++ * higher timestamps than the correct ones upon reactivation, and
9568 ++ * hence it will be rescheduled as if it had received more service
9569 ++ * than what it actually received. In the end, this class of processes
9570 ++ * will receive less service in proportion to how slowly they consume
9571 ++ * their budgets (and hence how seriously they tend to lower the
9572 ++ * throughput).
9573 ++ *
9574 ++ * In contrast, when a queue expires because it has been idling for
9575 ++ * too much or because it exhausted its budget, we do not touch the
9576 ++ * too long or because it exhausted its budget, we do not touch the
9577 ++ * reactivated and its timestamps updated, the latter will be in sync
9578 ++ * with the actual service received by the queue until expiration.
9579 ++ *
9580 ++ * Charging a full budget to the first type of queues and the exact
9581 ++ * service to the others has the effect of using the WF2Q+ policy to
9582 ++ * schedule the former on a timeslice basis, without violating the
9583 ++ * service domain guarantees of the latter.
9584 ++ */
9585 ++static void bfq_bfqq_expire(struct bfq_data *bfqd,
9586 ++ struct bfq_queue *bfqq,
9587 ++ int compensate,
9588 ++ enum bfqq_expiration reason)
9589 ++{
9590 ++ int slow;
9591 ++ BUG_ON(bfqq != bfqd->active_queue);
9592 ++
9593 ++ /* Update disk peak rate for autotuning and check whether the
9594 ++ * process is slow (see bfq_update_peak_rate).
9595 ++ */
9596 ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
9597 ++
9598 ++ /*
9599 ++ * As explained above, 'punish' slow (i.e., seeky), timed-out
9600 ++ * and async queues, to favor sequential sync workloads.
9601 ++ *
9602 ++ * Processes doing IO in the slower disk zones will tend to be
9603 ++ * slow(er) even if not seeky. Hence, since the estimated peak
9604 ++ * rate is actually an average over the disk surface, these
9605 ++ * processes may timeout just for bad luck. To avoid punishing
9606 ++ * them we do not charge a full budget to a process that
9607 ++ * succeeded in consuming at least 2/3 of its budget.
9608 ++ */
9609 ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
9610 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
9611 ++ bfq_bfqq_charge_full_budget(bfqq);
9612 ++
9613 ++ if (bfqd->low_latency && bfqq->raising_coeff == 1)
9614 ++ bfqq->last_rais_start_finish = jiffies;
9615 ++
9616 ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
9617 ++		if (reason != BFQ_BFQQ_BUDGET_TIMEOUT)
9618 ++ bfqq->soft_rt_next_start =
9619 ++ jiffies +
9620 ++ HZ * bfqq->entity.service /
9621 ++ bfqd->bfq_raising_max_softrt_rate;
9622 ++ else
9623 ++ bfqq->soft_rt_next_start = -1; /* infinity */
9624 ++ }
9625 ++ bfq_log_bfqq(bfqd, bfqq,
9626 ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
9627 ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
9628 ++
9629 ++ /* Increase, decrease or leave budget unchanged according to reason */
9630 ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
9631 ++ __bfq_bfqq_expire(bfqd, bfqq);
9632 ++}
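++
++/*
++ * Note on the soft_rt_next_start computation above: with the default
++ * bfq_raising_max_softrt_rate of 7000 sectors/sec set in bfq_init_queue()
++ * and, say, 2048 sectors of service received (an assumed figure), the
++ * queue becomes eligible again for soft real-time weight-raising about
++ * HZ * 2048 / 7000 jiffies, i.e. roughly 0.3 seconds, after expiration.
++ */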
9633 ++
9634 ++/*
9635 ++ * Budget timeout is not implemented through a dedicated timer, but
9636 ++ * just checked on request arrivals and completions, as well as on
9637 ++ * idle timer expirations.
9638 ++ */
9639 ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
9640 ++{
9641 ++ if (bfq_bfqq_budget_new(bfqq))
9642 ++ return 0;
9643 ++
9644 ++ if (time_before(jiffies, bfqq->budget_timeout))
9645 ++ return 0;
9646 ++
9647 ++ return 1;
9648 ++}
9649 ++
9650 ++/*
9651 ++ * If we expire a queue that is waiting for the arrival of a new
9652 ++ * request, we may prevent the fictitious timestamp backshifting that
9653 ++ * allows the guarantees of the queue to be preserved (see [1] for
9654 ++ * this tricky aspect). Hence we return true only if this condition
9655 ++ * does not hold, or if the queue is slow enough to deserve only to be
9656 ++ * kicked off for preserving a high throughput.
9657 ++ */
9658 ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
9659 ++{
9660 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
9661 ++ "may_budget_timeout: wr %d left %d timeout %d",
9662 ++ bfq_bfqq_wait_request(bfqq),
9663 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
9664 ++ bfq_bfqq_budget_timeout(bfqq));
9665 ++
9666 ++ return (!bfq_bfqq_wait_request(bfqq) ||
9667 ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
9668 ++ &&
9669 ++ bfq_bfqq_budget_timeout(bfqq);
9670 ++}
9671 ++
9672 ++/*
9673 ++ * If the active queue is empty, but it is sync and either of the following
9674 ++ * conditions holds, then: 1) the queue must remain active and cannot be
9675 ++ * expired, and 2) the disk must be idled to wait for the possible arrival
9676 ++ * of a new request for the queue. The conditions are:
9677 ++ * - the device is rotational and not performing NCQ, and the queue has its
9678 ++ * idle window set (in this case, waiting for a new request for the queue
9679 ++ * is likely to boost the disk throughput);
9680 ++ * - the queue is weight-raised (waiting for the request is necessary for
9681 ++ * providing the queue with fairness and latency guarantees).
9682 ++ */
9683 ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
9684 ++ int budg_timeout)
9685 ++{
9686 ++ struct bfq_data *bfqd = bfqq->bfqd;
9687 ++
9688 ++ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
9689 ++ bfqd->bfq_slice_idle != 0 &&
9690 ++ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag &&
9691 ++ !blk_queue_nonrot(bfqd->queue))
9692 ++ || bfqq->raising_coeff > 1) &&
9693 ++ (bfqd->rq_in_driver == 0 ||
9694 ++ budg_timeout ||
9695 ++ bfqq->raising_coeff > 1) &&
9696 ++ !bfq_close_cooperator(bfqd, bfqq) &&
9697 ++ (!bfq_bfqq_coop(bfqq) ||
9698 ++ !bfq_bfqq_some_coop_idle(bfqq)) &&
9699 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq));
9700 ++}
9701 ++
9702 ++/*
9703 ++ * Select a queue for service. If we have a current active queue,
9704 ++ * check whether to continue servicing it, or retrieve and set a new one.
9705 ++ */
9706 ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
9707 ++{
9708 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
9709 ++ struct request *next_rq;
9710 ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9711 ++ int budg_timeout;
9712 ++
9713 ++ bfqq = bfqd->active_queue;
9714 ++ if (bfqq == NULL)
9715 ++ goto new_queue;
9716 ++
9717 ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
9718 ++
9719 ++ /*
9720 ++ * If another queue has a request waiting within our mean seek
9721 ++ * distance, let it run. The expire code will check for close
9722 ++ * cooperators and put the close queue at the front of the
9723 ++ * service tree. If possible, merge the expiring queue with the
9724 ++ * new bfqq.
9725 ++ */
9726 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
9727 ++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
9728 ++ bfq_setup_merge(bfqq, new_bfqq);
9729 ++
9730 ++ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
9731 ++ if (budg_timeout &&
9732 ++ !bfq_bfqq_must_idle(bfqq, budg_timeout))
9733 ++ goto expire;
9734 ++
9735 ++ next_rq = bfqq->next_rq;
9736 ++ /*
9737 ++ * If bfqq has requests queued and it has enough budget left to
9738 ++ * serve them, keep the queue, otherwise expire it.
9739 ++ */
9740 ++ if (next_rq != NULL) {
9741 ++ if (bfq_serv_to_charge(next_rq, bfqq) >
9742 ++ bfq_bfqq_budget_left(bfqq)) {
9743 ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
9744 ++ goto expire;
9745 ++ } else {
9746 ++ /*
9747 ++ * The idle timer may be pending because we may not
9748 ++ * disable disk idling even when a new request arrives
9749 ++ */
9750 ++ if (timer_pending(&bfqd->idle_slice_timer)) {
9751 ++ /*
9752 ++ * If we get here: 1) at least a new request
9753 ++ * has arrived but we have not disabled the
9754 ++ * timer because the request was too small,
9755 ++ * 2) then the block layer has unplugged the
9756 ++ * device, causing the dispatch to be invoked.
9757 ++ *
9758 ++ * Since the device is unplugged, now the
9759 ++ * requests are probably large enough to
9760 ++ * provide a reasonable throughput.
9761 ++ * So we disable idling.
9762 ++ */
9763 ++ bfq_clear_bfqq_wait_request(bfqq);
9764 ++ del_timer(&bfqd->idle_slice_timer);
9765 ++ }
9766 ++ if (new_bfqq == NULL)
9767 ++ goto keep_queue;
9768 ++ else
9769 ++ goto expire;
9770 ++ }
9771 ++ }
9772 ++
9773 ++ /*
9774 ++ * No requests pending. If there is no cooperator, and the active
9775 ++ * queue still has requests in flight or is idling for a new request,
9776 ++ * then keep it.
9777 ++ */
9778 ++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
9779 ++ (bfqq->dispatched != 0 &&
9780 ++ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
9781 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
9782 ++ bfqq = NULL;
9783 ++ goto keep_queue;
9784 ++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
9785 ++ /*
9786 ++		 * The queue is expiring because there is a close cooperator;
9787 ++		 * cancel the timer.
9788 ++ */
9789 ++ bfq_clear_bfqq_wait_request(bfqq);
9790 ++ del_timer(&bfqd->idle_slice_timer);
9791 ++ }
9792 ++
9793 ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
9794 ++expire:
9795 ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
9796 ++new_queue:
9797 ++ bfqq = bfq_set_active_queue(bfqd, new_bfqq);
9798 ++ bfq_log(bfqd, "select_queue: new queue %d returned",
9799 ++ bfqq != NULL ? bfqq->pid : 0);
9800 ++keep_queue:
9801 ++ return bfqq;
9802 ++}
9803 ++
9804 ++static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9805 ++{
9806 ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
9807 ++ struct bfq_entity *entity = &bfqq->entity;
9808 ++
9809 ++ bfq_log_bfqq(bfqd, bfqq,
9810 ++ "raising period dur %u/%u msec, "
9811 ++ "old raising coeff %u, w %d(%d)",
9812 ++ jiffies_to_msecs(jiffies -
9813 ++ bfqq->last_rais_start_finish),
9814 ++ jiffies_to_msecs(bfqq->raising_cur_max_time),
9815 ++ bfqq->raising_coeff,
9816 ++ bfqq->entity.weight, bfqq->entity.orig_weight);
9817 ++
9818 ++ BUG_ON(bfqq != bfqd->active_queue && entity->weight !=
9819 ++ entity->orig_weight * bfqq->raising_coeff);
9820 ++		if (entity->ioprio_changed)
9821 ++ bfq_log_bfqq(bfqd, bfqq,
9822 ++ "WARN: pending prio change");
9823 ++ /*
9824 ++ * If too much time has elapsed from the beginning
9825 ++ * of this weight-raising period and process is not soft
9826 ++		 * of this weight-raising period and the process is not soft
9827 ++ */
9828 ++ if (jiffies - bfqq->last_rais_start_finish >
9829 ++ bfqq->raising_cur_max_time) {
9830 ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
9831 ++ bfqq->soft_rt_next_start < jiffies;
9832 ++
9833 ++ bfqq->last_rais_start_finish = jiffies;
9834 ++ if (soft_rt)
9835 ++ bfqq->raising_cur_max_time =
9836 ++ bfqd->bfq_raising_rt_max_time;
9837 ++ else {
9838 ++ bfq_log_bfqq(bfqd, bfqq,
9839 ++					"wrais ending at %llu msec, "
9840 ++ "rais_max_time %u",
9841 ++ bfqq->last_rais_start_finish,
9842 ++ jiffies_to_msecs(bfqq->
9843 ++ raising_cur_max_time));
9844 ++ bfq_bfqq_end_raising(bfqq);
9845 ++ __bfq_entity_update_weight_prio(
9846 ++ bfq_entity_service_tree(entity),
9847 ++ entity);
9848 ++ }
9849 ++ }
9850 ++ }
9851 ++}
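++
++/*
++ * For reference, with the defaults set in bfq_init_queue() below
++ * (bfq_raising_coeff == 20), a weight-raised queue is scheduled with
++ * twenty times its original weight; once raising_cur_max_time has elapsed,
++ * the check above calls bfq_bfqq_end_raising(), unless the queue qualifies
++ * as soft real-time, and the weight falls back to entity->orig_weight.
++ */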
9852 ++
9853 ++/*
9854 ++ * Dispatch one request from bfqq, moving it to the request queue
9855 ++ * dispatch list.
9856 ++ */
9857 ++static int bfq_dispatch_request(struct bfq_data *bfqd,
9858 ++ struct bfq_queue *bfqq)
9859 ++{
9860 ++ int dispatched = 0;
9861 ++ struct request *rq;
9862 ++ unsigned long service_to_charge;
9863 ++
9864 ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
9865 ++
9866 ++ /* Follow expired path, else get first next available. */
9867 ++ rq = bfq_check_fifo(bfqq);
9868 ++ if (rq == NULL)
9869 ++ rq = bfqq->next_rq;
9870 ++ service_to_charge = bfq_serv_to_charge(rq, bfqq);
9871 ++
9872 ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
9873 ++ /*
9874 ++ * This may happen if the next rq is chosen
9875 ++ * in fifo order instead of sector order.
9876 ++ * The budget is properly dimensioned
9877 ++ * to be always sufficient to serve the next request
9878 ++ * only if it is chosen in sector order. The reason is
9879 ++		 * that it would be quite inefficient and of little use
9880 ++ * to always make sure that the budget is large enough
9881 ++ * to serve even the possible next rq in fifo order.
9882 ++ * In fact, requests are seldom served in fifo order.
9883 ++ *
9884 ++ * Expire the queue for budget exhaustion, and
9885 ++ * make sure that the next act_budget is enough
9886 ++ * to serve the next request, even if it comes
9887 ++ * from the fifo expired path.
9888 ++ */
9889 ++ bfqq->next_rq = rq;
9890 ++ /*
9891 ++		 * Since this dispatch has failed, make sure that
9892 ++ * a new one will be performed
9893 ++ */
9894 ++ if (!bfqd->rq_in_driver)
9895 ++ bfq_schedule_dispatch(bfqd);
9896 ++ goto expire;
9897 ++ }
9898 ++
9899 ++ /* Finally, insert request into driver dispatch list. */
9900 ++ bfq_bfqq_served(bfqq, service_to_charge);
9901 ++ bfq_dispatch_insert(bfqd->queue, rq);
9902 ++
9903 ++ update_raising_data(bfqd, bfqq);
9904 ++
9905 ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), "
9906 ++ "budg left %lu",
9907 ++ blk_rq_sectors(rq),
9908 ++ (long long unsigned)blk_rq_pos(rq),
9909 ++ bfq_bfqq_budget_left(bfqq));
9910 ++
9911 ++ dispatched++;
9912 ++
9913 ++ if (bfqd->active_bic == NULL) {
9914 ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9915 ++ bfqd->active_bic = RQ_BIC(rq);
9916 ++ }
9917 ++
9918 ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9919 ++ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9920 ++ bfq_class_idle(bfqq)))
9921 ++ goto expire;
9922 ++
9923 ++ return dispatched;
9924 ++
9925 ++expire:
9926 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9927 ++ return dispatched;
9928 ++}
9929 ++
9930 ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9931 ++{
9932 ++ int dispatched = 0;
9933 ++
9934 ++ while (bfqq->next_rq != NULL) {
9935 ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9936 ++ dispatched++;
9937 ++ }
9938 ++
9939 ++ BUG_ON(!list_empty(&bfqq->fifo));
9940 ++ return dispatched;
9941 ++}
9942 ++
9943 ++/*
9944 ++ * Drain our current requests. Used for barriers and when switching
9945 ++ * io schedulers on-the-fly.
9946 ++ */
9947 ++static int bfq_forced_dispatch(struct bfq_data *bfqd)
9948 ++{
9949 ++ struct bfq_queue *bfqq, *n;
9950 ++ struct bfq_service_tree *st;
9951 ++ int dispatched = 0;
9952 ++
9953 ++ bfqq = bfqd->active_queue;
9954 ++ if (bfqq != NULL)
9955 ++ __bfq_bfqq_expire(bfqd, bfqq);
9956 ++
9957 ++ /*
9958 ++ * Loop through classes, and be careful to leave the scheduler
9959 ++ * in a consistent state, as feedback mechanisms and vtime
9960 ++ * updates cannot be disabled during the process.
9961 ++ */
9962 ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9963 ++ st = bfq_entity_service_tree(&bfqq->entity);
9964 ++
9965 ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9966 ++ bfqq->max_budget = bfq_max_budget(bfqd);
9967 ++
9968 ++ bfq_forget_idle(st);
9969 ++ }
9970 ++
9971 ++ BUG_ON(bfqd->busy_queues != 0);
9972 ++
9973 ++ return dispatched;
9974 ++}
9975 ++
9976 ++static int bfq_dispatch_requests(struct request_queue *q, int force)
9977 ++{
9978 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
9979 ++ struct bfq_queue *bfqq;
9980 ++ int max_dispatch;
9981 ++
9982 ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9983 ++ if (bfqd->busy_queues == 0)
9984 ++ return 0;
9985 ++
9986 ++ if (unlikely(force))
9987 ++ return bfq_forced_dispatch(bfqd);
9988 ++
9989 ++	if ((bfqq = bfq_select_queue(bfqd)) == NULL)
9990 ++ return 0;
9991 ++
9992 ++ max_dispatch = bfqd->bfq_quantum;
9993 ++ if (bfq_class_idle(bfqq))
9994 ++ max_dispatch = 1;
9995 ++
9996 ++ if (!bfq_bfqq_sync(bfqq))
9997 ++ max_dispatch = bfqd->bfq_max_budget_async_rq;
9998 ++
9999 ++ if (bfqq->dispatched >= max_dispatch) {
10000 ++ if (bfqd->busy_queues > 1)
10001 ++ return 0;
10002 ++ if (bfqq->dispatched >= 4 * max_dispatch)
10003 ++ return 0;
10004 ++ }
10005 ++
10006 ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
10007 ++ return 0;
10008 ++
10009 ++ bfq_clear_bfqq_wait_request(bfqq);
10010 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10011 ++
10012 ++	if (!bfq_dispatch_request(bfqd, bfqq))
10013 ++ return 0;
10014 ++
10015 ++	bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d "
10016 ++ "(max_disp %d)", bfqq->pid, max_dispatch);
10017 ++
10018 ++ return 1;
10019 ++}
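++
++/*
++ * Dispatch cap illustration (assuming bfq_quantum keeps its usual default
++ * of 4, defined elsewhere in this patch): a sync queue is normally capped
++ * at 4 requests in flight, but when it is the only busy queue the check
++ * above lets it reach 4 * max_dispatch = 16 before dispatch is throttled.
++ */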
10020 ++
10021 ++/*
10022 ++ * Task holds one reference to the queue, dropped when task exits. Each rq
10023 ++ * in-flight on this queue also holds a reference, dropped when rq is freed.
10024 ++ *
10025 ++ * Queue lock must be held here.
10026 ++ */
10027 ++static void bfq_put_queue(struct bfq_queue *bfqq)
10028 ++{
10029 ++ struct bfq_data *bfqd = bfqq->bfqd;
10030 ++
10031 ++ BUG_ON(atomic_read(&bfqq->ref) <= 0);
10032 ++
10033 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
10034 ++ atomic_read(&bfqq->ref));
10035 ++ if (!atomic_dec_and_test(&bfqq->ref))
10036 ++ return;
10037 ++
10038 ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
10039 ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
10040 ++ BUG_ON(bfqq->entity.tree != NULL);
10041 ++ BUG_ON(bfq_bfqq_busy(bfqq));
10042 ++ BUG_ON(bfqd->active_queue == bfqq);
10043 ++
10044 ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
10045 ++
10046 ++ kmem_cache_free(bfq_pool, bfqq);
10047 ++}
10048 ++
10049 ++static void bfq_put_cooperator(struct bfq_queue *bfqq)
10050 ++{
10051 ++ struct bfq_queue *__bfqq, *next;
10052 ++
10053 ++ /*
10054 ++ * If this queue was scheduled to merge with another queue, be
10055 ++ * sure to drop the reference taken on that queue (and others in
10056 ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
10057 ++ */
10058 ++ __bfqq = bfqq->new_bfqq;
10059 ++ while (__bfqq) {
10060 ++ if (__bfqq == bfqq) {
10061 ++ WARN(1, "bfqq->new_bfqq loop detected.\n");
10062 ++ break;
10063 ++ }
10064 ++ next = __bfqq->new_bfqq;
10065 ++ bfq_put_queue(__bfqq);
10066 ++ __bfqq = next;
10067 ++ }
10068 ++}
10069 ++
10070 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
10071 ++{
10072 ++ if (bfqq == bfqd->active_queue) {
10073 ++ __bfq_bfqq_expire(bfqd, bfqq);
10074 ++ bfq_schedule_dispatch(bfqd);
10075 ++ }
10076 ++
10077 ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
10078 ++ atomic_read(&bfqq->ref));
10079 ++
10080 ++ bfq_put_cooperator(bfqq);
10081 ++
10082 ++ bfq_put_queue(bfqq);
10083 ++}
10084 ++
10085 ++static void bfq_init_icq(struct io_cq *icq)
10086 ++{
10087 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
10088 ++
10089 ++ bic->ttime.last_end_request = jiffies;
10090 ++}
10091 ++
10092 ++static void bfq_exit_icq(struct io_cq *icq)
10093 ++{
10094 ++ struct bfq_io_cq *bic = icq_to_bic(icq);
10095 ++ struct bfq_data *bfqd = bic_to_bfqd(bic);
10096 ++
10097 ++ if (bic->bfqq[BLK_RW_ASYNC]) {
10098 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
10099 ++ bic->bfqq[BLK_RW_ASYNC] = NULL;
10100 ++ }
10101 ++
10102 ++ if (bic->bfqq[BLK_RW_SYNC]) {
10103 ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
10104 ++ bic->bfqq[BLK_RW_SYNC] = NULL;
10105 ++ }
10106 ++}
10107 ++
10108 ++/*
10109 ++ * Update the entity prio values; note that the new values will not
10110 ++ * be used until the next (re)activation.
10111 ++ */
10112 ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
10113 ++{
10114 ++ struct task_struct *tsk = current;
10115 ++ int ioprio_class;
10116 ++
10117 ++ if (!bfq_bfqq_prio_changed(bfqq))
10118 ++ return;
10119 ++
10120 ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
10121 ++ switch (ioprio_class) {
10122 ++ default:
10123 ++ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class);
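++		/* fall through */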
10124 ++ case IOPRIO_CLASS_NONE:
10125 ++ /*
10126 ++ * No prio set, inherit CPU scheduling settings.
10127 ++ */
10128 ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
10129 ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
10130 ++ break;
10131 ++ case IOPRIO_CLASS_RT:
10132 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
10133 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
10134 ++ break;
10135 ++ case IOPRIO_CLASS_BE:
10136 ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
10137 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
10138 ++ break;
10139 ++ case IOPRIO_CLASS_IDLE:
10140 ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
10141 ++ bfqq->entity.new_ioprio = 7;
10142 ++ bfq_clear_bfqq_idle_window(bfqq);
10143 ++ break;
10144 ++ }
10145 ++
10146 ++ bfqq->entity.ioprio_changed = 1;
10147 ++
10148 ++ /*
10149 ++ * Keep track of original prio settings in case we have to temporarily
10150 ++ * elevate the priority of this queue.
10151 ++ */
10152 ++ bfqq->org_ioprio = bfqq->entity.new_ioprio;
10153 ++ bfq_clear_bfqq_prio_changed(bfqq);
10154 ++}
10155 ++
10156 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic)
10157 ++{
10158 ++ struct bfq_data *bfqd;
10159 ++ struct bfq_queue *bfqq, *new_bfqq;
10160 ++ struct bfq_group *bfqg;
10161 ++ unsigned long uninitialized_var(flags);
10162 ++ int ioprio = bic->icq.ioc->ioprio;
10163 ++
10164 ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
10165 ++ /*
10166 ++	 * This condition may trigger on a newly created bic; be sure to drop the
10167 ++ * lock before returning.
10168 ++ */
10169 ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
10170 ++ goto out;
10171 ++
10172 ++ bfqq = bic->bfqq[BLK_RW_ASYNC];
10173 ++ if (bfqq != NULL) {
10174 ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
10175 ++ sched_data);
10176 ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
10177 ++ GFP_ATOMIC);
10178 ++ if (new_bfqq != NULL) {
10179 ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
10180 ++ bfq_log_bfqq(bfqd, bfqq,
10181 ++ "changed_ioprio: bfqq %p %d",
10182 ++ bfqq, atomic_read(&bfqq->ref));
10183 ++ bfq_put_queue(bfqq);
10184 ++ }
10185 ++ }
10186 ++
10187 ++ bfqq = bic->bfqq[BLK_RW_SYNC];
10188 ++ if (bfqq != NULL)
10189 ++ bfq_mark_bfqq_prio_changed(bfqq);
10190 ++
10191 ++ bic->ioprio = ioprio;
10192 ++
10193 ++out:
10194 ++ bfq_put_bfqd_unlock(bfqd, &flags);
10195 ++}
10196 ++
10197 ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
10198 ++ pid_t pid, int is_sync)
10199 ++{
10200 ++ RB_CLEAR_NODE(&bfqq->entity.rb_node);
10201 ++ INIT_LIST_HEAD(&bfqq->fifo);
10202 ++
10203 ++ atomic_set(&bfqq->ref, 0);
10204 ++ bfqq->bfqd = bfqd;
10205 ++
10206 ++ bfq_mark_bfqq_prio_changed(bfqq);
10207 ++
10208 ++ if (is_sync) {
10209 ++ if (!bfq_class_idle(bfqq))
10210 ++ bfq_mark_bfqq_idle_window(bfqq);
10211 ++ bfq_mark_bfqq_sync(bfqq);
10212 ++ }
10213 ++
10214 ++	/* Tentative initial value to trade off between throughput and latency */
10215 ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
10216 ++ bfqq->pid = pid;
10217 ++
10218 ++ bfqq->raising_coeff = 1;
10219 ++ bfqq->last_rais_start_finish = 0;
10220 ++ bfqq->soft_rt_next_start = -1;
10221 ++}
10222 ++
10223 ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
10224 ++ struct bfq_group *bfqg,
10225 ++ int is_sync,
10226 ++ struct bfq_io_cq *bic,
10227 ++ gfp_t gfp_mask)
10228 ++{
10229 ++ struct bfq_queue *bfqq, *new_bfqq = NULL;
10230 ++
10231 ++retry:
10232 ++ /* bic always exists here */
10233 ++ bfqq = bic_to_bfqq(bic, is_sync);
10234 ++
10235 ++ /*
10236 ++	 * Always try a new alloc if we fell back to the OOM bfqq
10237 ++ * originally, since it should just be a temporary situation.
10238 ++ */
10239 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
10240 ++ bfqq = NULL;
10241 ++ if (new_bfqq != NULL) {
10242 ++ bfqq = new_bfqq;
10243 ++ new_bfqq = NULL;
10244 ++ } else if (gfp_mask & __GFP_WAIT) {
10245 ++ spin_unlock_irq(bfqd->queue->queue_lock);
10246 ++ new_bfqq = kmem_cache_alloc_node(bfq_pool,
10247 ++ gfp_mask | __GFP_ZERO,
10248 ++ bfqd->queue->node);
10249 ++ spin_lock_irq(bfqd->queue->queue_lock);
10250 ++ if (new_bfqq != NULL)
10251 ++ goto retry;
10252 ++ } else {
10253 ++ bfqq = kmem_cache_alloc_node(bfq_pool,
10254 ++ gfp_mask | __GFP_ZERO,
10255 ++ bfqd->queue->node);
10256 ++ }
10257 ++
10258 ++ if (bfqq != NULL) {
10259 ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
10260 ++ bfq_log_bfqq(bfqd, bfqq, "allocated");
10261 ++ } else {
10262 ++ bfqq = &bfqd->oom_bfqq;
10263 ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
10264 ++ }
10265 ++
10266 ++ bfq_init_prio_data(bfqq, bic);
10267 ++ bfq_init_entity(&bfqq->entity, bfqg);
10268 ++ }
10269 ++
10270 ++ if (new_bfqq != NULL)
10271 ++ kmem_cache_free(bfq_pool, new_bfqq);
10272 ++
10273 ++ return bfqq;
10274 ++}
10275 ++
10276 ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
10277 ++ struct bfq_group *bfqg,
10278 ++ int ioprio_class, int ioprio)
10279 ++{
10280 ++ switch (ioprio_class) {
10281 ++ case IOPRIO_CLASS_RT:
10282 ++ return &bfqg->async_bfqq[0][ioprio];
10283 ++ case IOPRIO_CLASS_NONE:
10284 ++ ioprio = IOPRIO_NORM;
10285 ++ /* fall through */
10286 ++ case IOPRIO_CLASS_BE:
10287 ++ return &bfqg->async_bfqq[1][ioprio];
10288 ++ case IOPRIO_CLASS_IDLE:
10289 ++ return &bfqg->async_idle_bfqq;
10290 ++ default:
10291 ++ BUG();
10292 ++ }
10293 ++}
10294 ++
10295 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
10296 ++ struct bfq_group *bfqg, int is_sync,
10297 ++ struct bfq_io_cq *bic, gfp_t gfp_mask)
10298 ++{
10299 ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
10300 ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
10301 ++ struct bfq_queue **async_bfqq = NULL;
10302 ++ struct bfq_queue *bfqq = NULL;
10303 ++
10304 ++ if (!is_sync) {
10305 ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
10306 ++ ioprio);
10307 ++ bfqq = *async_bfqq;
10308 ++ }
10309 ++
10310 ++ if (bfqq == NULL)
10311 ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
10312 ++
10313 ++ /*
10314 ++	 * Pin the queue now that it's allocated; scheduler exit will prune it.
10315 ++ */
10316 ++ if (!is_sync && *async_bfqq == NULL) {
10317 ++ atomic_inc(&bfqq->ref);
10318 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
10319 ++ bfqq, atomic_read(&bfqq->ref));
10320 ++ *async_bfqq = bfqq;
10321 ++ }
10322 ++
10323 ++ atomic_inc(&bfqq->ref);
10324 ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
10325 ++ atomic_read(&bfqq->ref));
10326 ++ return bfqq;
10327 ++}
10328 ++
10329 ++static void bfq_update_io_thinktime(struct bfq_data *bfqd,
10330 ++ struct bfq_io_cq *bic)
10331 ++{
10332 ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
10333 ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
10334 ++
10335 ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
10336 ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
10337 ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples;
10338 ++}
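++
++/*
++ * The think-time statistics above form a 1/8-weight exponential moving
++ * average kept in fixed point: ttime_samples converges to 256 and, for a
++ * steady per-request think time t, ttime_total converges to 256 * t, so
++ * ttime_mean converges to t. For example, with a hypothetical steady
++ * t == 4 jiffies: total -> 1024, samples -> 256, mean = (1024 + 128) / 256 = 4.
++ */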
10339 ++
10340 ++static void bfq_update_io_seektime(struct bfq_data *bfqd,
10341 ++ struct bfq_queue *bfqq,
10342 ++ struct request *rq)
10343 ++{
10344 ++ sector_t sdist;
10345 ++ u64 total;
10346 ++
10347 ++ if (bfqq->last_request_pos < blk_rq_pos(rq))
10348 ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
10349 ++ else
10350 ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
10351 ++
10352 ++ /*
10353 ++ * Don't allow the seek distance to get too large from the
10354 ++ * odd fragment, pagein, etc.
10355 ++ */
10356 ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */
10357 ++ sdist = 0;
10358 ++ else if (bfqq->seek_samples <= 60) /* second & third seek */
10359 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
10360 ++ else
10361 ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
10362 ++
10363 ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
10364 ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
10365 ++ total = bfqq->seek_total + (bfqq->seek_samples/2);
10366 ++ do_div(total, bfqq->seek_samples);
10367 ++ if (bfq_bfqq_coop(bfqq)) {
10368 ++ /*
10369 ++ * If the mean seektime increases for a (non-seeky) shared
10370 ++ * queue, some cooperator is likely to be idling too much.
10371 ++		 * Conversely, if it decreases, some cooperator has probably
10372 ++		 * woken up.
10374 ++ */
10375 ++ if ((sector_t)total < bfqq->seek_mean)
10376 ++			bfq_mark_bfqq_some_coop_idle(bfqq);
10377 ++		else if ((sector_t)total > bfqq->seek_mean)
10378 ++			bfq_clear_bfqq_some_coop_idle(bfqq);
10379 ++ }
10380 ++ bfqq->seek_mean = (sector_t)total;
10381 ++
10382 ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
10383 ++ (u64)bfqq->seek_mean);
10384 ++}
10385 ++
10386 ++/*
10387 ++ * Disable idle window if the process thinks too long or seeks so much that
10388 ++ * it doesn't matter.
10389 ++ */
10390 ++static void bfq_update_idle_window(struct bfq_data *bfqd,
10391 ++ struct bfq_queue *bfqq,
10392 ++ struct bfq_io_cq *bic)
10393 ++{
10394 ++ int enable_idle;
10395 ++
10396 ++ /* Don't idle for async or idle io prio class. */
10397 ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
10398 ++ return;
10399 ++
10400 ++ enable_idle = bfq_bfqq_idle_window(bfqq);
10401 ++
10402 ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
10403 ++ bfqd->bfq_slice_idle == 0 ||
10404 ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
10405 ++ bfqq->raising_coeff == 1))
10406 ++ enable_idle = 0;
10407 ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
10408 ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
10409 ++ bfqq->raising_coeff == 1)
10410 ++ enable_idle = 0;
10411 ++ else
10412 ++ enable_idle = 1;
10413 ++ }
10414 ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
10415 ++ enable_idle);
10416 ++
10417 ++ if (enable_idle)
10418 ++ bfq_mark_bfqq_idle_window(bfqq);
10419 ++ else
10420 ++ bfq_clear_bfqq_idle_window(bfqq);
10421 ++}
10422 ++
10423 ++/*
10424 ++ * Called when a new fs request (rq) is added to bfqq. Check if there's
10425 ++ * something we should do about it.
10426 ++ */
10427 ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
10428 ++ struct request *rq)
10429 ++{
10430 ++ struct bfq_io_cq *bic = RQ_BIC(rq);
10431 ++
10432 ++ if (rq->cmd_flags & REQ_META)
10433 ++ bfqq->meta_pending++;
10434 ++
10435 ++ bfq_update_io_thinktime(bfqd, bic);
10436 ++ bfq_update_io_seektime(bfqd, bfqq, rq);
10437 ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
10438 ++ !BFQQ_SEEKY(bfqq))
10439 ++ bfq_update_idle_window(bfqd, bfqq, bic);
10440 ++
10441 ++ bfq_log_bfqq(bfqd, bfqq,
10442 ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
10443 ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
10444 ++ (long long unsigned)bfqq->seek_mean);
10445 ++
10446 ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
10447 ++
10448 ++ if (bfqq == bfqd->active_queue) {
10449 ++ /*
10450 ++ * If there is just this request queued and the request
10451 ++ * is small, just exit.
10452 ++ * In this way, if the disk is being idled to wait for a new
10453 ++ * request from the active queue, we avoid unplugging the
10454 ++ * device now.
10455 ++ *
10456 ++		 * By doing so, we avoid committing the disk to serve
10457 ++		 * just a small request. Instead, we wait for
10458 ++ * the block layer to decide when to unplug the device:
10459 ++ * hopefully, new requests will be merged to this
10460 ++ * one quickly, then the device will be unplugged
10461 ++ * and larger requests will be dispatched.
10462 ++ */
10463 ++ if (bfqq->queued[rq_is_sync(rq)] == 1 &&
10464 ++ blk_rq_sectors(rq) < 32) {
10465 ++ return;
10466 ++ }
10467 ++ if (bfq_bfqq_wait_request(bfqq)) {
10468 ++ /*
10469 ++ * If we are waiting for a request for this queue, let
10470 ++ * it rip immediately and flag that we must not expire
10471 ++ * this queue just now.
10472 ++ */
10473 ++ bfq_clear_bfqq_wait_request(bfqq);
10474 ++ del_timer(&bfqd->idle_slice_timer);
10475 ++ /*
10476 ++ * Here we can safely expire the queue, in
10477 ++ * case of budget timeout, without wasting
10478 ++ * guarantees
10479 ++ */
10480 ++ if (bfq_bfqq_budget_timeout(bfqq))
10481 ++ bfq_bfqq_expire(bfqd, bfqq, 0,
10482 ++ BFQ_BFQQ_BUDGET_TIMEOUT);
10483 ++ __blk_run_queue(bfqd->queue);
10484 ++ }
10485 ++ }
10486 ++}
10487 ++
10488 ++static void bfq_insert_request(struct request_queue *q, struct request *rq)
10489 ++{
10490 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
10491 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
10492 ++
10493 ++ assert_spin_locked(bfqd->queue->queue_lock);
10494 ++ bfq_init_prio_data(bfqq, RQ_BIC(rq));
10495 ++
10496 ++ bfq_add_rq_rb(rq);
10497 ++
10498 ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
10499 ++ list_add_tail(&rq->queuelist, &bfqq->fifo);
10500 ++
10501 ++ bfq_rq_enqueued(bfqd, bfqq, rq);
10502 ++}
10503 ++
10504 ++static void bfq_update_hw_tag(struct bfq_data *bfqd)
10505 ++{
10506 ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
10507 ++ bfqd->rq_in_driver);
10508 ++
10509 ++ if (bfqd->hw_tag == 1)
10510 ++ return;
10511 ++
10512 ++ /*
10513 ++ * This sample is valid if the number of outstanding requests
10514 ++ * is large enough to allow a queueing behavior. Note that the
10515 ++ * sum is not exact, as it's not taking into account deactivated
10516 ++ * requests.
10517 ++ */
10518 ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
10519 ++ return;
10520 ++
10521 ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
10522 ++ return;
10523 ++
10524 ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
10525 ++ bfqd->max_rq_in_driver = 0;
10526 ++ bfqd->hw_tag_samples = 0;
10527 ++}
10528 ++
10529 ++static void bfq_completed_request(struct request_queue *q, struct request *rq)
10530 ++{
10531 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
10532 ++ struct bfq_data *bfqd = bfqq->bfqd;
10533 ++ const int sync = rq_is_sync(rq);
10534 ++
10535 ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
10536 ++ blk_rq_sectors(rq), sync);
10537 ++
10538 ++ bfq_update_hw_tag(bfqd);
10539 ++
10540 ++ WARN_ON(!bfqd->rq_in_driver);
10541 ++ WARN_ON(!bfqq->dispatched);
10542 ++ bfqd->rq_in_driver--;
10543 ++ bfqq->dispatched--;
10544 ++
10545 ++ if (bfq_bfqq_sync(bfqq))
10546 ++ bfqd->sync_flight--;
10547 ++
10548 ++ if (sync)
10549 ++ RQ_BIC(rq)->ttime.last_end_request = jiffies;
10550 ++
10551 ++ /*
10552 ++ * If this is the active queue, check if it needs to be expired,
10553 ++ * or if we want to idle in case it has no pending requests.
10554 ++ */
10555 ++ if (bfqd->active_queue == bfqq) {
10556 ++ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
10557 ++ if (bfq_bfqq_budget_new(bfqq))
10558 ++ bfq_set_budget_timeout(bfqd);
10559 ++
10560 ++ /* Idling is disabled also for cooperation issues:
10561 ++ * 1) there is a close cooperator for the queue, or
10562 ++ * 2) the queue is shared and some cooperator is likely
10563 ++ * to be idle (in this case, by not arming the idle timer,
10564 ++ * we try to slow down the queue, to prevent the zones
10565 ++		 * of the disk accessed by the active cooperators from becoming
10566 ++ * too distant from the zone that will be accessed by the
10567 ++ * currently idle cooperators)
10568 ++ */
10569 ++ if (bfq_bfqq_must_idle(bfqq, budg_timeout))
10570 ++ bfq_arm_slice_timer(bfqd);
10571 ++ else if (budg_timeout)
10572 ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
10573 ++ }
10574 ++
10575 ++ if (!bfqd->rq_in_driver)
10576 ++ bfq_schedule_dispatch(bfqd);
10577 ++}
10578 ++
10579 ++static inline int __bfq_may_queue(struct bfq_queue *bfqq)
10580 ++{
10581 ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
10582 ++ bfq_clear_bfqq_must_alloc(bfqq);
10583 ++ return ELV_MQUEUE_MUST;
10584 ++ }
10585 ++
10586 ++ return ELV_MQUEUE_MAY;
10587 ++}
10588 ++
10589 ++static int bfq_may_queue(struct request_queue *q, int rw)
10590 ++{
10591 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
10592 ++ struct task_struct *tsk = current;
10593 ++ struct bfq_io_cq *bic;
10594 ++ struct bfq_queue *bfqq;
10595 ++
10596 ++ /*
10597 ++ * Don't force setup of a queue from here, as a call to may_queue
10598 ++ * does not necessarily imply that a request actually will be queued.
10599 ++ * So just lookup a possibly existing queue, or return 'may queue'
10600 ++ * if that fails.
10601 ++ */
10602 ++ bic = bfq_bic_lookup(bfqd, tsk->io_context);
10603 ++ if (bic == NULL)
10604 ++ return ELV_MQUEUE_MAY;
10605 ++
10606 ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
10607 ++ if (bfqq != NULL) {
10608 ++ bfq_init_prio_data(bfqq, bic);
10609 ++
10610 ++ return __bfq_may_queue(bfqq);
10611 ++ }
10612 ++
10613 ++ return ELV_MQUEUE_MAY;
10614 ++}
10615 ++
10616 ++/*
10617 ++ * Queue lock held here.
10618 ++ */
10619 ++static void bfq_put_request(struct request *rq)
10620 ++{
10621 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq);
10622 ++
10623 ++ if (bfqq != NULL) {
10624 ++ const int rw = rq_data_dir(rq);
10625 ++
10626 ++ BUG_ON(!bfqq->allocated[rw]);
10627 ++ bfqq->allocated[rw]--;
10628 ++
10629 ++ rq->elv.priv[0] = NULL;
10630 ++ rq->elv.priv[1] = NULL;
10631 ++
10632 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
10633 ++ bfqq, atomic_read(&bfqq->ref));
10634 ++ bfq_put_queue(bfqq);
10635 ++ }
10636 ++}
10637 ++
10638 ++static struct bfq_queue *
10639 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
10640 ++ struct bfq_queue *bfqq)
10641 ++{
10642 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
10643 ++ (long unsigned)bfqq->new_bfqq->pid);
10644 ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
10645 ++ bfq_mark_bfqq_coop(bfqq->new_bfqq);
10646 ++ bfq_put_queue(bfqq);
10647 ++ return bic_to_bfqq(bic, 1);
10648 ++}
10649 ++
10650 ++/*
10651 ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
10652 ++ * was the last process referring to said bfqq.
10653 ++ */
10654 ++static struct bfq_queue *
10655 ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
10656 ++{
10657 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
10658 ++ if (bfqq_process_refs(bfqq) == 1) {
10659 ++ bfqq->pid = current->pid;
10660 ++ bfq_clear_bfqq_some_coop_idle(bfqq);
10661 ++ bfq_clear_bfqq_coop(bfqq);
10662 ++ bfq_clear_bfqq_split_coop(bfqq);
10663 ++ return bfqq;
10664 ++ }
10665 ++
10666 ++ bic_set_bfqq(bic, NULL, 1);
10667 ++
10668 ++ bfq_put_cooperator(bfqq);
10669 ++
10670 ++ bfq_put_queue(bfqq);
10671 ++ return NULL;
10672 ++}
10673 ++
10674 ++/*
10675 ++ * Allocate bfq data structures associated with this request.
10676 ++ */
10677 ++static int bfq_set_request(struct request_queue *q, struct request *rq,
10678 ++ struct bio *bio, gfp_t gfp_mask)
10679 ++{
10680 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
10681 ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
10682 ++ const int rw = rq_data_dir(rq);
10683 ++ const int is_sync = rq_is_sync(rq);
10684 ++ struct bfq_queue *bfqq;
10685 ++ struct bfq_group *bfqg;
10686 ++ unsigned long flags;
10687 ++
10688 ++ might_sleep_if(gfp_mask & __GFP_WAIT);
10689 ++
10690 ++ bfq_changed_ioprio(bic);
10691 ++
10692 ++ spin_lock_irqsave(q->queue_lock, flags);
10693 ++
10694 ++ if (bic == NULL)
10695 ++ goto queue_fail;
10696 ++
10697 ++ bfqg = bfq_bic_update_cgroup(bic);
10698 ++
10699 ++new_queue:
10700 ++ bfqq = bic_to_bfqq(bic, is_sync);
10701 ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
10702 ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
10703 ++ bic_set_bfqq(bic, bfqq, is_sync);
10704 ++ } else {
10705 ++ /*
10706 ++ * If the queue was seeky for too long, break it apart.
10707 ++ */
10708 ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
10709 ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
10710 ++ bfqq = bfq_split_bfqq(bic, bfqq);
10711 ++ if (!bfqq)
10712 ++ goto new_queue;
10713 ++ }
10714 ++
10715 ++ /*
10716 ++ * Check to see if this queue is scheduled to merge with
10717 ++ * another closely cooperating queue. The merging of queues
10718 ++ * happens here as it must be done in process context.
10719 ++ * The reference on new_bfqq was taken in merge_bfqqs.
10720 ++ */
10721 ++ if (bfqq->new_bfqq != NULL)
10722 ++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
10723 ++ }
10724 ++
10725 ++ bfqq->allocated[rw]++;
10726 ++ atomic_inc(&bfqq->ref);
10727 ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
10728 ++ atomic_read(&bfqq->ref));
10729 ++
10730 ++ rq->elv.priv[0] = bic;
10731 ++ rq->elv.priv[1] = bfqq;
10732 ++
10733 ++ spin_unlock_irqrestore(q->queue_lock, flags);
10734 ++
10735 ++ return 0;
10736 ++
10737 ++queue_fail:
10738 ++ bfq_schedule_dispatch(bfqd);
10739 ++ spin_unlock_irqrestore(q->queue_lock, flags);
10740 ++
10741 ++ return 1;
10742 ++}
10743 ++
10744 ++static void bfq_kick_queue(struct work_struct *work)
10745 ++{
10746 ++ struct bfq_data *bfqd =
10747 ++ container_of(work, struct bfq_data, unplug_work);
10748 ++ struct request_queue *q = bfqd->queue;
10749 ++
10750 ++ spin_lock_irq(q->queue_lock);
10751 ++ __blk_run_queue(q);
10752 ++ spin_unlock_irq(q->queue_lock);
10753 ++}
10754 ++
10755 ++/*
10756 ++ * Handler of the expiration of the timer running if the active_queue
10757 ++ * is idling inside its time slice.
10758 ++ */
10759 ++static void bfq_idle_slice_timer(unsigned long data)
10760 ++{
10761 ++ struct bfq_data *bfqd = (struct bfq_data *)data;
10762 ++ struct bfq_queue *bfqq;
10763 ++ unsigned long flags;
10764 ++ enum bfqq_expiration reason;
10765 ++
10766 ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
10767 ++
10768 ++ bfqq = bfqd->active_queue;
10769 ++ /*
10770 ++ * Theoretical race here: active_queue can be NULL or different
10771 ++ * from the queue that was idling if the timer handler spins on
10772 ++ * the queue_lock and a new request arrives for the current
10773 ++ * queue and there is a full dispatch cycle that changes the
10774 ++ * active_queue. This can hardly happen, but in the worst case
10775 ++ * we just expire a queue too early.
10776 ++ */
10777 ++ if (bfqq != NULL) {
10778 ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
10779 ++ if (bfq_bfqq_budget_timeout(bfqq))
10780 ++ /*
10781 ++ * Also here the queue can be safely expired
10782 ++ * for budget timeout without wasting
10783 ++ * guarantees
10784 ++ */
10785 ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
10786 ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
10787 ++ /*
10788 ++ * The queue may not be empty upon timer expiration,
10789 ++ * because we may not disable the timer when the first
10790 ++ * request of the active queue arrives during
10791 ++ * disk idling
10792 ++ */
10793 ++ reason = BFQ_BFQQ_TOO_IDLE;
10794 ++ else
10795 ++ goto schedule_dispatch;
10796 ++
10797 ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
10798 ++ }
10799 ++
10800 ++schedule_dispatch:
10801 ++ bfq_schedule_dispatch(bfqd);
10802 ++
10803 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
10804 ++}
10805 ++
10806 ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
10807 ++{
10808 ++ del_timer_sync(&bfqd->idle_slice_timer);
10809 ++ cancel_work_sync(&bfqd->unplug_work);
10810 ++}
10811 ++
10812 ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
10813 ++ struct bfq_queue **bfqq_ptr)
10814 ++{
10815 ++ struct bfq_group *root_group = bfqd->root_group;
10816 ++ struct bfq_queue *bfqq = *bfqq_ptr;
10817 ++
10818 ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
10819 ++ if (bfqq != NULL) {
10820 ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
10821 ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
10822 ++ bfqq, atomic_read(&bfqq->ref));
10823 ++ bfq_put_queue(bfqq);
10824 ++ *bfqq_ptr = NULL;
10825 ++ }
10826 ++}
10827 ++
10828 ++/*
10829 ++ * Release all the bfqg references to its async queues. If we are
10830 ++ * deallocating the group, these queues may still contain requests, so
10831 ++ * we reparent them to the root cgroup (i.e., the only one that will
10832 ++ * exist for sure until all the requests on a device are gone).
10833 ++ */
10834 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
10835 ++{
10836 ++ int i, j;
10837 ++
10838 ++ for (i = 0; i < 2; i++)
10839 ++ for (j = 0; j < IOPRIO_BE_NR; j++)
10840 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
10841 ++
10842 ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
10843 ++}
10844 ++
10845 ++static void bfq_exit_queue(struct elevator_queue *e)
10846 ++{
10847 ++ struct bfq_data *bfqd = e->elevator_data;
10848 ++ struct request_queue *q = bfqd->queue;
10849 ++ struct bfq_queue *bfqq, *n;
10850 ++
10851 ++ bfq_shutdown_timer_wq(bfqd);
10852 ++
10853 ++ spin_lock_irq(q->queue_lock);
10854 ++
10855 ++ BUG_ON(bfqd->active_queue != NULL);
10856 ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
10857 ++ bfq_deactivate_bfqq(bfqd, bfqq, 0);
10858 ++
10859 ++ bfq_disconnect_groups(bfqd);
10860 ++ spin_unlock_irq(q->queue_lock);
10861 ++
10862 ++ bfq_shutdown_timer_wq(bfqd);
10863 ++
10864 ++ synchronize_rcu();
10865 ++
10866 ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10867 ++
10868 ++ bfq_free_root_group(bfqd);
10869 ++ kfree(bfqd);
10870 ++}
10871 ++
10872 ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
10873 ++{
10874 ++ struct bfq_group *bfqg;
10875 ++ struct bfq_data *bfqd;
10876 ++ struct elevator_queue *eq;
10877 ++
10878 ++ eq = elevator_alloc(q, e);
10879 ++ if (eq == NULL)
10880 ++ return -ENOMEM;
10881 ++
10882 ++ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node);
10883 ++ if (bfqd == NULL) {
10884 ++ kobject_put(&eq->kobj);
10885 ++ return -ENOMEM;
10886 ++ }
10887 ++ eq->elevator_data = bfqd;
10888 ++
10889 ++ /*
10890 ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
10891 ++ * Grab a permanent reference to it, so that the normal code flow
10892 ++ * will not attempt to free it.
10893 ++ */
10894 ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
10895 ++ atomic_inc(&bfqd->oom_bfqq.ref);
10896 ++
10897 ++ bfqd->queue = q;
10898 ++
10899 ++ spin_lock_irq(q->queue_lock);
10900 ++ q->elevator = eq;
10901 ++ spin_unlock_irq(q->queue_lock);
10902 ++
10903 ++ bfqg = bfq_alloc_root_group(bfqd, q->node);
10904 ++ if (bfqg == NULL) {
10905 ++ kfree(bfqd);
10906 ++ kobject_put(&eq->kobj);
10907 ++ return -ENOMEM;
10908 ++ }
10909 ++
10910 ++ bfqd->root_group = bfqg;
10911 ++
10912 ++ init_timer(&bfqd->idle_slice_timer);
10913 ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10914 ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10915 ++
10916 ++ bfqd->rq_pos_tree = RB_ROOT;
10917 ++
10918 ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10919 ++
10920 ++ INIT_LIST_HEAD(&bfqd->active_list);
10921 ++ INIT_LIST_HEAD(&bfqd->idle_list);
10922 ++
10923 ++ bfqd->hw_tag = -1;
10924 ++
10925 ++ bfqd->bfq_max_budget = bfq_default_max_budget;
10926 ++
10927 ++ bfqd->bfq_quantum = bfq_quantum;
10928 ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10929 ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10930 ++ bfqd->bfq_back_max = bfq_back_max;
10931 ++ bfqd->bfq_back_penalty = bfq_back_penalty;
10932 ++ bfqd->bfq_slice_idle = bfq_slice_idle;
10933 ++ bfqd->bfq_class_idle_last_service = 0;
10934 ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10935 ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10936 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10937 ++
10938 ++ bfqd->low_latency = true;
10939 ++
10940 ++ bfqd->bfq_raising_coeff = 20;
10941 ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10942 ++ bfqd->bfq_raising_max_time = 0;
10943 ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10944 ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10945 ++ bfqd->bfq_raising_max_softrt_rate = 7000;
10946 ++
10947 ++ /* Initially estimate the device's peak rate as the reference rate */
10948 ++ if (blk_queue_nonrot(bfqd->queue)) {
10949 ++ bfqd->RT_prod = R_nonrot * T_nonrot;
10950 ++ bfqd->peak_rate = R_nonrot;
10951 ++ } else {
10952 ++ bfqd->RT_prod = R_rot * T_rot;
10953 ++ bfqd->peak_rate = R_rot;
10954 ++ }
10955 ++
10956 ++ return 0;
10957 ++}
10958 ++
10959 ++static void bfq_slab_kill(void)
10960 ++{
10961 ++ if (bfq_pool != NULL)
10962 ++ kmem_cache_destroy(bfq_pool);
10963 ++}
10964 ++
10965 ++static int __init bfq_slab_setup(void)
10966 ++{
10967 ++ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10968 ++ if (bfq_pool == NULL)
10969 ++ return -ENOMEM;
10970 ++ return 0;
10971 ++}
10972 ++
10973 ++static ssize_t bfq_var_show(unsigned int var, char *page)
10974 ++{
10975 ++ return sprintf(page, "%d\n", var);
10976 ++}
10977 ++
10978 ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10979 ++{
10980 ++ unsigned long new_val;
10981 ++ int ret = strict_strtoul(page, 10, &new_val);
10982 ++
10983 ++ if (ret == 0)
10984 ++ *var = new_val;
10985 ++
10986 ++ return count;
10987 ++}
10988 ++
10989 ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10990 ++{
10991 ++ struct bfq_data *bfqd = e->elevator_data;
10992 ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10993 ++ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10994 ++ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10995 ++}
10996 ++
10997 ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10998 ++{
10999 ++ struct bfq_queue *bfqq;
11000 ++ struct bfq_data *bfqd = e->elevator_data;
11001 ++ ssize_t num_char = 0;
11002 ++
11003 ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
11004 ++ bfqd->queued);
11005 ++
11006 ++ spin_lock_irq(bfqd->queue->queue_lock);
11007 ++
11008 ++ num_char += sprintf(page + num_char, "Active:\n");
11009 ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
11010 ++ num_char += sprintf(page + num_char,
11011 ++ "pid%d: weight %hu, nr_queued %d %d,"
11012 ++ " dur %d/%u\n",
11013 ++ bfqq->pid,
11014 ++ bfqq->entity.weight,
11015 ++ bfqq->queued[0],
11016 ++ bfqq->queued[1],
11017 ++ jiffies_to_msecs(jiffies -
11018 ++ bfqq->last_rais_start_finish),
11019 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
11020 ++ }
11021 ++
11022 ++ num_char += sprintf(page + num_char, "Idle:\n");
11023 ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
11024 ++ num_char += sprintf(page + num_char,
11025 ++ "pid%d: weight %hu, dur %d/%u\n",
11026 ++ bfqq->pid,
11027 ++ bfqq->entity.weight,
11028 ++ jiffies_to_msecs(jiffies -
11029 ++ bfqq->last_rais_start_finish),
11030 ++ jiffies_to_msecs(bfqq->raising_cur_max_time));
11031 ++ }
11032 ++
11033 ++ spin_unlock_irq(bfqd->queue->queue_lock);
11034 ++
11035 ++ return num_char;
11036 ++}
11037 ++
11038 ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
11039 ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \
11040 ++{ \
11041 ++ struct bfq_data *bfqd = e->elevator_data; \
11042 ++ unsigned int __data = __VAR; \
11043 ++ if (__CONV) \
11044 ++ __data = jiffies_to_msecs(__data); \
11045 ++ return bfq_var_show(__data, (page)); \
11046 ++}
11047 ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
11048 ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
11049 ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
11050 ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
11051 ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
11052 ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
11053 ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
11054 ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
11055 ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
11056 ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
11057 ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
11058 ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
11059 ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
11060 ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
11061 ++ 1);
11062 ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
11063 ++ bfqd->bfq_raising_min_inter_arr_async,
11064 ++ 1);
11065 ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
11066 ++ bfqd->bfq_raising_max_softrt_rate, 0);
11067 ++#undef SHOW_FUNCTION
11068 ++
11069 ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
11070 ++static ssize_t \
11071 ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \
11072 ++{ \
11073 ++ struct bfq_data *bfqd = e->elevator_data; \
11074 ++ unsigned long uninitialized_var(__data); \
11075 ++ int ret = bfq_var_store(&__data, (page), count); \
11076 ++ if (__data < (MIN)) \
11077 ++ __data = (MIN); \
11078 ++ else if (__data > (MAX)) \
11079 ++ __data = (MAX); \
11080 ++ if (__CONV) \
11081 ++ *(__PTR) = msecs_to_jiffies(__data); \
11082 ++ else \
11083 ++ *(__PTR) = __data; \
11084 ++ return ret; \
11085 ++}
11086 ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
11087 ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
11088 ++ INT_MAX, 1);
11089 ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
11090 ++ INT_MAX, 1);
11091 ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
11092 ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
11093 ++ INT_MAX, 0);
11094 ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
11095 ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
11096 ++ 1, INT_MAX, 0);
11097 ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
11098 ++ INT_MAX, 1);
11099 ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
11100 ++ INT_MAX, 0);
11101 ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
11102 ++ INT_MAX, 1);
11103 ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
11104 ++ INT_MAX, 1);
11105 ++STORE_FUNCTION(bfq_raising_min_idle_time_store,
11106 ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
11107 ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
11108 ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
11109 ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
11110 ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
11111 ++#undef STORE_FUNCTION
11112 ++
11113 ++/* do nothing for the moment */
11114 ++static ssize_t bfq_weights_store(struct elevator_queue *e,
11115 ++ const char *page, size_t count)
11116 ++{
11117 ++ return count;
11118 ++}
11119 ++
11120 ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
11121 ++{
11122 ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
11123 ++
11124 ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
11125 ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
11126 ++ else
11127 ++ return bfq_default_max_budget;
11128 ++}
11129 ++
11130 ++static ssize_t bfq_max_budget_store(struct elevator_queue *e,
11131 ++ const char *page, size_t count)
11132 ++{
11133 ++ struct bfq_data *bfqd = e->elevator_data;
11134 ++ unsigned long uninitialized_var(__data);
11135 ++ int ret = bfq_var_store(&__data, (page), count);
11136 ++
11137 ++ if (__data == 0)
11138 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
11139 ++ else {
11140 ++ if (__data > INT_MAX)
11141 ++ __data = INT_MAX;
11142 ++ bfqd->bfq_max_budget = __data;
11143 ++ }
11144 ++
11145 ++ bfqd->bfq_user_max_budget = __data;
11146 ++
11147 ++ return ret;
11148 ++}
11149 ++
11150 ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
11151 ++ const char *page, size_t count)
11152 ++{
11153 ++ struct bfq_data *bfqd = e->elevator_data;
11154 ++ unsigned long uninitialized_var(__data);
11155 ++ int ret = bfq_var_store(&__data, (page), count);
11156 ++
11157 ++ if (__data < 1)
11158 ++ __data = 1;
11159 ++ else if (__data > INT_MAX)
11160 ++ __data = INT_MAX;
11161 ++
11162 ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
11163 ++ if (bfqd->bfq_user_max_budget == 0)
11164 ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
11165 ++
11166 ++ return ret;
11167 ++}
11168 ++
11169 ++static ssize_t bfq_low_latency_store(struct elevator_queue *e,
11170 ++ const char *page, size_t count)
11171 ++{
11172 ++ struct bfq_data *bfqd = e->elevator_data;
11173 ++ unsigned long uninitialized_var(__data);
11174 ++ int ret = bfq_var_store(&__data, (page), count);
11175 ++
11176 ++ if (__data > 1)
11177 ++ __data = 1;
11178 ++ if (__data == 0 && bfqd->low_latency != 0)
11179 ++ bfq_end_raising(bfqd);
11180 ++ bfqd->low_latency = __data;
11181 ++
11182 ++ return ret;
11183 ++}
11184 ++
11185 ++#define BFQ_ATTR(name) \
11186 ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
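++/*
++ * BFQ_ATTR(slice_idle), for instance, expands to an __ATTR() entry exposing
++ * a "slice_idle" file with 0644 permissions, backed by handlers named
++ * bfq_slice_idle_show() and bfq_slice_idle_store() (the latter generated by
++ * STORE_FUNCTION() above).
++ */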
11187 ++
11188 ++static struct elv_fs_entry bfq_attrs[] = {
11189 ++ BFQ_ATTR(quantum),
11190 ++ BFQ_ATTR(fifo_expire_sync),
11191 ++ BFQ_ATTR(fifo_expire_async),
11192 ++ BFQ_ATTR(back_seek_max),
11193 ++ BFQ_ATTR(back_seek_penalty),
11194 ++ BFQ_ATTR(slice_idle),
11195 ++ BFQ_ATTR(max_budget),
11196 ++ BFQ_ATTR(max_budget_async_rq),
11197 ++ BFQ_ATTR(timeout_sync),
11198 ++ BFQ_ATTR(timeout_async),
11199 ++ BFQ_ATTR(low_latency),
11200 ++ BFQ_ATTR(raising_coeff),
11201 ++ BFQ_ATTR(raising_max_time),
11202 ++ BFQ_ATTR(raising_rt_max_time),
11203 ++ BFQ_ATTR(raising_min_idle_time),
11204 ++ BFQ_ATTR(raising_min_inter_arr_async),
11205 ++ BFQ_ATTR(raising_max_softrt_rate),
11206 ++ BFQ_ATTR(weights),
11207 ++ __ATTR_NULL
11208 ++};
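++/*
++ * While bfq is the active elevator these attributes appear under
++ * /sys/block/<device>/queue/iosched/; writing 0 to the low_latency file,
++ * for example, reaches bfq_low_latency_store(), which calls
++ * bfq_end_raising() to stop any weight raising in progress.
++ */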
11209 ++
11210 ++static struct elevator_type iosched_bfq = {
11211 ++ .ops = {
11212 ++ .elevator_merge_fn = bfq_merge,
11213 ++ .elevator_merged_fn = bfq_merged_request,
11214 ++ .elevator_merge_req_fn = bfq_merged_requests,
11215 ++ .elevator_allow_merge_fn = bfq_allow_merge,
11216 ++ .elevator_dispatch_fn = bfq_dispatch_requests,
11217 ++ .elevator_add_req_fn = bfq_insert_request,
11218 ++ .elevator_activate_req_fn = bfq_activate_request,
11219 ++ .elevator_deactivate_req_fn = bfq_deactivate_request,
11220 ++ .elevator_completed_req_fn = bfq_completed_request,
11221 ++ .elevator_former_req_fn = elv_rb_former_request,
11222 ++ .elevator_latter_req_fn = elv_rb_latter_request,
11223 ++ .elevator_init_icq_fn = bfq_init_icq,
11224 ++ .elevator_exit_icq_fn = bfq_exit_icq,
11225 ++ .elevator_set_req_fn = bfq_set_request,
11226 ++ .elevator_put_req_fn = bfq_put_request,
11227 ++ .elevator_may_queue_fn = bfq_may_queue,
11228 ++ .elevator_init_fn = bfq_init_queue,
11229 ++ .elevator_exit_fn = bfq_exit_queue,
11230 ++ },
11231 ++ .icq_size = sizeof(struct bfq_io_cq),
11232 ++ .icq_align = __alignof__(struct bfq_io_cq),
11233 ++ .elevator_attrs = bfq_attrs,
11234 ++ .elevator_name = "bfq",
11235 ++ .elevator_owner = THIS_MODULE,
11236 ++};
11237 ++
11238 ++static int __init bfq_init(void)
11239 ++{
11240 ++ /*
11241 ++ * Can be 0 on HZ < 1000 setups.
11242 ++ */
11243 ++ if (bfq_slice_idle == 0)
11244 ++ bfq_slice_idle = 1;
11245 ++
11246 ++ if (bfq_timeout_async == 0)
11247 ++ bfq_timeout_async = 1;
11248 ++
11249 ++ if (bfq_slab_setup())
11250 ++ return -ENOMEM;
11251 ++
11252 ++ elv_register(&iosched_bfq);
11253 ++
11254 ++ return 0;
11255 ++}
11256 ++
11257 ++static void __exit bfq_exit(void)
11258 ++{
11259 ++ elv_unregister(&iosched_bfq);
11260 ++ bfq_slab_kill();
11261 ++}
11262 ++
11263 ++module_init(bfq_init);
11264 ++module_exit(bfq_exit);
11265 ++
11266 ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
11267 ++MODULE_LICENSE("GPL");
11268 ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
11269 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
11270 +new file mode 100644
11271 +index 0000000..03f8061
11272 +--- /dev/null
11273 ++++ b/block/bfq-sched.c
11274 +@@ -0,0 +1,1072 @@
11275 ++/*
11276 ++ * BFQ: Hierarchical B-WF2Q+ scheduler.
11277 ++ *
11278 ++ * Based on ideas and code from CFQ:
11279 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11280 ++ *
11281 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11282 ++ * Paolo Valente <paolo.valente@×××××××.it>
11283 ++ *
11284 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11285 ++ */
11286 ++
11287 ++#ifdef CONFIG_CGROUP_BFQIO
11288 ++#define for_each_entity(entity) \
11289 ++ for (; entity != NULL; entity = entity->parent)
11290 ++
11291 ++#define for_each_entity_safe(entity, parent) \
11292 ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
11293 ++
11294 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11295 ++ int extract,
11296 ++ struct bfq_data *bfqd);
11297 ++
11298 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
11299 ++{
11300 ++ struct bfq_entity *bfqg_entity;
11301 ++ struct bfq_group *bfqg;
11302 ++ struct bfq_sched_data *group_sd;
11303 ++
11304 ++ BUG_ON(next_active == NULL);
11305 ++
11306 ++ group_sd = next_active->sched_data;
11307 ++
11308 ++ bfqg = container_of(group_sd, struct bfq_group, sched_data);
11309 ++ /*
11310 ++ * bfq_group's my_entity field is not NULL only if the group
11311 ++ * is not the root group. We must not touch the root entity
11312 ++ * as it must never become an active entity.
11313 ++ */
11314 ++ bfqg_entity = bfqg->my_entity;
11315 ++ if (bfqg_entity != NULL)
11316 ++ bfqg_entity->budget = next_active->budget;
11317 ++}
11318 ++
11319 ++static int bfq_update_next_active(struct bfq_sched_data *sd)
11320 ++{
11321 ++ struct bfq_entity *next_active;
11322 ++
11323 ++ if (sd->active_entity != NULL)
11324 ++ /* will update/requeue at the end of service */
11325 ++ return 0;
11326 ++
11327 ++ /*
11328 ++ * NOTE: this can be improved in many ways, such as returning
11329 ++ * 1 (and thus propagating upwards the update) only when the
11330 ++ * budget changes, or caching the bfqq that will be scheduled
11331 ++ * next from this subtree. For now we worry more about
11332 ++ * correctness than about performance...
11333 ++ */
11334 ++ next_active = bfq_lookup_next_entity(sd, 0, NULL);
11335 ++ sd->next_active = next_active;
11336 ++
11337 ++ if (next_active != NULL)
11338 ++ bfq_update_budget(next_active);
11339 ++
11340 ++ return 1;
11341 ++}
11342 ++
11343 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
11344 ++ struct bfq_entity *entity)
11345 ++{
11346 ++ BUG_ON(sd->next_active != entity);
11347 ++}
11348 ++#else
11349 ++#define for_each_entity(entity) \
11350 ++ for (; entity != NULL; entity = NULL)
11351 ++
11352 ++#define for_each_entity_safe(entity, parent) \
11353 ++ for (parent = NULL; entity != NULL; entity = parent)
11354 ++
11355 ++static inline int bfq_update_next_active(struct bfq_sched_data *sd)
11356 ++{
11357 ++ return 0;
11358 ++}
11359 ++
11360 ++static inline void bfq_check_next_active(struct bfq_sched_data *sd,
11361 ++ struct bfq_entity *entity)
11362 ++{
11363 ++}
11364 ++
11365 ++static inline void bfq_update_budget(struct bfq_entity *next_active)
11366 ++{
11367 ++}
11368 ++#endif
11369 ++
11370 ++/*
11371 ++ * Shift for timestamp calculations. This actually limits the maximum
11372 ++ * service allowed in one timestamp delta (small shift values increase it),
11373 ++ * the maximum total weight that can be used for the queues in the system
11374 ++ * (big shift values increase it), and the period of virtual time wraparounds.
11375 ++ */
11376 ++#define WFQ_SERVICE_SHIFT 22
11377 ++
11378 ++/**
11379 ++ * bfq_gt - compare two timestamps.
11380 ++ * @a: first ts.
11381 ++ * @b: second ts.
11382 ++ *
11383 ++ * Return @a > @b, dealing with wrapping correctly.
11384 ++ */
11385 ++static inline int bfq_gt(u64 a, u64 b)
11386 ++{
11387 ++ return (s64)(a - b) > 0;
11388 ++}
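++/*
++ * Example: right after a 64-bit wraparound, a timestamp a = 2 still compares
++ * as later than b = (u64)-4, since a - b wraps to 6 and (s64)6 > 0, whereas
++ * a plain a > b comparison would wrongly report a as older.
++ */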
11389 ++
11390 ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
11391 ++{
11392 ++ struct bfq_queue *bfqq = NULL;
11393 ++
11394 ++ BUG_ON(entity == NULL);
11395 ++
11396 ++ if (entity->my_sched_data == NULL)
11397 ++ bfqq = container_of(entity, struct bfq_queue, entity);
11398 ++
11399 ++ return bfqq;
11400 ++}
11401 ++
11402 ++
11403 ++/**
11404 ++ * bfq_delta - map service into the virtual time domain.
11405 ++ * @service: amount of service.
11406 ++ * @weight: scale factor (weight of an entity or weight sum).
11407 ++ */
11408 ++static inline u64 bfq_delta(unsigned long service,
11409 ++ unsigned long weight)
11410 ++{
11411 ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
11412 ++
11413 ++ do_div(d, weight);
11414 ++ return d;
11415 ++}
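++/*
++ * Numeric example: with WFQ_SERVICE_SHIFT == 22, bfq_delta(100, 4) is
++ * (100 << 22) / 4 = 104857600, while doubling the weight to 8 halves the
++ * result to 52428800: for equal service, an entity with a larger weight
++ * accumulates a smaller virtual-time increment.
++ */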
11416 ++
11417 ++/**
11418 ++ * bfq_calc_finish - assign the finish time to an entity.
11419 ++ * @entity: the entity to act upon.
11420 ++ * @service: the service to be charged to the entity.
11421 ++ */
11422 ++static inline void bfq_calc_finish(struct bfq_entity *entity,
11423 ++ unsigned long service)
11424 ++{
11425 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11426 ++
11427 ++ BUG_ON(entity->weight == 0);
11428 ++
11429 ++ entity->finish = entity->start +
11430 ++ bfq_delta(service, entity->weight);
11431 ++
11432 ++ if (bfqq != NULL) {
11433 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
11434 ++ "calc_finish: serv %lu, w %d",
11435 ++ service, entity->weight);
11436 ++ bfq_log_bfqq(bfqq->bfqd, bfqq,
11437 ++ "calc_finish: start %llu, finish %llu, delta %llu",
11438 ++ entity->start, entity->finish,
11439 ++ bfq_delta(service, entity->weight));
11440 ++ }
11441 ++}
11442 ++
11443 ++/**
11444 ++ * bfq_entity_of - get an entity from a node.
11445 ++ * @node: the node field of the entity.
11446 ++ *
11447 ++ * Convert a node pointer to the relative entity. This is used only
11448 ++ * to simplify the logic of some functions and not as the generic
11449 ++ * conversion mechanism because, e.g., in the tree walking functions,
11450 ++ * the check for a %NULL value would be redundant.
11451 ++ */
11452 ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
11453 ++{
11454 ++ struct bfq_entity *entity = NULL;
11455 ++
11456 ++ if (node != NULL)
11457 ++ entity = rb_entry(node, struct bfq_entity, rb_node);
11458 ++
11459 ++ return entity;
11460 ++}
11461 ++
11462 ++/**
11463 ++ * bfq_extract - remove an entity from a tree.
11464 ++ * @root: the tree root.
11465 ++ * @entity: the entity to remove.
11466 ++ */
11467 ++static inline void bfq_extract(struct rb_root *root,
11468 ++ struct bfq_entity *entity)
11469 ++{
11470 ++ BUG_ON(entity->tree != root);
11471 ++
11472 ++ entity->tree = NULL;
11473 ++ rb_erase(&entity->rb_node, root);
11474 ++}
11475 ++
11476 ++/**
11477 ++ * bfq_idle_extract - extract an entity from the idle tree.
11478 ++ * @st: the service tree of the owning @entity.
11479 ++ * @entity: the entity being removed.
11480 ++ */
11481 ++static void bfq_idle_extract(struct bfq_service_tree *st,
11482 ++ struct bfq_entity *entity)
11483 ++{
11484 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11485 ++ struct rb_node *next;
11486 ++
11487 ++ BUG_ON(entity->tree != &st->idle);
11488 ++
11489 ++ if (entity == st->first_idle) {
11490 ++ next = rb_next(&entity->rb_node);
11491 ++ st->first_idle = bfq_entity_of(next);
11492 ++ }
11493 ++
11494 ++ if (entity == st->last_idle) {
11495 ++ next = rb_prev(&entity->rb_node);
11496 ++ st->last_idle = bfq_entity_of(next);
11497 ++ }
11498 ++
11499 ++ bfq_extract(&st->idle, entity);
11500 ++
11501 ++ if (bfqq != NULL)
11502 ++ list_del(&bfqq->bfqq_list);
11503 ++}
11504 ++
11505 ++/**
11506 ++ * bfq_insert - generic tree insertion.
11507 ++ * @root: tree root.
11508 ++ * @entity: entity to insert.
11509 ++ *
11510 ++ * This is used for the idle and the active tree, since they are both
11511 ++ * ordered by finish time.
11512 ++ */
11513 ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
11514 ++{
11515 ++ struct bfq_entity *entry;
11516 ++ struct rb_node **node = &root->rb_node;
11517 ++ struct rb_node *parent = NULL;
11518 ++
11519 ++ BUG_ON(entity->tree != NULL);
11520 ++
11521 ++ while (*node != NULL) {
11522 ++ parent = *node;
11523 ++ entry = rb_entry(parent, struct bfq_entity, rb_node);
11524 ++
11525 ++ if (bfq_gt(entry->finish, entity->finish))
11526 ++ node = &parent->rb_left;
11527 ++ else
11528 ++ node = &parent->rb_right;
11529 ++ }
11530 ++
11531 ++ rb_link_node(&entity->rb_node, parent, node);
11532 ++ rb_insert_color(&entity->rb_node, root);
11533 ++
11534 ++ entity->tree = root;
11535 ++}
11536 ++
11537 ++/**
11538 ++ * bfq_update_min - update the min_start field of an entity.
11539 ++ * @entity: the entity to update.
11540 ++ * @node: one of its children.
11541 ++ *
11542 ++ * This function is called when @entity may store an invalid value for
11543 ++ * min_start due to updates to the active tree. The function assumes
11544 ++ * that the subtree rooted at @node (which may be its left or its right
11545 ++ * child) has a valid min_start value.
11546 ++ */
11547 ++static inline void bfq_update_min(struct bfq_entity *entity,
11548 ++ struct rb_node *node)
11549 ++{
11550 ++ struct bfq_entity *child;
11551 ++
11552 ++ if (node != NULL) {
11553 ++ child = rb_entry(node, struct bfq_entity, rb_node);
11554 ++ if (bfq_gt(entity->min_start, child->min_start))
11555 ++ entity->min_start = child->min_start;
11556 ++ }
11557 ++}
11558 ++
11559 ++/**
11560 ++ * bfq_update_active_node - recalculate min_start.
11561 ++ * @node: the node to update.
11562 ++ *
11563 ++ * @node may have changed position or one of its children may have moved,
11564 ++ * this function updates its min_start value. The left and right subtrees
11565 ++ * are assumed to hold a correct min_start value.
11566 ++ */
11567 ++static inline void bfq_update_active_node(struct rb_node *node)
11568 ++{
11569 ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
11570 ++
11571 ++ entity->min_start = entity->start;
11572 ++ bfq_update_min(entity, node->rb_right);
11573 ++ bfq_update_min(entity, node->rb_left);
11574 ++}
11575 ++
11576 ++/**
11577 ++ * bfq_update_active_tree - update min_start for the whole active tree.
11578 ++ * @node: the starting node.
11579 ++ *
11580 ++ * @node must be the deepest modified node after an update. This function
11581 ++ * updates its min_start using the values held by its children, assuming
11582 ++ * that they did not change, and then updates all the nodes that may have
11583 ++ * changed in the path to the root. The only nodes that may have changed
11584 ++ * are the ones in the path or their siblings.
11585 ++ */
11586 ++static void bfq_update_active_tree(struct rb_node *node)
11587 ++{
11588 ++ struct rb_node *parent;
11589 ++
11590 ++up:
11591 ++ bfq_update_active_node(node);
11592 ++
11593 ++ parent = rb_parent(node);
11594 ++ if (parent == NULL)
11595 ++ return;
11596 ++
11597 ++ if (node == parent->rb_left && parent->rb_right != NULL)
11598 ++ bfq_update_active_node(parent->rb_right);
11599 ++ else if (parent->rb_left != NULL)
11600 ++ bfq_update_active_node(parent->rb_left);
11601 ++
11602 ++ node = parent;
11603 ++ goto up;
11604 ++}
11605 ++
11606 ++/**
11607 ++ * bfq_active_insert - insert an entity in the active tree of its group/device.
11608 ++ * @st: the service tree of the entity.
11609 ++ * @entity: the entity being inserted.
11610 ++ *
11611 ++ * The active tree is ordered by finish time, but an extra key is kept
11612 ++ * for each node, containing the minimum value for the start times of
11613 ++ * its children (and the node itself), so it's possible to search for
11614 ++ * the eligible node with the lowest finish time in logarithmic time.
11615 ++ */
11616 ++static void bfq_active_insert(struct bfq_service_tree *st,
11617 ++ struct bfq_entity *entity)
11618 ++{
11619 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11620 ++ struct rb_node *node = &entity->rb_node;
11621 ++
11622 ++ bfq_insert(&st->active, entity);
11623 ++
11624 ++ if (node->rb_left != NULL)
11625 ++ node = node->rb_left;
11626 ++ else if (node->rb_right != NULL)
11627 ++ node = node->rb_right;
11628 ++
11629 ++ bfq_update_active_tree(node);
11630 ++
11631 ++ if (bfqq != NULL)
11632 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
11633 ++}
11634 ++
11635 ++/**
11636 ++ * bfq_ioprio_to_weight - calc a weight from an ioprio.
11637 ++ * @ioprio: the ioprio value to convert.
11638 ++ */
11639 ++static unsigned short bfq_ioprio_to_weight(int ioprio)
11640 ++{
11641 ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
11642 ++ return IOPRIO_BE_NR - ioprio;
11643 ++}
11644 ++
11645 ++/**
11646 ++ * bfq_weight_to_ioprio - calc an ioprio from a weight.
11647 ++ * @weight: the weight value to convert.
11648 ++ *
11649 ++ * To preserve as much as possible the old only-ioprio user interface,
11650 ++ * 0 is used as an escape ioprio value for weights (numerically) equal to
11651 ++ * or larger than IOPRIO_BE_NR.
11652 ++ */
11653 ++static unsigned short bfq_weight_to_ioprio(int weight)
11654 ++{
11655 ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
11656 ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
11657 ++}
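++/*
++ * With IOPRIO_BE_NR == 8, the two helpers above map best-effort ioprio
++ * values 0..7 to weights 8..1 and back; any weight of 8 or more (up to
++ * BFQ_MAX_WEIGHT, e.g. one set through the cgroup interface) maps back to
++ * the escape ioprio 0.
++ */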
11658 ++
11659 ++static inline void bfq_get_entity(struct bfq_entity *entity)
11660 ++{
11661 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11662 ++ struct bfq_sched_data *sd;
11663 ++
11664 ++ if (bfqq != NULL) {
11665 ++ sd = entity->sched_data;
11666 ++ atomic_inc(&bfqq->ref);
11667 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
11668 ++ bfqq, atomic_read(&bfqq->ref));
11669 ++ }
11670 ++}
11671 ++
11672 ++/**
11673 ++ * bfq_find_deepest - find the deepest node that an extraction can modify.
11674 ++ * @node: the node being removed.
11675 ++ *
11676 ++ * Do the first step of an extraction in an rb tree, looking for the
11677 ++ * node that will replace @node, and returning the deepest node that
11678 ++ * the following modifications to the tree can touch. If @node is the
11679 ++ * last node in the tree return %NULL.
11680 ++ */
11681 ++static struct rb_node *bfq_find_deepest(struct rb_node *node)
11682 ++{
11683 ++ struct rb_node *deepest;
11684 ++
11685 ++ if (node->rb_right == NULL && node->rb_left == NULL)
11686 ++ deepest = rb_parent(node);
11687 ++ else if (node->rb_right == NULL)
11688 ++ deepest = node->rb_left;
11689 ++ else if (node->rb_left == NULL)
11690 ++ deepest = node->rb_right;
11691 ++ else {
11692 ++ deepest = rb_next(node);
11693 ++ if (deepest->rb_right != NULL)
11694 ++ deepest = deepest->rb_right;
11695 ++ else if (rb_parent(deepest) != node)
11696 ++ deepest = rb_parent(deepest);
11697 ++ }
11698 ++
11699 ++ return deepest;
11700 ++}
11701 ++
11702 ++/**
11703 ++ * bfq_active_extract - remove an entity from the active tree.
11704 ++ * @st: the service_tree containing the tree.
11705 ++ * @entity: the entity being removed.
11706 ++ */
11707 ++static void bfq_active_extract(struct bfq_service_tree *st,
11708 ++ struct bfq_entity *entity)
11709 ++{
11710 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11711 ++ struct rb_node *node;
11712 ++
11713 ++ node = bfq_find_deepest(&entity->rb_node);
11714 ++ bfq_extract(&st->active, entity);
11715 ++
11716 ++ if (node != NULL)
11717 ++ bfq_update_active_tree(node);
11718 ++
11719 ++ if (bfqq != NULL)
11720 ++ list_del(&bfqq->bfqq_list);
11721 ++}
11722 ++
11723 ++/**
11724 ++ * bfq_idle_insert - insert an entity into the idle tree.
11725 ++ * @st: the service tree containing the tree.
11726 ++ * @entity: the entity to insert.
11727 ++ */
11728 ++static void bfq_idle_insert(struct bfq_service_tree *st,
11729 ++ struct bfq_entity *entity)
11730 ++{
11731 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11732 ++ struct bfq_entity *first_idle = st->first_idle;
11733 ++ struct bfq_entity *last_idle = st->last_idle;
11734 ++
11735 ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
11736 ++ st->first_idle = entity;
11737 ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
11738 ++ st->last_idle = entity;
11739 ++
11740 ++ bfq_insert(&st->idle, entity);
11741 ++
11742 ++ if (bfqq != NULL)
11743 ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
11744 ++}
11745 ++
11746 ++/**
11747 ++ * bfq_forget_entity - remove an entity from the wfq trees.
11748 ++ * @st: the service tree.
11749 ++ * @entity: the entity being removed.
11750 ++ *
11751 ++ * Update the device status and forget everything about @entity, putting
11752 ++ * the device reference to it, if it is a queue. Entities belonging to
11753 ++ * groups are not refcounted.
11754 ++ */
11755 ++static void bfq_forget_entity(struct bfq_service_tree *st,
11756 ++ struct bfq_entity *entity)
11757 ++{
11758 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11759 ++ struct bfq_sched_data *sd;
11760 ++
11761 ++ BUG_ON(!entity->on_st);
11762 ++
11763 ++ entity->on_st = 0;
11764 ++ st->wsum -= entity->weight;
11765 ++ if (bfqq != NULL) {
11766 ++ sd = entity->sched_data;
11767 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
11768 ++ bfqq, atomic_read(&bfqq->ref));
11769 ++ bfq_put_queue(bfqq);
11770 ++ }
11771 ++}
11772 ++
11773 ++/**
11774 ++ * bfq_put_idle_entity - release the idle tree ref of an entity.
11775 ++ * @st: service tree for the entity.
11776 ++ * @entity: the entity being released.
11777 ++ */
11778 ++static void bfq_put_idle_entity(struct bfq_service_tree *st,
11779 ++ struct bfq_entity *entity)
11780 ++{
11781 ++ bfq_idle_extract(st, entity);
11782 ++ bfq_forget_entity(st, entity);
11783 ++}
11784 ++
11785 ++/**
11786 ++ * bfq_forget_idle - update the idle tree if necessary.
11787 ++ * @st: the service tree to act upon.
11788 ++ *
11789 ++ * To preserve the global O(log N) complexity we only remove one entry here;
11790 ++ * as the idle tree will not grow indefinitely this can be done safely.
11791 ++ */
11792 ++static void bfq_forget_idle(struct bfq_service_tree *st)
11793 ++{
11794 ++ struct bfq_entity *first_idle = st->first_idle;
11795 ++ struct bfq_entity *last_idle = st->last_idle;
11796 ++
11797 ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
11798 ++ !bfq_gt(last_idle->finish, st->vtime)) {
11799 ++ /*
11800 ++ * Forget the whole idle tree, increasing the vtime past
11801 ++ * the last finish time of idle entities.
11802 ++ */
11803 ++ st->vtime = last_idle->finish;
11804 ++ }
11805 ++
11806 ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
11807 ++ bfq_put_idle_entity(st, first_idle);
11808 ++}
11809 ++
11810 ++static struct bfq_service_tree *
11811 ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
11812 ++ struct bfq_entity *entity)
11813 ++{
11814 ++ struct bfq_service_tree *new_st = old_st;
11815 ++
11816 ++ if (entity->ioprio_changed) {
11817 ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11818 ++
11819 ++ BUG_ON(old_st->wsum < entity->weight);
11820 ++ old_st->wsum -= entity->weight;
11821 ++
11822 ++ if (entity->new_weight != entity->orig_weight) {
11823 ++ entity->orig_weight = entity->new_weight;
11824 ++ entity->ioprio =
11825 ++ bfq_weight_to_ioprio(entity->orig_weight);
11826 ++ } else if (entity->new_ioprio != entity->ioprio) {
11827 ++ entity->ioprio = entity->new_ioprio;
11828 ++ entity->orig_weight =
11829 ++ bfq_ioprio_to_weight(entity->ioprio);
11830 ++ } else
11831 ++ entity->new_weight = entity->orig_weight =
11832 ++ bfq_ioprio_to_weight(entity->ioprio);
11833 ++
11834 ++ entity->ioprio_class = entity->new_ioprio_class;
11835 ++ entity->ioprio_changed = 0;
11836 ++
11837 ++ /*
11838 ++ * NOTE: here we may be changing the weight too early,
11839 ++ * this will cause unfairness. The correct approach
11840 ++ * would have required additional complexity to defer
11841 ++ * weight changes to the proper time instants (i.e.,
11842 ++ * when entity->finish <= old_st->vtime).
11843 ++ */
11844 ++ new_st = bfq_entity_service_tree(entity);
11845 ++ entity->weight = entity->orig_weight *
11846 ++ (bfqq != NULL ? bfqq->raising_coeff : 1);
11847 ++ new_st->wsum += entity->weight;
11848 ++
11849 ++ if (new_st != old_st)
11850 ++ entity->start = new_st->vtime;
11851 ++ }
11852 ++
11853 ++ return new_st;
11854 ++}
11855 ++
11856 ++/**
11857 ++ * bfq_bfqq_served - update the scheduler status after selection for service.
11858 ++ * @bfqq: the queue being served.
11859 ++ * @served: bytes to transfer.
11860 ++ *
11861 ++ * NOTE: this can be optimized, as the timestamps of upper level entities
11862 ++ * are synchronized every time a new bfqq is selected for service. For now,
11863 ++ * we keep it to better check consistency.
11864 ++ */
11865 ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
11866 ++{
11867 ++ struct bfq_entity *entity = &bfqq->entity;
11868 ++ struct bfq_service_tree *st;
11869 ++
11870 ++ for_each_entity(entity) {
11871 ++ st = bfq_entity_service_tree(entity);
11872 ++
11873 ++ entity->service += served;
11874 ++ BUG_ON(entity->service > entity->budget);
11875 ++ BUG_ON(st->wsum == 0);
11876 ++
11877 ++ st->vtime += bfq_delta(served, st->wsum);
11878 ++ bfq_forget_idle(st);
11879 ++ }
11880 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
11881 ++}
11882 ++
11883 ++/**
11884 ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
11885 ++ * @bfqq: the queue that needs a service update.
11886 ++ *
11887 ++ * When it's not possible to be fair in the service domain, because
11888 ++ * a queue is not consuming its budget fast enough (the meaning of
11889 ++ * fast depends on the timeout parameter), we charge it a full
11890 ++ * budget. In this way we should obtain a sort of time-domain
11891 ++ * fairness among all the seeky/slow queues.
11892 ++ */
11893 ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
11894 ++{
11895 ++ struct bfq_entity *entity = &bfqq->entity;
11896 ++
11897 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
11898 ++
11899 ++ bfq_bfqq_served(bfqq, entity->budget - entity->service);
11900 ++}
11901 ++
11902 ++/**
11903 ++ * __bfq_activate_entity - activate an entity.
11904 ++ * @entity: the entity being activated.
11905 ++ *
11906 ++ * Called whenever an entity is activated, i.e., it is not active and one
11907 ++ * of its children receives a new request, or has to be reactivated due to
11908 ++ * budget exhaustion. It uses the current budget of the entity (and the
11909 ++ * service received if @entity is active) to calculate its
11910 ++ * timestamps.
11911 ++ */
11912 ++static void __bfq_activate_entity(struct bfq_entity *entity)
11913 ++{
11914 ++ struct bfq_sched_data *sd = entity->sched_data;
11915 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11916 ++
11917 ++ if (entity == sd->active_entity) {
11918 ++ BUG_ON(entity->tree != NULL);
11919 ++ /*
11920 ++ * If we are requeueing the current entity we have
11921 ++ * to take care of not charging to it service it has
11922 ++ * not received.
11923 ++ */
11924 ++ bfq_calc_finish(entity, entity->service);
11925 ++ entity->start = entity->finish;
11926 ++ sd->active_entity = NULL;
11927 ++ } else if (entity->tree == &st->active) {
11928 ++ /*
11929 ++ * Requeueing an entity due to a change of some
11930 ++ * next_active entity below it. We reuse the old
11931 ++ * start time.
11932 ++ */
11933 ++ bfq_active_extract(st, entity);
11934 ++ } else if (entity->tree == &st->idle) {
11935 ++ /*
11936 ++ * Must be on the idle tree, bfq_idle_extract() will
11937 ++ * check for that.
11938 ++ */
11939 ++ bfq_idle_extract(st, entity);
11940 ++ entity->start = bfq_gt(st->vtime, entity->finish) ?
11941 ++ st->vtime : entity->finish;
11942 ++ } else {
11943 ++ /*
11944 ++ * The finish time of the entity may be invalid, and
11945 ++ * it is in the past for sure, otherwise the queue
11946 ++ * would have been on the idle tree.
11947 ++ */
11948 ++ entity->start = st->vtime;
11949 ++ st->wsum += entity->weight;
11950 ++ bfq_get_entity(entity);
11951 ++
11952 ++ BUG_ON(entity->on_st);
11953 ++ entity->on_st = 1;
11954 ++ }
11955 ++
11956 ++ st = __bfq_entity_update_weight_prio(st, entity);
11957 ++ bfq_calc_finish(entity, entity->budget);
11958 ++ bfq_active_insert(st, entity);
11959 ++}
11960 ++
11961 ++/**
11962 ++ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11963 ++ * @entity: the entity to activate.
11964 ++ *
11965 ++ * Activate @entity and all the entities on the path from it to the root.
11966 ++ */
11967 ++static void bfq_activate_entity(struct bfq_entity *entity)
11968 ++{
11969 ++ struct bfq_sched_data *sd;
11970 ++
11971 ++ for_each_entity(entity) {
11972 ++ __bfq_activate_entity(entity);
11973 ++
11974 ++ sd = entity->sched_data;
11975 ++ if (!bfq_update_next_active(sd))
11976 ++ /*
11977 ++ * No need to propagate the activation to the
11978 ++ * upper entities, as they will be updated when
11979 ++ * the active entity is rescheduled.
11980 ++ */
11981 ++ break;
11982 ++ }
11983 ++}
11984 ++
11985 ++/**
11986 ++ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11987 ++ * @entity: the entity to deactivate.
11988 ++ * @requeue: if false, the entity will not be put into the idle tree.
11989 ++ *
11990 ++ * Deactivate an entity, independently from its previous state. If the
11991 ++ * entity was not on a service tree just return, otherwise if it is on
11992 ++ * any scheduler tree, extract it from that tree, and if necessary
11993 ++ * and if the caller specified @requeue, put it on the idle tree.
11994 ++ *
11995 ++ * Return %1 if the caller should update the entity hierarchy, i.e.,
11996 ++ * if the entity was under service or if it was the next_active for
11997 ++ * its sched_data; return %0 otherwise.
11998 ++ */
11999 ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
12000 ++{
12001 ++ struct bfq_sched_data *sd = entity->sched_data;
12002 ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
12003 ++ int was_active = entity == sd->active_entity;
12004 ++ int ret = 0;
12005 ++
12006 ++ if (!entity->on_st)
12007 ++ return 0;
12008 ++
12009 ++ BUG_ON(was_active && entity->tree != NULL);
12010 ++
12011 ++ if (was_active) {
12012 ++ bfq_calc_finish(entity, entity->service);
12013 ++ sd->active_entity = NULL;
12014 ++ } else if (entity->tree == &st->active)
12015 ++ bfq_active_extract(st, entity);
12016 ++ else if (entity->tree == &st->idle)
12017 ++ bfq_idle_extract(st, entity);
12018 ++ else if (entity->tree != NULL)
12019 ++ BUG();
12020 ++
12021 ++ if (was_active || sd->next_active == entity)
12022 ++ ret = bfq_update_next_active(sd);
12023 ++
12024 ++ if (!requeue || !bfq_gt(entity->finish, st->vtime))
12025 ++ bfq_forget_entity(st, entity);
12026 ++ else
12027 ++ bfq_idle_insert(st, entity);
12028 ++
12029 ++ BUG_ON(sd->active_entity == entity);
12030 ++ BUG_ON(sd->next_active == entity);
12031 ++
12032 ++ return ret;
12033 ++}
12034 ++
12035 ++/**
12036 ++ * bfq_deactivate_entity - deactivate an entity.
12037 ++ * @entity: the entity to deactivate.
12038 ++ * @requeue: true if the entity can be put on the idle tree
12039 ++ */
12040 ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
12041 ++{
12042 ++ struct bfq_sched_data *sd;
12043 ++ struct bfq_entity *parent;
12044 ++
12045 ++ for_each_entity_safe(entity, parent) {
12046 ++ sd = entity->sched_data;
12047 ++
12048 ++ if (!__bfq_deactivate_entity(entity, requeue))
12049 ++ /*
12050 ++ * The parent entity is still backlogged, and
12051 ++ * we don't need to update it as it is still
12052 ++ * under service.
12053 ++ */
12054 ++ break;
12055 ++
12056 ++ if (sd->next_active != NULL)
12057 ++ /*
12058 ++ * The parent entity is still backlogged and
12059 ++ * the budgets on the path towards the root
12060 ++ * need to be updated.
12061 ++ */
12062 ++ goto update;
12063 ++
12064 ++ /*
12065 ++ * If we reach this point, the parent is no longer backlogged and
12066 ++ * we want to propagate the dequeue upwards.
12067 ++ */
12068 ++ requeue = 1;
12069 ++ }
12070 ++
12071 ++ return;
12072 ++
12073 ++update:
12074 ++ entity = parent;
12075 ++ for_each_entity(entity) {
12076 ++ __bfq_activate_entity(entity);
12077 ++
12078 ++ sd = entity->sched_data;
12079 ++ if (!bfq_update_next_active(sd))
12080 ++ break;
12081 ++ }
12082 ++}
12083 ++
12084 ++/**
12085 ++ * bfq_update_vtime - update vtime if necessary.
12086 ++ * @st: the service tree to act upon.
12087 ++ *
12088 ++ * If necessary update the service tree vtime to have at least one
12089 ++ * eligible entity, skipping to its start time. Assumes that the
12090 ++ * active tree of the device is not empty.
12091 ++ *
12092 ++ * NOTE: this hierarchical implementation updates vtimes quite often,
12093 ++ * we may end up with reactivated tasks getting timestamps after a
12094 ++ * vtime skip done because we needed a ->first_active entity on some
12095 ++ * intermediate node.
12096 ++ */
12097 ++static void bfq_update_vtime(struct bfq_service_tree *st)
12098 ++{
12099 ++ struct bfq_entity *entry;
12100 ++ struct rb_node *node = st->active.rb_node;
12101 ++
12102 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
12103 ++ if (bfq_gt(entry->min_start, st->vtime)) {
12104 ++ st->vtime = entry->min_start;
12105 ++ bfq_forget_idle(st);
12106 ++ }
12107 ++}
12108 ++
12109 ++/**
12110 ++ * bfq_first_active_entity - find the eligible entity with the smallest finish time
12111 ++ * @st: the service tree to select from.
12112 ++ *
12113 ++ * This function searches for the first schedulable entity, starting from the
12114 ++ * root of the tree and going on the left every time on this side there is
12115 ++ * a subtree with at least one eligible (start >= vtime) entity. The path
12116 ++ * on the right is followed only if a) the left subtree contains no eligible
12117 ++ * entities and b) no eligible entity has been found yet.
12118 ++ */
12119 ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
12120 ++{
12121 ++ struct bfq_entity *entry, *first = NULL;
12122 ++ struct rb_node *node = st->active.rb_node;
12123 ++
12124 ++ while (node != NULL) {
12125 ++ entry = rb_entry(node, struct bfq_entity, rb_node);
12126 ++left:
12127 ++ if (!bfq_gt(entry->start, st->vtime))
12128 ++ first = entry;
12129 ++
12130 ++ BUG_ON(bfq_gt(entry->min_start, st->vtime));
12131 ++
12132 ++ if (node->rb_left != NULL) {
12133 ++ entry = rb_entry(node->rb_left,
12134 ++ struct bfq_entity, rb_node);
12135 ++ if (!bfq_gt(entry->min_start, st->vtime)) {
12136 ++ node = node->rb_left;
12137 ++ goto left;
12138 ++ }
12139 ++ }
12140 ++ if (first != NULL)
12141 ++ break;
12142 ++ node = node->rb_right;
12143 ++ }
12144 ++
12145 ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
12146 ++ return first;
12147 ++}
12148 ++
12149 ++/**
12150 ++ * __bfq_lookup_next_entity - return the first eligible entity in @st.
12151 ++ * @st: the service tree.
12152 ++ *
12153 ++ * Update the virtual time in @st and return the first eligible entity
12154 ++ * it contains.
12155 ++ */
12156 ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
12157 ++ bool force)
12158 ++{
12159 ++ struct bfq_entity *entity, *new_next_active = NULL;
12160 ++
12161 ++ if (RB_EMPTY_ROOT(&st->active))
12162 ++ return NULL;
12163 ++
12164 ++ bfq_update_vtime(st);
12165 ++ entity = bfq_first_active_entity(st);
12166 ++ BUG_ON(bfq_gt(entity->start, st->vtime));
12167 ++
12168 ++ /*
12169 ++ * If the chosen entity does not match with the sched_data's
12170 ++ * next_active and we are forcedly serving the IDLE priority
12171 ++ * class tree, bubble up budget update.
12172 ++ */
12173 ++ if (unlikely(force && entity != entity->sched_data->next_active)) {
12174 ++ new_next_active = entity;
12175 ++ for_each_entity(new_next_active)
12176 ++ bfq_update_budget(new_next_active);
12177 ++ }
12178 ++
12179 ++ return entity;
12180 ++}
12181 ++
12182 ++/**
12183 ++ * bfq_lookup_next_entity - return the first eligible entity in @sd.
12184 ++ * @sd: the sched_data.
12185 ++ * @extract: if true the returned entity will be also extracted from @sd.
12186 ++ *
12187 ++ * NOTE: since we cache the next_active entity at each level of the
12188 ++ * hierarchy, the complexity of the lookup can be decreased with
12189 ++ * absolutely no effort just returning the cached next_active value;
12190 ++ * we prefer to do full lookups to test the consistency of the data
12191 ++ * structures.
12192 ++ */
12193 ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
12194 ++ int extract,
12195 ++ struct bfq_data *bfqd)
12196 ++{
12197 ++ struct bfq_service_tree *st = sd->service_tree;
12198 ++ struct bfq_entity *entity;
12199 ++ int i = 0;
12200 ++
12201 ++ BUG_ON(sd->active_entity != NULL);
12202 ++
12203 ++ if (bfqd != NULL &&
12204 ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
12205 ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true);
12206 ++ if (entity != NULL) {
12207 ++ i = BFQ_IOPRIO_CLASSES - 1;
12208 ++ bfqd->bfq_class_idle_last_service = jiffies;
12209 ++ sd->next_active = entity;
12210 ++ }
12211 ++ }
12212 ++ for (; i < BFQ_IOPRIO_CLASSES; i++) {
12213 ++ entity = __bfq_lookup_next_entity(st + i, false);
12214 ++ if (entity != NULL) {
12215 ++ if (extract) {
12216 ++ bfq_check_next_active(sd, entity);
12217 ++ bfq_active_extract(st + i, entity);
12218 ++ sd->active_entity = entity;
12219 ++ sd->next_active = NULL;
12220 ++ }
12221 ++ break;
12222 ++ }
12223 ++ }
12224 ++
12225 ++ return entity;
12226 ++}
12227 ++
12228 ++/*
12229 ++ * Get next queue for service.
12230 ++ */
12231 ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
12232 ++{
12233 ++ struct bfq_entity *entity = NULL;
12234 ++ struct bfq_sched_data *sd;
12235 ++ struct bfq_queue *bfqq;
12236 ++
12237 ++ BUG_ON(bfqd->active_queue != NULL);
12238 ++
12239 ++ if (bfqd->busy_queues == 0)
12240 ++ return NULL;
12241 ++
12242 ++ sd = &bfqd->root_group->sched_data;
12243 ++ for (; sd != NULL; sd = entity->my_sched_data) {
12244 ++ entity = bfq_lookup_next_entity(sd, 1, bfqd);
12245 ++ BUG_ON(entity == NULL);
12246 ++ entity->service = 0;
12247 ++ }
12248 ++
12249 ++ bfqq = bfq_entity_to_bfqq(entity);
12250 ++ BUG_ON(bfqq == NULL);
12251 ++
12252 ++ return bfqq;
12253 ++}
12254 ++
12255 ++/*
12256 ++ * Forced extraction of the given queue.
12257 ++ */
12258 ++static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
12259 ++ struct bfq_queue *bfqq)
12260 ++{
12261 ++ struct bfq_entity *entity;
12262 ++ struct bfq_sched_data *sd;
12263 ++
12264 ++ BUG_ON(bfqd->active_queue != NULL);
12265 ++
12266 ++ entity = &bfqq->entity;
12267 ++ /*
12268 ++ * Bubble up extraction/update from the leaf to the root.
12269 ++ */
12270 ++ for_each_entity(entity) {
12271 ++ sd = entity->sched_data;
12272 ++ bfq_update_budget(entity);
12273 ++ bfq_update_vtime(bfq_entity_service_tree(entity));
12274 ++ bfq_active_extract(bfq_entity_service_tree(entity), entity);
12275 ++ sd->active_entity = entity;
12276 ++ sd->next_active = NULL;
12277 ++ entity->service = 0;
12278 ++ }
12279 ++
12280 ++ return;
12281 ++}
12282 ++
12283 ++static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
12284 ++{
12285 ++ if (bfqd->active_bic != NULL) {
12286 ++ put_io_context(bfqd->active_bic->icq.ioc);
12287 ++ bfqd->active_bic = NULL;
12288 ++ }
12289 ++
12290 ++ bfqd->active_queue = NULL;
12291 ++ del_timer(&bfqd->idle_slice_timer);
12292 ++}
12293 ++
12294 ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12295 ++ int requeue)
12296 ++{
12297 ++ struct bfq_entity *entity = &bfqq->entity;
12298 ++
12299 ++ if (bfqq == bfqd->active_queue)
12300 ++ __bfq_bfqd_reset_active(bfqd);
12301 ++
12302 ++ bfq_deactivate_entity(entity, requeue);
12303 ++}
12304 ++
12305 ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
12306 ++{
12307 ++ struct bfq_entity *entity = &bfqq->entity;
12308 ++
12309 ++ bfq_activate_entity(entity);
12310 ++}
12311 ++
12312 ++/*
12313 ++ * Called when the bfqq no longer has requests pending, remove it from
12314 ++ * the service tree.
12315 ++ */
12316 ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12317 ++ int requeue)
12318 ++{
12319 ++ BUG_ON(!bfq_bfqq_busy(bfqq));
12320 ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
12321 ++
12322 ++ bfq_log_bfqq(bfqd, bfqq, "del from busy");
12323 ++
12324 ++ bfq_clear_bfqq_busy(bfqq);
12325 ++
12326 ++ BUG_ON(bfqd->busy_queues == 0);
12327 ++ bfqd->busy_queues--;
12328 ++
12329 ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
12330 ++}
12331 ++
12332 ++/*
12333 ++ * Called when an inactive queue receives a new request.
12334 ++ */
12335 ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
12336 ++{
12337 ++ BUG_ON(bfq_bfqq_busy(bfqq));
12338 ++ BUG_ON(bfqq == bfqd->active_queue);
12339 ++
12340 ++ bfq_log_bfqq(bfqd, bfqq, "add to busy");
12341 ++
12342 ++ bfq_activate_bfqq(bfqd, bfqq);
12343 ++
12344 ++ bfq_mark_bfqq_busy(bfqq);
12345 ++ bfqd->busy_queues++;
12346 ++}
12347 +diff --git a/block/bfq.h b/block/bfq.h
12348 +new file mode 100644
12349 +index 0000000..48ecde9
12350 +--- /dev/null
12351 ++++ b/block/bfq.h
12352 +@@ -0,0 +1,603 @@
12353 ++/*
12354 ++ * BFQ-v6r2 for 3.10.0: data structures and common functions prototypes.
12355 ++ *
12356 ++ * Based on ideas and code from CFQ:
12357 ++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
12358 ++ *
12359 ++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
12360 ++ * Paolo Valente <paolo.valente@×××××××.it>
12361 ++ *
12362 ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
12363 ++ */
12364 ++
12365 ++#ifndef _BFQ_H
12366 ++#define _BFQ_H
12367 ++
12368 ++#include <linux/blktrace_api.h>
12369 ++#include <linux/hrtimer.h>
12370 ++#include <linux/ioprio.h>
12371 ++#include <linux/rbtree.h>
12372 ++
12373 ++#define BFQ_IOPRIO_CLASSES 3
12374 ++#define BFQ_CL_IDLE_TIMEOUT HZ/5
12375 ++
12376 ++#define BFQ_MIN_WEIGHT 1
12377 ++#define BFQ_MAX_WEIGHT 1000
12378 ++
12379 ++#define BFQ_DEFAULT_GRP_WEIGHT 10
12380 ++#define BFQ_DEFAULT_GRP_IOPRIO 0
12381 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
12382 ++
12383 ++struct bfq_entity;
12384 ++
12385 ++/**
12386 ++ * struct bfq_service_tree - per ioprio_class service tree.
12387 ++ * @active: tree for active entities (i.e., those backlogged).
12388 ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
12389 ++ * @first_idle: idle entity with minimum F_i.
12390 ++ * @last_idle: idle entity with maximum F_i.
12391 ++ * @vtime: scheduler virtual time.
12392 ++ * @wsum: scheduler weight sum; active and idle entities contribute to it.
12393 ++ *
12394 ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
12395 ++ * ioprio_class has its own independent scheduler, and so its own
12396 ++ * bfq_service_tree. All the fields are protected by the queue lock
12397 ++ * of the containing bfqd.
12398 ++ */
12399 ++struct bfq_service_tree {
12400 ++ struct rb_root active;
12401 ++ struct rb_root idle;
12402 ++
12403 ++ struct bfq_entity *first_idle;
12404 ++ struct bfq_entity *last_idle;
12405 ++
12406 ++ u64 vtime;
12407 ++ unsigned long wsum;
12408 ++};
12409 ++
12410 ++/**
12411 ++ * struct bfq_sched_data - multi-class scheduler.
12412 ++ * @active_entity: entity under service.
12413 ++ * @next_active: head-of-the-line entity in the scheduler.
12414 ++ * @service_tree: array of service trees, one per ioprio_class.
12415 ++ *
12416 ++ * bfq_sched_data is the basic scheduler queue. It supports three
12417 ++ * ioprio_classes, and can be used either as a toplevel queue or as
12418 ++ * an intermediate queue in a hierarchical setup.
12419 ++ * @next_active points to the active entity of the sched_data service
12420 ++ * trees that will be scheduled next.
12421 ++ *
12422 ++ * The supported ioprio_classes are the same as in CFQ, in descending
12423 ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
12424 ++ * Requests from higher priority queues are served before all the
12425 ++ * requests from lower priority queues; among requests of the same
12426 ++ * queue requests are served according to B-WF2Q+.
12427 ++ * All the fields are protected by the queue lock of the containing bfqd.
12428 ++ */
12429 ++struct bfq_sched_data {
12430 ++ struct bfq_entity *active_entity;
12431 ++ struct bfq_entity *next_active;
12432 ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
12433 ++};
12434 ++
12435 ++/**
12436 ++ * struct bfq_entity - schedulable entity.
12437 ++ * @rb_node: service_tree member.
12438 ++ * @on_st: flag, true if the entity is on a tree (either the active or
12439 ++ * the idle one of its service_tree).
12440 ++ * @finish: B-WF2Q+ finish timestamp (aka F_i).
12441 ++ * @start: B-WF2Q+ start timestamp (aka S_i).
12442 ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
12443 ++ * @min_start: minimum start time of the (active) subtree rooted at
12444 ++ * this entity; used for O(log N) lookups into active trees.
12445 ++ * @service: service received during the last round of service.
12446 ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
12447 ++ * @weight: weight of the queue
12448 ++ * @parent: parent entity, for hierarchical scheduling.
12449 ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
12450 ++ * associated scheduler queue, %NULL on leaf nodes.
12451 ++ * @sched_data: the scheduler queue this entity belongs to.
12452 ++ * @ioprio: the ioprio in use.
12453 ++ * @new_weight: when a weight change is requested, the new weight value.
12454 ++ * @orig_weight: original weight, used to implement weight boosting
12455 ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
12456 ++ * @ioprio_class: the ioprio_class in use.
12457 ++ * @new_ioprio_class: when an ioprio_class change is requested, the new
12458 ++ * ioprio_class value.
12459 ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
12460 ++ * ioprio_class change.
12461 ++ *
12462 ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
12463 ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
12464 ++ * entity belongs to the sched_data of the parent group in the cgroup
12465 ++ * hierarchy. Non-leaf entities have also their own sched_data, stored
12466 ++ * in @my_sched_data.
12467 ++ *
12468 ++ * Each entity stores independently its priority values; this would
12469 ++ * allow different weights on different devices, but this
12470 ++ * functionality is not yet exported to userspace. Priorities and
12471 ++ * weights are updated lazily, first storing the new values into the
12472 ++ * new_* fields, then setting the @ioprio_changed flag. As soon as
12473 ++ * there is a transition in the entity state that allows the priority
12474 ++ * update to take place the effective and the requested priority
12475 ++ * values are synchronized.
12476 ++ *
12477 ++ * Unless cgroups are used, the weight value is calculated from the
12478 ++ * ioprio to export the same interface as CFQ. When dealing with
12479 ++ * ``well-behaved'' queues (i.e., queues that do not spend too much
12480 ++ * time to consume their budget and have true sequential behavior, and
12481 ++ * when there are no external factors breaking anticipation) the
12482 ++ * relative weights at each level of the cgroups hierarchy should be
12483 ++ * guaranteed. All the fields are protected by the queue lock of the
12484 ++ * containing bfqd.
12485 ++ */
12486 ++struct bfq_entity {
12487 ++ struct rb_node rb_node;
12488 ++
12489 ++ int on_st;
12490 ++
12491 ++ u64 finish;
12492 ++ u64 start;
12493 ++
12494 ++ struct rb_root *tree;
12495 ++
12496 ++ u64 min_start;
12497 ++
12498 ++ unsigned long service, budget;
12499 ++ unsigned short weight, new_weight;
12500 ++ unsigned short orig_weight;
12501 ++
12502 ++ struct bfq_entity *parent;
12503 ++
12504 ++ struct bfq_sched_data *my_sched_data;
12505 ++ struct bfq_sched_data *sched_data;
12506 ++
12507 ++ unsigned short ioprio, new_ioprio;
12508 ++ unsigned short ioprio_class, new_ioprio_class;
12509 ++
12510 ++ int ioprio_changed;
12511 ++};
12512 ++
12513 ++struct bfq_group;
12514 ++
12515 ++/**
12516 ++ * struct bfq_queue - leaf schedulable entity.
12517 ++ * @ref: reference counter.
12518 ++ * @bfqd: parent bfq_data.
12519 ++ * @new_bfqq: shared bfq_queue if queue is cooperating with
12520 ++ * one or more other queues.
12521 ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
12522 ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
12523 ++ * @sort_list: sorted list of pending requests.
12524 ++ * @next_rq: if fifo isn't expired, next request to serve.
12525 ++ * @queued: nr of requests queued in @sort_list.
12526 ++ * @allocated: currently allocated requests.
12527 ++ * @meta_pending: pending metadata requests.
12528 ++ * @fifo: fifo list of requests in sort_list.
12529 ++ * @entity: entity representing this queue in the scheduler.
12530 ++ * @max_budget: maximum budget allowed from the feedback mechanism.
12531 ++ * @budget_timeout: budget expiration (in jiffies).
12532 ++ * @dispatched: number of requests on the dispatch list or inside driver.
12533 ++ * @org_ioprio: saved ioprio during boosted periods.
12534 ++ * @flags: status flags.
12535 ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
12536 ++ * @seek_samples: number of seeks sampled
12537 ++ * @seek_total: sum of the distances of the seeks sampled
12538 ++ * @seek_mean: mean seek distance
12539 ++ * @last_request_pos: position of the last request enqueued
12540 ++ * @pid: pid of the process owning the queue, used for logging purposes.
12541 ++ * @last_rais_start_finish: last (idle -> weight-raised) transition attempt
12542 ++ * @raising_cur_max_time: current max raising time for this queue
12543 ++ *
12544 ++ * A bfq_queue is a leaf request queue; it can be associated with an io_context
12545 ++ * or more (if it is an async one). @cgroup holds a reference to the
12546 ++ * cgroup, to be sure that it does not disappear while a bfqq still
12547 ++ * references it (mostly to avoid races between request issuing and task
12548 ++ * migration followed by cgroup destruction).
12549 ++ * All the fields are protected by the queue lock of the containing bfqd.
12550 ++ */
12551 ++struct bfq_queue {
12552 ++ atomic_t ref;
12553 ++ struct bfq_data *bfqd;
12554 ++
12555 ++ /* fields for cooperating queues handling */
12556 ++ struct bfq_queue *new_bfqq;
12557 ++ struct rb_node pos_node;
12558 ++ struct rb_root *pos_root;
12559 ++
12560 ++ struct rb_root sort_list;
12561 ++ struct request *next_rq;
12562 ++ int queued[2];
12563 ++ int allocated[2];
12564 ++ int meta_pending;
12565 ++ struct list_head fifo;
12566 ++
12567 ++ struct bfq_entity entity;
12568 ++
12569 ++ unsigned long max_budget;
12570 ++ unsigned long budget_timeout;
12571 ++
12572 ++ int dispatched;
12573 ++
12574 ++ unsigned short org_ioprio;
12575 ++
12576 ++ unsigned int flags;
12577 ++
12578 ++ struct list_head bfqq_list;
12579 ++
12580 ++ unsigned int seek_samples;
12581 ++ u64 seek_total;
12582 ++ sector_t seek_mean;
12583 ++ sector_t last_request_pos;
12584 ++
12585 ++ pid_t pid;
12586 ++
12587 ++ /* weight-raising fields */
12588 ++ unsigned int raising_cur_max_time;
12589 ++ u64 last_rais_start_finish, soft_rt_next_start;
12590 ++ unsigned int raising_coeff;
12591 ++};
12592 ++
12593 ++/**
12594 ++ * struct bfq_ttime - per process thinktime stats.
12595 ++ * @ttime_total: total process thinktime
12596 ++ * @ttime_samples: number of thinktime samples
12597 ++ * @ttime_mean: average process thinktime
12598 ++ */
12599 ++struct bfq_ttime {
12600 ++ unsigned long last_end_request;
12601 ++
12602 ++ unsigned long ttime_total;
12603 ++ unsigned long ttime_samples;
12604 ++ unsigned long ttime_mean;
12605 ++};
12606 ++
12607 ++/**
12608 ++ * struct bfq_io_cq - per (request_queue, io_context) structure.
12609 ++ * @icq: associated io_cq structure
12610 ++ * @bfqq: array of two process queues, the sync and the async
12611 ++ * @ttime: associated @bfq_ttime struct
12612 ++ */
12613 ++struct bfq_io_cq {
12614 ++ struct io_cq icq; /* must be the first member */
12615 ++ struct bfq_queue *bfqq[2];
12616 ++ struct bfq_ttime ttime;
12617 ++ int ioprio;
12618 ++};
12619 ++
12620 ++/**
12621 ++ * struct bfq_data - per device data structure.
12622 ++ * @queue: request queue for the managed device.
12623 ++ * @root_group: root bfq_group for the device.
12624 ++ * @rq_pos_tree: rbtree sorted by next_request position,
12625 ++ * used when determining if two or more queues
12626 ++ * have interleaving requests (see bfq_close_cooperator).
12627 ++ * @busy_queues: number of bfq_queues containing requests (including the
12628 ++ * queue under service, even if it is idling).
12629 ++ * @queued: number of queued requests.
12630 ++ * @rq_in_driver: number of requests dispatched and waiting for completion.
12631 ++ * @sync_flight: number of sync requests in the driver.
12632 ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
12633 ++ * completed requests.
12634 ++ * @hw_tag_samples: nr of samples used to calculate hw_tag.
12635 ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
12636 ++ * @budgets_assigned: number of budgets assigned.
12637 ++ * @idle_slice_timer: timer set when idling for the next sequential request
12638 ++ * from the queue under service.
12639 ++ * @unplug_work: delayed work to restart dispatching on the request queue.
12640 ++ * @active_queue: bfq_queue under service.
12641 ++ * @active_bic: bfq_io_cq (bic) associated with the @active_queue.
12642 ++ * @last_position: on-disk position of the last served request.
12643 ++ * @last_budget_start: beginning of the last budget.
12644 ++ * @last_idling_start: beginning of the last idle slice.
12645 ++ * @peak_rate: peak transfer rate observed for a budget.
12646 ++ * @peak_rate_samples: number of samples used to calculate @peak_rate.
12647 ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
12648 ++ * @group_list: list of all the bfq_groups active on the device.
12649 ++ * @active_list: list of all the bfq_queues active on the device.
12650 ++ * @idle_list: list of all the bfq_queues idle on the device.
12651 ++ * @bfq_quantum: max number of requests dispatched per dispatch round.
12652 ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
12653 ++ * requests are served in fifo order.
12654 ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
12655 ++ * @bfq_back_max: maximum allowed backward seek.
12656 ++ * @bfq_slice_idle: maximum idling time.
12657 ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
12658 ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
12659 ++ * async queues.
12660 ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used
12661 ++ * to prevent seeky queues from imposing long latencies on
12662 ++ * well-behaved ones (this also implies that seeky queues cannot
12663 ++ * receive guarantees in the service domain; after a timeout
12664 ++ * they are charged for the whole allocated budget, to try
12665 ++ * to preserve a behavior reasonably fair among them, but
12666 ++ * without service-domain guarantees).
12667 ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
12668 ++ * queue is multiplied
12669 ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
12670 ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
12671 ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
12672 ++ * may be reactivated for a queue (in jiffies)
12673 ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
12674 ++ * after which weight-raising may be
12675 ++ * reactivated for an already busy queue
12676 ++ * (in jiffies)
12677 ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
12678 ++ * sectors per second
12679 ++ * @RT_prod: cached value of the product R*T used for computing the maximum
12680 ++ * duration of the weight raising automatically
12681 ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
12682 ++ *
12683 ++ * All the fields are protected by the @queue lock.
12684 ++ */
12685 ++struct bfq_data {
12686 ++ struct request_queue *queue;
12687 ++
12688 ++ struct bfq_group *root_group;
12689 ++
12690 ++ struct rb_root rq_pos_tree;
12691 ++
12692 ++ int busy_queues;
12693 ++ int queued;
12694 ++ int rq_in_driver;
12695 ++ int sync_flight;
12696 ++
12697 ++ int max_rq_in_driver;
12698 ++ int hw_tag_samples;
12699 ++ int hw_tag;
12700 ++
12701 ++ int budgets_assigned;
12702 ++
12703 ++ struct timer_list idle_slice_timer;
12704 ++ struct work_struct unplug_work;
12705 ++
12706 ++ struct bfq_queue *active_queue;
12707 ++ struct bfq_io_cq *active_bic;
12708 ++
12709 ++ sector_t last_position;
12710 ++
12711 ++ ktime_t last_budget_start;
12712 ++ ktime_t last_idling_start;
12713 ++ int peak_rate_samples;
12714 ++ u64 peak_rate;
12715 ++ unsigned long bfq_max_budget;
12716 ++
12717 ++ struct hlist_head group_list;
12718 ++ struct list_head active_list;
12719 ++ struct list_head idle_list;
12720 ++
12721 ++ unsigned int bfq_quantum;
12722 ++ unsigned int bfq_fifo_expire[2];
12723 ++ unsigned int bfq_back_penalty;
12724 ++ unsigned int bfq_back_max;
12725 ++ unsigned int bfq_slice_idle;
12726 ++ u64 bfq_class_idle_last_service;
12727 ++
12728 ++ unsigned int bfq_user_max_budget;
12729 ++ unsigned int bfq_max_budget_async_rq;
12730 ++ unsigned int bfq_timeout[2];
12731 ++
12732 ++ bool low_latency;
12733 ++
12734 ++ /* parameters of the low_latency heuristics */
12735 ++ unsigned int bfq_raising_coeff;
12736 ++ unsigned int bfq_raising_max_time;
12737 ++ unsigned int bfq_raising_rt_max_time;
12738 ++ unsigned int bfq_raising_min_idle_time;
12739 ++ unsigned int bfq_raising_min_inter_arr_async;
12740 ++ unsigned int bfq_raising_max_softrt_rate;
12741 ++ u64 RT_prod;
12742 ++
12743 ++ struct bfq_queue oom_bfqq;
12744 ++};
12745 ++
12746 ++enum bfqq_state_flags {
12747 ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
12748 ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
12749 ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
12750 ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
12751 ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
12752 ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
12753 ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
12754 ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
12755 ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
12756 ++	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
12757 ++ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
12758 ++};
12759 ++
12760 ++#define BFQ_BFQQ_FNS(name) \
12761 ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
12762 ++{ \
12763 ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
12764 ++} \
12765 ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
12766 ++{ \
12767 ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
12768 ++} \
12769 ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
12770 ++{ \
12771 ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
12772 ++}
12773 ++
12774 ++BFQ_BFQQ_FNS(busy);
12775 ++BFQ_BFQQ_FNS(wait_request);
12776 ++BFQ_BFQQ_FNS(must_alloc);
12777 ++BFQ_BFQQ_FNS(fifo_expire);
12778 ++BFQ_BFQQ_FNS(idle_window);
12779 ++BFQ_BFQQ_FNS(prio_changed);
12780 ++BFQ_BFQQ_FNS(sync);
12781 ++BFQ_BFQQ_FNS(budget_new);
12782 ++BFQ_BFQQ_FNS(coop);
12783 ++BFQ_BFQQ_FNS(split_coop);
12784 ++BFQ_BFQQ_FNS(some_coop_idle);
12785 ++#undef BFQ_BFQQ_FNS
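/*
 * Illustrative sketch, not part of the patch: each BFQ_BFQQ_FNS(name)
 * expansion above yields three helpers that set, clear and test bit
 * BFQ_BFQQ_FLAG_<name> in bfqq->flags, so callers can write, e.g.:
 *
 *	bfq_mark_bfqq_idle_window(bfqq);
 *	if (bfq_bfqq_idle_window(bfqq))
 *		bfq_clear_bfqq_idle_window(bfqq);
 */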
12786 ++
12787 ++/* Logging facilities. */
12788 ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
12789 ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
12790 ++
12791 ++#define bfq_log(bfqd, fmt, args...) \
12792 ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
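/*
 * Usage sketch (both calls are copied from call sites appearing later in
 * this patch): the macros forward to blk_add_trace_msg(), so the messages
 * end up in the blktrace stream of the device, e.g.:
 *
 *	bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
 *		     bfqq->entity.budget);
 *	bfq_log(bfqd, "select_queue: new queue %d returned",
 *		bfqq != NULL ? bfqq->pid : 0);
 */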
12793 ++
12794 ++/* Expiration reasons. */
12795 ++enum bfqq_expiration {
12796 ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
12797 ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
12798 ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
12799 ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
12800 ++};
12801 ++
12802 ++#ifdef CONFIG_CGROUP_BFQIO
12803 ++/**
12804 ++ * struct bfq_group - per (device, cgroup) data structure.
12805 ++ * @entity: schedulable entity to insert into the parent group sched_data.
12806 ++ * @sched_data: own sched_data, to contain child entities (they may be
12807 ++ * both bfq_queues and bfq_groups).
12808 ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data
12809 ++ * list of the containing cgroup's bfqio_cgroup.
12810 ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list
12811 ++ * of the groups active on the same device; used for cleanup.
12812 ++ * @bfqd: the bfq_data for the device this group acts upon.
12813 ++ * @async_bfqq: array of async queues for all the tasks belonging to
12814 ++ * the group, one queue per ioprio value per ioprio_class,
12815 ++ * except for the idle class that has only one queue.
12816 ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
12817 ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
12818 ++ * to avoid too many special cases during group creation/migration.
12819 ++ *
12820 ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
12821 ++ * there is a set of bfq_groups, each one collecting the lower-level
12822 ++ * entities belonging to the group that are acting on the same device.
12823 ++ *
12824 ++ * Locking works as follows:
12825 ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
12826 ++ * via RCU from its readers.
12827 ++ * o @bfqd is protected by the queue lock, RCU is used to access it
12828 ++ * from the readers.
12829 ++ * o All the other fields are protected by the @bfqd queue lock.
12830 ++ */
12831 ++struct bfq_group {
12832 ++ struct bfq_entity entity;
12833 ++ struct bfq_sched_data sched_data;
12834 ++
12835 ++ struct hlist_node group_node;
12836 ++ struct hlist_node bfqd_node;
12837 ++
12838 ++ void *bfqd;
12839 ++
12840 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12841 ++ struct bfq_queue *async_idle_bfqq;
12842 ++
12843 ++ struct bfq_entity *my_entity;
12844 ++};
12845 ++
12846 ++/**
12847 ++ * struct bfqio_cgroup - bfq cgroup data structure.
12848 ++ * @css: subsystem state for bfq in the containing cgroup.
12849 ++ * @weight: cgroup weight.
12850 ++ * @ioprio: cgroup ioprio.
12851 ++ * @ioprio_class: cgroup ioprio_class.
12852 ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
12853 ++ * @group_data: list containing the bfq_group belonging to this cgroup.
12854 ++ *
12855 ++ * @group_data is accessed using RCU, with @lock protecting the updates,
12856 ++ * @ioprio and @ioprio_class are protected by @lock.
12857 ++ */
12858 ++struct bfqio_cgroup {
12859 ++ struct cgroup_subsys_state css;
12860 ++
12861 ++ unsigned short weight, ioprio, ioprio_class;
12862 ++
12863 ++ spinlock_t lock;
12864 ++ struct hlist_head group_data;
12865 ++};
12866 ++#else
12867 ++struct bfq_group {
12868 ++ struct bfq_sched_data sched_data;
12869 ++
12870 ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12871 ++ struct bfq_queue *async_idle_bfqq;
12872 ++};
12873 ++#endif
12874 ++
12875 ++static inline struct bfq_service_tree *
12876 ++bfq_entity_service_tree(struct bfq_entity *entity)
12877 ++{
12878 ++ struct bfq_sched_data *sched_data = entity->sched_data;
12879 ++ unsigned int idx = entity->ioprio_class - 1;
12880 ++
12881 ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
12882 ++ BUG_ON(sched_data == NULL);
12883 ++
12884 ++ return sched_data->service_tree + idx;
12885 ++}
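/*
 * Worked example (assuming the standard kernel ioprio classes): with
 * IOPRIO_CLASS_RT == 1, IOPRIO_CLASS_BE == 2 and IOPRIO_CLASS_IDLE == 3,
 * the "ioprio_class - 1" above selects service_tree[0], service_tree[1]
 * and service_tree[2] respectively, which is why sched_data carries
 * BFQ_IOPRIO_CLASSES service trees.
 */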
12886 ++
12887 ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
12888 ++ int is_sync)
12889 ++{
12890 ++ return bic->bfqq[!!is_sync];
12891 ++}
12892 ++
12893 ++static inline void bic_set_bfqq(struct bfq_io_cq *bic,
12894 ++ struct bfq_queue *bfqq, int is_sync)
12895 ++{
12896 ++ bic->bfqq[!!is_sync] = bfqq;
12897 ++}
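/*
 * Layout note (an interpretation of bfq_io_cq, not text from the patch):
 * bic->bfqq[0] holds the async queue and bic->bfqq[1] the sync one; the
 * "!!is_sync" in the two helpers above collapses any non-zero value to 1,
 * so the array is always indexed with 0 or 1.
 */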
12898 ++
12899 ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
12900 ++{
12901 ++ return bic->icq.q->elevator->elevator_data;
12902 ++}
12903 ++
12904 ++/**
12905 ++ * bfq_get_bfqd_locked - get a lock on a bfqd using an RCU-protected pointer.
12906 ++ * @ptr: a pointer to a bfqd.
12907 ++ * @flags: storage for the flags to be saved.
12908 ++ *
12909 ++ * This function allows bfqg->bfqd to be protected by the
12910 ++ * queue lock of the bfqd they reference; the pointer is dereferenced
12911 ++ * under RCU, so the storage for bfqd is assured to be safe as long
12912 ++ * as the RCU read side critical section does not end. After the
12913 ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12914 ++ * sure that no other writer accessed it. If we raced with a writer,
12915 ++ * the function returns NULL, with the queue unlocked, otherwise it
12916 ++ * returns the dereferenced pointer, with the queue locked.
12917 ++ */
12918 ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12919 ++ unsigned long *flags)
12920 ++{
12921 ++ struct bfq_data *bfqd;
12922 ++
12923 ++ rcu_read_lock();
12924 ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12925 ++
12926 ++ if (bfqd != NULL) {
12927 ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12928 ++ if (*ptr == bfqd)
12929 ++ goto out;
12930 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12931 ++ }
12932 ++
12933 ++ bfqd = NULL;
12934 ++out:
12935 ++ rcu_read_unlock();
12936 ++ return bfqd;
12937 ++}
12938 ++
12939 ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12940 ++ unsigned long *flags)
12941 ++{
12942 ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12943 ++}
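/*
 * Minimal usage sketch (illustrative only; the cgroup code later in this
 * commit follows the same pattern with a different RCU-protected pointer):
 *
 *	unsigned long flags;
 *	struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
 *
 *	if (bfqd != NULL) {
 *		... critical section under bfqd->queue->queue_lock ...
 *		bfq_put_bfqd_unlock(bfqd, &flags);
 *	}
 */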
12944 ++
12945 ++static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12946 ++static void bfq_put_queue(struct bfq_queue *bfqq);
12947 ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12948 ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12949 ++ struct bfq_group *bfqg, int is_sync,
12950 ++ struct bfq_io_cq *bic, gfp_t gfp_mask);
12951 ++static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12952 ++ struct bfq_group *bfqg);
12953 ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12954 ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12955 ++#endif
12956 +--
12957 +1.8.1.4
12958 +
12959
12960 Added: genpatches-2.6/trunk/3.12/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1
12961 ===================================================================
12962 --- genpatches-2.6/trunk/3.12/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 (rev 0)
12963 +++ genpatches-2.6/trunk/3.12/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.11.0.patch1 2013-11-04 10:09:31 UTC (rev 2565)
12964 @@ -0,0 +1,1049 @@
12965 +From 9acaa783ecab69925d38c6aca7252ff565a093d0 Mon Sep 17 00:00:00 2001
12966 +From: Mauro Andreolini <mauro.andreolini@×××××××.it>
12967 +Date: Fri, 14 Jun 2013 13:46:47 +0200
12968 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for
12969 + 3.11.0
12970 +
12971 +A set of processes may happen to perform interleaved reads, i.e., requests
12972 +whose union would give rise to a sequential read pattern. There are two
12973 +typical cases: in the first case, processes read fixed-size chunks of
12974 +data at a fixed distance from each other, while in the second case processes
12975 +may read variable-size chunks at variable distances. The latter case occurs
12976 +for example with KVM, which splits the I/O generated by the guest into
12977 +multiple chunks, and lets these chunks be served by a pool of cooperating
12978 +processes, iteratively assigning the next chunk of I/O to the first
12979 +available process. CFQ uses actual queue merging for the first type of
12980 +processes, whereas it uses preemption to get a sequential read pattern out
12981 +of the read requests performed by the second type of processes. In the end
12982 +it uses two different mechanisms to achieve the same goal: boosting the
12983 +throughput with interleaved I/O.
12984 +
12985 +This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
12986 +sequential read pattern with both types of processes. The main idea is
12987 +checking newly arrived requests against the next request of the active queue
12988 +both in case of actual request insert and in case of request merge. By doing
12989 +so, both the types of processes can be handled by just merging their queues.
12990 +EQM is then simpler and more compact than the pair of mechanisms used in
12991 +CFQ.
12992 +
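As a rough sketch of the proximity test EQM relies on (the names below are
illustrative; the helpers actually introduced by this patch are
bfq_dist_from(), bfq_rq_close_to_sector() and bfq_setup_cooperator()):

    /* A newly arrived I/O is a merge candidate if it lands within
     * BFQQ_SEEK_THR sectors of the next sector the in-service queue
     * is going to access. */
    static bool eqm_close_enough(sector_t new_pos, sector_t next_pos)
    {
        sector_t dist = new_pos >= next_pos ? new_pos - next_pos
                                            : next_pos - new_pos;

        return dist <= BFQQ_SEEK_THR;
    }
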
12993 +Finally, EQM also preserves the typical low-latency properties of BFQ, by
12994 +properly restoring the weight-raising state of a queue when it gets back to
12995 +a non-merged state.
12996 +
12997 +Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
12998 +Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
12999 +Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
13000 +---
13001 + block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++----------------
13002 + block/bfq-sched.c | 28 ---
13003 + block/bfq.h | 16 ++
13004 + 3 files changed, 466 insertions(+), 231 deletions(-)
13005 +
13006 +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
13007 +index 0ed2746..bbe79fb 100644
13008 +--- a/block/bfq-iosched.c
13009 ++++ b/block/bfq-iosched.c
13010 +@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
13011 + return dur;
13012 + }
13013 +
13014 ++static inline void
13015 ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
13016 ++{
13017 ++ if (bic->saved_idle_window)
13018 ++ bfq_mark_bfqq_idle_window(bfqq);
13019 ++ else
13020 ++ bfq_clear_bfqq_idle_window(bfqq);
13021 ++ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
13022 ++ /*
13023 ++ * Start a weight raising period with the duration given by
13024 ++ * the raising_time_left snapshot.
13025 ++ */
13026 ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
13027 ++ bfqq->raising_cur_max_time = bic->raising_time_left;
13028 ++ bfqq->last_rais_start_finish = jiffies;
13029 ++ }
13030 ++ /*
13031 ++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
13032 ++ * getting confused about the queue's need of a weight-raising
13033 ++ * period.
13034 ++ */
13035 ++ bic->raising_time_left = 0;
13036 ++}
13037 ++
13038 ++/*
13039 ++ * Must be called with the queue_lock held.
13040 ++ */
13041 ++static int bfqq_process_refs(struct bfq_queue *bfqq)
13042 ++{
13043 ++ int process_refs, io_refs;
13044 ++
13045 ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13046 ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13047 ++ BUG_ON(process_refs < 0);
13048 ++ return process_refs;
13049 ++}
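/*
 * Explanatory note (an interpretation, not text from the patch): each
 * allocated request pins one reference and being on a service tree
 * presumably accounts for one more, so the remainder approximates the
 * number of processes (bics) still attached to the queue.
 */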
13050 ++
13051 + static void bfq_add_rq_rb(struct request *rq)
13052 + {
13053 + struct bfq_queue *bfqq = RQ_BFQQ(rq);
13054 +@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq)
13055 + if (! bfqd->low_latency)
13056 + goto add_bfqq_busy;
13057 +
13058 ++ if (bfq_bfqq_just_split(bfqq))
13059 ++ goto set_ioprio_changed;
13060 ++
13061 + /*
13062 +- * If the queue is not being boosted and has been idle
13063 +- * for enough time, start a weight-raising period
13064 ++ * If the queue:
13065 ++ * - is not being boosted,
13066 ++ * - has been idle for enough time,
13067 ++ * - is not a sync queue or is linked to a bfq_io_cq (it is
13068 ++ * shared "for its nature" or it is not shared and its
13069 ++ * requests have not been redirected to a shared queue)
13070 ++ * start a weight-raising period.
13071 + */
13072 +- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) {
13073 ++ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
13074 ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
13075 + bfqq->raising_coeff = bfqd->bfq_raising_coeff;
13076 + if (idle_for_long_time)
13077 + bfqq->raising_cur_max_time =
13078 +@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq)
13079 + raising_cur_max_time));
13080 + }
13081 + }
13082 ++set_ioprio_changed:
13083 + if (old_raising_coeff != bfqq->raising_coeff)
13084 + entity->ioprio_changed = 1;
13085 + add_bfqq_busy:
13086 +@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
13087 + spin_unlock_irq(bfqd->queue->queue_lock);
13088 + }
13089 +
13090 +-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13091 +- struct bio *bio)
13092 ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
13093 + {
13094 +- struct bfq_data *bfqd = q->elevator->elevator_data;
13095 +- struct bfq_io_cq *bic;
13096 +- struct bfq_queue *bfqq;
13097 +-
13098 +- /*
13099 +- * Disallow merge of a sync bio into an async request.
13100 +- */
13101 +- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13102 +- return 0;
13103 +-
13104 +- /*
13105 +- * Lookup the bfqq that this bio will be queued with. Allow
13106 +- * merge only if rq is queued there.
13107 +- * Queue lock is held here.
13108 +- */
13109 +- bic = bfq_bic_lookup(bfqd, current->io_context);
13110 +- if (bic == NULL)
13111 +- return 0;
13112 +-
13113 +- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13114 +- return bfqq == RQ_BFQQ(rq);
13115 +-}
13116 +-
13117 +-static void __bfq_set_active_queue(struct bfq_data *bfqd,
13118 +- struct bfq_queue *bfqq)
13119 +-{
13120 +- if (bfqq != NULL) {
13121 +- bfq_mark_bfqq_must_alloc(bfqq);
13122 +- bfq_mark_bfqq_budget_new(bfqq);
13123 +- bfq_clear_bfqq_fifo_expire(bfqq);
13124 +-
13125 +- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13126 +-
13127 +- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
13128 +- bfqq->entity.budget);
13129 +- }
13130 +-
13131 +- bfqd->active_queue = bfqq;
13132 +-}
13133 +-
13134 +-/*
13135 +- * Get and set a new active queue for service.
13136 +- */
13137 +-static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd,
13138 +- struct bfq_queue *bfqq)
13139 +-{
13140 +- if (!bfqq)
13141 +- bfqq = bfq_get_next_queue(bfqd);
13142 ++ if (request)
13143 ++ return blk_rq_pos(io_struct);
13144 + else
13145 +- bfq_get_next_queue_forced(bfqd, bfqq);
13146 +-
13147 +- __bfq_set_active_queue(bfqd, bfqq);
13148 +- return bfqq;
13149 ++ return ((struct bio *)io_struct)->bi_sector;
13150 + }
13151 +
13152 +-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
13153 +- struct request *rq)
13154 ++static inline sector_t bfq_dist_from(sector_t pos1,
13155 ++ sector_t pos2)
13156 + {
13157 +- if (blk_rq_pos(rq) >= bfqd->last_position)
13158 +- return blk_rq_pos(rq) - bfqd->last_position;
13159 ++ if (pos1 >= pos2)
13160 ++ return pos1 - pos2;
13161 + else
13162 +- return bfqd->last_position - blk_rq_pos(rq);
13163 ++ return pos2 - pos1;
13164 + }
13165 +
13166 +-/*
13167 +- * Return true if bfqq has no request pending and rq is close enough to
13168 +- * bfqd->last_position, or if rq is closer to bfqd->last_position than
13169 +- * bfqq->next_rq
13170 +- */
13171 +-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
13172 ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
13173 ++ sector_t sector)
13174 + {
13175 +- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
13176 ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
13177 ++ BFQQ_SEEK_THR;
13178 + }
13179 +
13180 +-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13181 ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
13182 + {
13183 + struct rb_root *root = &bfqd->rq_pos_tree;
13184 + struct rb_node *parent, *node;
13185 + struct bfq_queue *__bfqq;
13186 +- sector_t sector = bfqd->last_position;
13187 +
13188 + if (RB_EMPTY_ROOT(root))
13189 + return NULL;
13190 +@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13191 + * position).
13192 + */
13193 + __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
13194 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13195 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13196 + return __bfqq;
13197 +
13198 + if (blk_rq_pos(__bfqq->next_rq) < sector)
13199 +@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13200 + return NULL;
13201 +
13202 + __bfqq = rb_entry(node, struct bfq_queue, pos_node);
13203 +- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13204 ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13205 + return __bfqq;
13206 +
13207 + return NULL;
13208 +@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13209 + /*
13210 + * bfqd - obvious
13211 + * cur_bfqq - passed in so that we don't decide that the current queue
13212 +- * is closely cooperating with itself.
13213 +- *
13214 +- * We are assuming that cur_bfqq has dispatched at least one request,
13215 +- * and that bfqd->last_position reflects a position on the disk associated
13216 +- * with the I/O issued by cur_bfqq.
13217 ++ * is closely cooperating with itself
13218 ++ * sector - used as a reference point to search for a close queue
13219 + */
13220 + static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13221 +- struct bfq_queue *cur_bfqq)
13222 ++ struct bfq_queue *cur_bfqq,
13223 ++ sector_t sector)
13224 + {
13225 + struct bfq_queue *bfqq;
13226 +
13227 +@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13228 + * working closely on the same area of the disk. In that case,
13229 + * we can group them together and don't waste time idling.
13230 + */
13231 +- bfqq = bfqq_close(bfqd);
13232 ++ bfqq = bfqq_close(bfqd, sector);
13233 + if (bfqq == NULL || bfqq == cur_bfqq)
13234 + return NULL;
13235 +
13236 +@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13237 + return bfqq;
13238 + }
13239 +
13240 ++static struct bfq_queue *
13241 ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13242 ++{
13243 ++ int process_refs, new_process_refs;
13244 ++ struct bfq_queue *__bfqq;
13245 ++
13246 ++ /*
13247 ++ * If there are no process references on the new_bfqq, then it is
13248 ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13249 ++ * may have dropped their last reference (not just their last process
13250 ++ * reference).
13251 ++ */
13252 ++ if (!bfqq_process_refs(new_bfqq))
13253 ++ return NULL;
13254 ++
13255 ++ /* Avoid a circular list and skip interim queue merges. */
13256 ++ while ((__bfqq = new_bfqq->new_bfqq)) {
13257 ++ if (__bfqq == bfqq)
13258 ++ return NULL;
13259 ++ new_bfqq = __bfqq;
13260 ++ }
13261 ++
13262 ++ process_refs = bfqq_process_refs(bfqq);
13263 ++ new_process_refs = bfqq_process_refs(new_bfqq);
13264 ++ /*
13265 ++ * If the process for the bfqq has gone away, there is no
13266 ++ * sense in merging the queues.
13267 ++ */
13268 ++ if (process_refs == 0 || new_process_refs == 0)
13269 ++ return NULL;
13270 ++
13271 ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13272 ++ new_bfqq->pid);
13273 ++
13274 ++ /*
13275 ++ * Merging is just a redirection: the requests of the process owning
13276 ++ * one of the two queues are redirected to the other queue. The latter
13277 ++ * queue, in its turn, is set as shared if this is the first time that
13278 ++ * the requests of some process are redirected to it.
13279 ++ *
13280 ++ * We redirect bfqq to new_bfqq and not the opposite, because we
13281 ++ * are in the context of the process owning bfqq, hence we have the
13282 ++ * io_cq of this process. So we can immediately configure this io_cq
13283 ++ * to redirect the requests of the process to new_bfqq.
13284 ++ *
13285 ++ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of
13286 ++ * new_bfqq is not available, because, if the active queue is shared,
13287 ++ * bfqd->active_bic may not point to the io_cq of the active queue.
13288 ++ * Redirecting the requests of the process owning bfqq to the currently
13289 ++ * active queue is in any case the best option, as we feed the active queue
13290 ++ * with new requests close to the last request served and, by doing so,
13291 ++ * hopefully increase the throughput.
13292 ++ */
13293 ++ bfqq->new_bfqq = new_bfqq;
13294 ++ atomic_add(process_refs, &new_bfqq->ref);
13295 ++ return new_bfqq;
13296 ++}
13297 ++
13298 ++/*
13299 ++ * Attempt to schedule a merge of bfqq with the currently active queue or
13300 ++ * with a close queue among the scheduled queues.
13301 ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
13302 ++ * structure otherwise.
13303 ++ */
13304 ++static struct bfq_queue *
13305 ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13306 ++ void *io_struct, bool request)
13307 ++{
13308 ++ struct bfq_queue *active_bfqq, *new_bfqq;
13309 ++
13310 ++ if (bfqq->new_bfqq)
13311 ++ return bfqq->new_bfqq;
13312 ++
13313 ++ if (!io_struct)
13314 ++ return NULL;
13315 ++
13316 ++ active_bfqq = bfqd->active_queue;
13317 ++
13318 ++ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic)
13319 ++ goto check_scheduled;
13320 ++
13321 ++ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq))
13322 ++ goto check_scheduled;
13323 ++
13324 ++ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq))
13325 ++ goto check_scheduled;
13326 ++
13327 ++ if (active_bfqq->entity.parent != bfqq->entity.parent)
13328 ++ goto check_scheduled;
13329 ++
13330 ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
13331 ++ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq))
13332 ++ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq)))
13333 ++ return new_bfqq; /* Merge with the active queue */
13334 ++
13335 ++ /*
13336 ++ * Check whether there is a cooperator among currently scheduled
13337 ++ * queues. The only thing we need is that the bio/request is not
13338 ++ * NULL, as we need it to establish whether a cooperator exists.
13339 ++ */
13340 ++check_scheduled:
13341 ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
13342 ++ bfq_io_struct_pos(io_struct, request));
13343 ++ if (new_bfqq)
13344 ++ return bfq_setup_merge(bfqq, new_bfqq);
13345 ++
13346 ++ return NULL;
13347 ++}
13348 ++
13349 ++static inline void
13350 ++bfq_bfqq_save_state(struct bfq_queue *bfqq)
13351 ++{
13352 ++ /*
13353 ++ * If bfqq->bic == NULL, the queue is already shared or its requests
13354 ++ * have already been redirected to a shared queue; both idle window
13355 ++ * and weight raising state have already been saved. Do nothing.
13356 ++ */
13357 ++ if (bfqq->bic == NULL)
13358 ++ return;
13359 ++ if (bfqq->bic->raising_time_left)
13360 ++ /*
13361 ++ * This is the queue of a just-started process, and would
13362 ++ * deserve weight raising: we set raising_time_left to the full
13363 ++ * weight-raising duration to trigger weight-raising when and
13364 ++ * if the queue is split and the first request of the queue
13365 ++ * is enqueued.
13366 ++ */
13367 ++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
13368 ++ else if (bfqq->raising_coeff > 1) {
13369 ++ unsigned long wrais_duration =
13370 ++ jiffies - bfqq->last_rais_start_finish;
13371 ++ /*
13372 ++ * It may happen that a queue's weight raising period lasts
13373 ++ * longer than its raising_cur_max_time, as weight raising is
13374 ++ * handled only when a request is enqueued or dispatched (it
13375 ++ * does not use any timer). If the weight raising period is
13376 ++ * about to end, don't save it.
13377 ++ */
13378 ++ if (bfqq->raising_cur_max_time <= wrais_duration)
13379 ++ bfqq->bic->raising_time_left = 0;
13380 ++ else
13381 ++ bfqq->bic->raising_time_left =
13382 ++ bfqq->raising_cur_max_time - wrais_duration;
13383 ++ /*
13384 ++ * The bfq_queue is becoming shared or the requests of the
13385 ++ * process owning the queue are being redirected to a shared
13386 ++ * queue. Stop the weight raising period of the queue, as in
13387 ++ * both cases it should not be owned by an interactive or soft
13388 ++ * real-time application.
13389 ++ */
13390 ++ bfq_bfqq_end_raising(bfqq);
13391 ++ } else
13392 ++ bfqq->bic->raising_time_left = 0;
13393 ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
13394 ++}
13395 ++
13396 ++static inline void
13397 ++bfq_get_bic_reference(struct bfq_queue *bfqq)
13398 ++{
13399 ++ /*
13400 ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
13401 ++ * is about to begin using a shared bfq_queue.
13402 ++ */
13403 ++ if (bfqq->bic)
13404 ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
13405 ++}
13406 ++
13407 ++static void
13408 ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13409 ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13410 ++{
13411 ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13412 ++ (long unsigned)new_bfqq->pid);
13413 ++ /* Save weight raising and idle window of the merged queues */
13414 ++ bfq_bfqq_save_state(bfqq);
13415 ++ bfq_bfqq_save_state(new_bfqq);
13416 ++ /*
13417 ++ * Grab a reference to the bic, to prevent it from being destroyed
13418 ++ * before being possibly touched by a bfq_split_bfqq().
13419 ++ */
13420 ++ bfq_get_bic_reference(bfqq);
13421 ++ bfq_get_bic_reference(new_bfqq);
13422 ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
13423 ++ bic_set_bfqq(bic, new_bfqq, 1);
13424 ++ bfq_mark_bfqq_coop(new_bfqq);
13425 ++ /*
13426 ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
13427 ++ * new_bfqq->bic to NULL. bfqq either:
13428 ++ * - does not belong to any bic any more, and hence bfqq->bic must
13429 ++ * be set to NULL, or
13430 ++ * - is a queue whose owning bics have already been redirected to a
13431 ++ * different queue, hence the queue is destined to not belong to any
13432 ++ * bic soon and bfqq->bic is already NULL (therefore the next
13433 ++ * assignment causes no harm).
13434 ++ */
13435 ++ new_bfqq->bic = NULL;
13436 ++ bfqq->bic = NULL;
13437 ++ bfq_put_queue(bfqq);
13438 ++}
13439 ++
13440 ++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13441 ++ struct bio *bio)
13442 ++{
13443 ++ struct bfq_data *bfqd = q->elevator->elevator_data;
13444 ++ struct bfq_io_cq *bic;
13445 ++ struct bfq_queue *bfqq, *new_bfqq;
13446 ++
13447 ++ /*
13448 ++ * Disallow merge of a sync bio into an async request.
13449 ++ */
13450 ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13451 ++ return 0;
13452 ++
13453 ++ /*
13454 ++ * Lookup the bfqq that this bio will be queued with. Allow
13455 ++ * merge only if rq is queued there.
13456 ++ * Queue lock is held here.
13457 ++ */
13458 ++ bic = bfq_bic_lookup(bfqd, current->io_context);
13459 ++ if (bic == NULL)
13460 ++ return 0;
13461 ++
13462 ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13463 ++ /*
13464 ++ * We take advantage of this function to perform an early merge
13465 ++ * of the queues of possible cooperating processes.
13466 ++ */
13467 ++ if (bfqq != NULL &&
13468 ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) {
13469 ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
13470 ++ /*
13471 ++ * If we get here, the bio will be queued in the shared queue,
13472 ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
13473 ++ * rq can be merged.
13474 ++ */
13475 ++ bfqq = new_bfqq;
13476 ++ }
13477 ++
13478 ++ return bfqq == RQ_BFQQ(rq);
13479 ++}
13480 ++
13481 ++static void __bfq_set_active_queue(struct bfq_data *bfqd,
13482 ++ struct bfq_queue *bfqq)
13483 ++{
13484 ++ if (bfqq != NULL) {
13485 ++ bfq_mark_bfqq_must_alloc(bfqq);
13486 ++ bfq_mark_bfqq_budget_new(bfqq);
13487 ++ bfq_clear_bfqq_fifo_expire(bfqq);
13488 ++
13489 ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13490 ++
13491 ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu",
13492 ++ bfqq->entity.budget);
13493 ++ }
13494 ++
13495 ++ bfqd->active_queue = bfqq;
13496 ++}
13497 ++
13498 ++/*
13499 ++ * Get and set a new active queue for service.
13500 ++ */
13501 ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd)
13502 ++{
13503 ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
13504 ++
13505 ++ __bfq_set_active_queue(bfqd, bfqq);
13506 ++ return bfqq;
13507 ++}
13508 ++
13509 + /*
13510 + * If enough samples have been computed, return the current max budget
13511 + * stored in bfqd, which is dynamically updated according to the
13512 +@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
13513 + return rq;
13514 + }
13515 +
13516 +-/*
13517 +- * Must be called with the queue_lock held.
13518 +- */
13519 +-static int bfqq_process_refs(struct bfq_queue *bfqq)
13520 +-{
13521 +- int process_refs, io_refs;
13522 +-
13523 +- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13524 +- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13525 +- BUG_ON(process_refs < 0);
13526 +- return process_refs;
13527 +-}
13528 +-
13529 +-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13530 +-{
13531 +- int process_refs, new_process_refs;
13532 +- struct bfq_queue *__bfqq;
13533 +-
13534 +- /*
13535 +- * If there are no process references on the new_bfqq, then it is
13536 +- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13537 +- * may have dropped their last reference (not just their last process
13538 +- * reference).
13539 +- */
13540 +- if (!bfqq_process_refs(new_bfqq))
13541 +- return;
13542 +-
13543 +- /* Avoid a circular list and skip interim queue merges. */
13544 +- while ((__bfqq = new_bfqq->new_bfqq)) {
13545 +- if (__bfqq == bfqq)
13546 +- return;
13547 +- new_bfqq = __bfqq;
13548 +- }
13549 +-
13550 +- process_refs = bfqq_process_refs(bfqq);
13551 +- new_process_refs = bfqq_process_refs(new_bfqq);
13552 +- /*
13553 +- * If the process for the bfqq has gone away, there is no
13554 +- * sense in merging the queues.
13555 +- */
13556 +- if (process_refs == 0 || new_process_refs == 0)
13557 +- return;
13558 +-
13559 +- /*
13560 +- * Merge in the direction of the lesser amount of work.
13561 +- */
13562 +- if (new_process_refs >= process_refs) {
13563 +- bfqq->new_bfqq = new_bfqq;
13564 +- atomic_add(process_refs, &new_bfqq->ref);
13565 +- } else {
13566 +- new_bfqq->new_bfqq = bfqq;
13567 +- atomic_add(new_process_refs, &bfqq->ref);
13568 +- }
13569 +- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13570 +- new_bfqq->pid);
13571 +-}
13572 +-
13573 + static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
13574 + {
13575 + struct bfq_entity *entity = &bfqq->entity;
13576 +@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
13577 + * is likely to boost the disk throughput);
13578 + * - the queue is weight-raised (waiting for the request is necessary for
13579 + * providing the queue with fairness and latency guarantees).
13580 ++ *
13581 ++ * In any case, idling can be disabled for cooperation issues, if
13582 ++ * 1) there is a close cooperator for the queue, or
13583 ++ * 2) the queue is shared and some cooperator is likely to be idle (in this
13584 ++ * case, by not arming the idle timer, we try to slow down the queue, to
13585 ++ * prevent the zones of the disk accessed by the active cooperators to
13586 ++ * become too distant from the zone that will be accessed by the currently
13587 ++ * idle cooperators).
13588 + */
13589 + static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
13590 + int budg_timeout)
13591 +@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
13592 + (bfqd->rq_in_driver == 0 ||
13593 + budg_timeout ||
13594 + bfqq->raising_coeff > 1) &&
13595 +- !bfq_close_cooperator(bfqd, bfqq) &&
13596 ++ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) &&
13597 + (!bfq_bfqq_coop(bfqq) ||
13598 + !bfq_bfqq_some_coop_idle(bfqq)) &&
13599 + !bfq_queue_nonrot_noidle(bfqd, bfqq));
13600 +@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq,
13601 + */
13602 + static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13603 + {
13604 +- struct bfq_queue *bfqq, *new_bfqq = NULL;
13605 ++ struct bfq_queue *bfqq;
13606 + struct request *next_rq;
13607 + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
13608 + int budg_timeout;
13609 +@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13610 +
13611 + bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue");
13612 +
13613 +- /*
13614 +- * If another queue has a request waiting within our mean seek
13615 +- * distance, let it run. The expire code will check for close
13616 +- * cooperators and put the close queue at the front of the
13617 +- * service tree. If possible, merge the expiring queue with the
13618 +- * new bfqq.
13619 +- */
13620 +- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
13621 +- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
13622 +- bfq_setup_merge(bfqq, new_bfqq);
13623 +-
13624 + budg_timeout = bfq_may_expire_for_budg_timeout(bfqq);
13625 + if (budg_timeout &&
13626 + !bfq_bfqq_must_idle(bfqq, budg_timeout))
13627 +@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13628 + bfq_clear_bfqq_wait_request(bfqq);
13629 + del_timer(&bfqd->idle_slice_timer);
13630 + }
13631 +- if (new_bfqq == NULL)
13632 +- goto keep_queue;
13633 +- else
13634 +- goto expire;
13635 ++ goto keep_queue;
13636 + }
13637 + }
13638 +
13639 +@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13640 + * queue still has requests in flight or is idling for a new request,
13641 + * then keep it.
13642 + */
13643 +- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
13644 ++ if (timer_pending(&bfqd->idle_slice_timer) ||
13645 + (bfqq->dispatched != 0 &&
13646 + (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) &&
13647 +- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) {
13648 ++ !bfq_queue_nonrot_noidle(bfqd, bfqq))) {
13649 + bfqq = NULL;
13650 + goto keep_queue;
13651 +- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
13652 +- /*
13653 +- * Expiring the queue because there is a close cooperator,
13654 +- * cancel timer.
13655 +- */
13656 +- bfq_clear_bfqq_wait_request(bfqq);
13657 +- del_timer(&bfqd->idle_slice_timer);
13658 + }
13659 +
13660 + reason = BFQ_BFQQ_NO_MORE_REQUESTS;
13661 + expire:
13662 + bfq_bfqq_expire(bfqd, bfqq, 0, reason);
13663 + new_queue:
13664 +- bfqq = bfq_set_active_queue(bfqd, new_bfqq);
13665 ++ bfqq = bfq_set_active_queue(bfqd);
13666 + bfq_log(bfqd, "select_queue: new queue %d returned",
13667 + bfqq != NULL ? bfqq->pid : 0);
13668 + keep_queue:
13669 +@@ -1617,9 +1807,8 @@ keep_queue:
13670 +
13671 + static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
13672 + {
13673 ++ struct bfq_entity *entity = &bfqq->entity;
13674 + if (bfqq->raising_coeff > 1) { /* queue is being boosted */
13675 +- struct bfq_entity *entity = &bfqq->entity;
13676 +-
13677 + bfq_log_bfqq(bfqd, bfqq,
13678 + "raising period dur %u/%u msec, "
13679 + "old raising coeff %u, w %d(%d)",
13680 +@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
13681 + jiffies_to_msecs(bfqq->
13682 + raising_cur_max_time));
13683 + bfq_bfqq_end_raising(bfqq);
13684 +- __bfq_entity_update_weight_prio(
13685 +- bfq_entity_service_tree(entity),
13686 +- entity);
13687 + }
13688 + }
13689 + }
13690 ++ /* Update weight both if it must be raised and if it must be lowered */
13691 ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
13692 ++ __bfq_entity_update_weight_prio(
13693 ++ bfq_entity_service_tree(entity),
13694 ++ entity);
13695 + }
13696 +
13697 + /*
13698 +@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq)
13699 + struct bfq_io_cq *bic = icq_to_bic(icq);
13700 +
13701 + bic->ttime.last_end_request = jiffies;
13702 ++ /*
13703 ++ * A newly created bic indicates that the process has just
13704 ++ * started doing I/O, and is probably mapping into memory its
13705 ++ * executable and libraries: it definitely needs weight raising.
13706 ++ * There is however the possibility that the process performs,
13707 ++ * for a while, I/O close to some other process. EQM intercepts
13708 ++ * this behavior and may merge the queue corresponding to the
13709 ++ * process with some other queue, BEFORE the weight of the queue
13710 ++ * is raised. Merged queues are not weight-raised (they are assumed
13711 ++ * to belong to processes that benefit only from high throughput).
13712 ++ * If the merge is basically the consequence of an accident, then
13713 ++ * the queue will be split soon and will get back its old weight.
13714 ++ * It is then important to write down somewhere that this queue
13715 ++ * does need weight raising, even if it did not make it to get its
13716 ++ * weight raised before being merged. To this purpose, we overload
13717 ++ * the field raising_time_left and assign 1 to it, to mark the queue
13718 ++ * as needing weight raising.
13719 ++ */
13720 ++ bic->raising_time_left = 1;
13721 + }
13722 +
13723 + static void bfq_exit_icq(struct io_cq *icq)
13724 +@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq)
13725 + }
13726 +
13727 + if (bic->bfqq[BLK_RW_SYNC]) {
13728 ++ /*
13729 ++ * If the bic is using a shared queue, put the reference
13730 ++ * taken on the io_context when the bic started using a
13731 ++ * shared bfq_queue.
13732 ++ */
13733 ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
13734 ++ put_io_context(icq->ioc);
13735 + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
13736 + bic->bfqq[BLK_RW_SYNC] = NULL;
13737 + }
13738 +@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
13739 + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
13740 + return;
13741 +
13742 ++ /* Idle window just restored, statistics are meaningless. */
13743 ++ if (bfq_bfqq_just_split(bfqq))
13744 ++ return;
13745 ++
13746 + enable_idle = bfq_bfqq_idle_window(bfqq);
13747 +
13748 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
13749 +@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13750 + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
13751 + !BFQQ_SEEKY(bfqq))
13752 + bfq_update_idle_window(bfqd, bfqq, bic);
13753 ++ bfq_clear_bfqq_just_split(bfqq);
13754 +
13755 + bfq_log_bfqq(bfqd, bfqq,
13756 + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
13757 +@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13758 + static void bfq_insert_request(struct request_queue *q, struct request *rq)
13759 + {
13760 + struct bfq_data *bfqd = q->elevator->elevator_data;
13761 +- struct bfq_queue *bfqq = RQ_BFQQ(rq);
13762 ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
13763 +
13764 + assert_spin_locked(bfqd->queue->queue_lock);
13765 ++
13766 ++ /*
13767 ++ * An unplug may trigger a requeue of a request from the device
13768 ++ * driver: make sure we are in process context while trying to
13769 ++ * merge two bfq_queues.
13770 ++ */
13771 ++ if (!in_interrupt() &&
13772 ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) {
13773 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
13774 ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
13775 ++ /*
13776 ++ * Release the request's reference to the old bfqq
13777 ++ * and make sure one is taken to the shared queue.
13778 ++ */
13779 ++ new_bfqq->allocated[rq_data_dir(rq)]++;
13780 ++ bfqq->allocated[rq_data_dir(rq)]--;
13781 ++ atomic_inc(&new_bfqq->ref);
13782 ++ bfq_put_queue(bfqq);
13783 ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
13784 ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq);
13785 ++ rq->elv.priv[1] = new_bfqq;
13786 ++ bfqq = new_bfqq;
13787 ++ }
13788 ++
13789 + bfq_init_prio_data(bfqq, RQ_BIC(rq));
13790 +
13791 + bfq_add_rq_rb(rq);
13792 +
13793 ++ /*
13794 ++ * Here a newly-created bfq_queue has already started a weight-raising
13795 ++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
13796 ++ * from assigning it a full weight-raising period. See the detailed
13797 ++ * comments about this field in bfq_init_icq().
13798 ++ */
13799 ++ if (bfqq->bic != NULL)
13800 ++ bfqq->bic->raising_time_left = 0;
13801 + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
13802 + list_add_tail(&rq->queuelist, &bfqq->fifo);
13803 +
13804 +@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
13805 + if (bfq_bfqq_budget_new(bfqq))
13806 + bfq_set_budget_timeout(bfqd);
13807 +
13808 +- /* Idling is disabled also for cooperation issues:
13809 +- * 1) there is a close cooperator for the queue, or
13810 +- * 2) the queue is shared and some cooperator is likely
13811 +- * to be idle (in this case, by not arming the idle timer,
13812 +- * we try to slow down the queue, to prevent the zones
13813 +- * of the disk accessed by the active cooperators to become
13814 +- * too distant from the zone that will be accessed by the
13815 +- * currently idle cooperators)
13816 +- */
13817 + if (bfq_bfqq_must_idle(bfqq, budg_timeout))
13818 + bfq_arm_slice_timer(bfqd);
13819 + else if (budg_timeout)
13820 +@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq)
13821 + }
13822 + }
13823 +
13824 +-static struct bfq_queue *
13825 +-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13826 +- struct bfq_queue *bfqq)
13827 +-{
13828 +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13829 +- (long unsigned)bfqq->new_bfqq->pid);
13830 +- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
13831 +- bfq_mark_bfqq_coop(bfqq->new_bfqq);
13832 +- bfq_put_queue(bfqq);
13833 +- return bic_to_bfqq(bic, 1);
13834 +-}
13835 +-
13836 + /*
13837 + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
13838 + * was the last process referring to said bfqq.
13839 +@@ -2469,6 +2702,9 @@ static struct bfq_queue *
13840 + bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
13841 + {
13842 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
13843 ++
13844 ++ put_io_context(bic->icq.ioc);
13845 ++
13846 + if (bfqq_process_refs(bfqq) == 1) {
13847 + bfqq->pid = current->pid;
13848 + bfq_clear_bfqq_some_coop_idle(bfqq);
13849 +@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
13850 + struct bfq_queue *bfqq;
13851 + struct bfq_group *bfqg;
13852 + unsigned long flags;
13853 ++ bool split = false;
13854 +
13855 + might_sleep_if(gfp_mask & __GFP_WAIT);
13856 +
13857 +@@ -2516,24 +2753,14 @@ new_queue:
13858 + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
13859 + bic_set_bfqq(bic, bfqq, is_sync);
13860 + } else {
13861 +- /*
13862 +- * If the queue was seeky for too long, break it apart.
13863 +- */
13864 ++ /* If the queue was seeky for too long, break it apart. */
13865 + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
13866 + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
13867 + bfqq = bfq_split_bfqq(bic, bfqq);
13868 ++ split = true;
13869 + if (!bfqq)
13870 + goto new_queue;
13871 + }
13872 +-
13873 +- /*
13874 +- * Check to see if this queue is scheduled to merge with
13875 +- * another closely cooperating queue. The merging of queues
13876 +- * happens here as it must be done in process context.
13877 +- * The reference on new_bfqq was taken in merge_bfqqs.
13878 +- */
13879 +- if (bfqq->new_bfqq != NULL)
13880 +- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
13881 + }
13882 +
13883 + bfqq->allocated[rw]++;
13884 +@@ -2544,6 +2771,26 @@ new_queue:
13885 + rq->elv.priv[0] = bic;
13886 + rq->elv.priv[1] = bfqq;
13887 +
13888 ++ /*
13889 ++ * If a bfq_queue has only one process reference, it is owned
13890 ++ * by only one bfq_io_cq: we can set the bic field of the
13891 ++ * bfq_queue to the address of that structure. Also, if the
13892 ++ * queue has just been split, mark a flag so that the
13893 ++ * information is available to the other scheduler hooks.
13894 ++ */
13895 ++ if (bfqq_process_refs(bfqq) == 1) {
13896 ++ bfqq->bic = bic;
13897 ++ if (split) {
13898 ++ bfq_mark_bfqq_just_split(bfqq);
13899 ++ /*
13900 ++ * If the queue has just been split from a shared queue,
13901 ++ * restore the idle window and the possible weight
13902 ++ * raising period.
13903 ++ */
13904 ++ bfq_bfqq_resume_state(bfqq, bic);
13905 ++ }
13906 ++ }
13907 ++
13908 + spin_unlock_irqrestore(q->queue_lock, flags);
13909 +
13910 + return 0;
13911 +diff --git a/block/bfq-sched.c b/block/bfq-sched.c
13912 +index 03f8061..a0edaa2 100644
13913 +--- a/block/bfq-sched.c
13914 ++++ b/block/bfq-sched.c
13915 +@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
13916 + return bfqq;
13917 + }
13918 +
13919 +-/*
13920 +- * Forced extraction of the given queue.
13921 +- */
13922 +-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
13923 +- struct bfq_queue *bfqq)
13924 +-{
13925 +- struct bfq_entity *entity;
13926 +- struct bfq_sched_data *sd;
13927 +-
13928 +- BUG_ON(bfqd->active_queue != NULL);
13929 +-
13930 +- entity = &bfqq->entity;
13931 +- /*
13932 +- * Bubble up extraction/update from the leaf to the root.
13933 +- */
13934 +- for_each_entity(entity) {
13935 +- sd = entity->sched_data;
13936 +- bfq_update_budget(entity);
13937 +- bfq_update_vtime(bfq_entity_service_tree(entity));
13938 +- bfq_active_extract(bfq_entity_service_tree(entity), entity);
13939 +- sd->active_entity = entity;
13940 +- sd->next_active = NULL;
13941 +- entity->service = 0;
13942 +- }
13943 +-
13944 +- return;
13945 +-}
13946 +-
13947 + static void __bfq_bfqd_reset_active(struct bfq_data *bfqd)
13948 + {
13949 + if (bfqd->active_bic != NULL) {
13950 +diff --git a/block/bfq.h b/block/bfq.h
13951 +index 48ecde9..bb52975 100644
13952 +--- a/block/bfq.h
13953 ++++ b/block/bfq.h
13954 +@@ -188,6 +188,8 @@ struct bfq_group;
13955 + * @pid: pid of the process owning the queue, used for logging purposes.
13956 + * @last_rais_start_time: last (idle -> weight-raised) transition attempt
13957 + * @raising_cur_max_time: current max raising time for this queue
13958 ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
13959 ++ * queue is shared
13960 + *
13961 + * A bfq_queue is a leaf request queue; it can be associated to an io_context
13962 + * or more (if it is an async one). @cgroup holds a reference to the
13963 +@@ -231,6 +233,7 @@ struct bfq_queue {
13964 + sector_t last_request_pos;
13965 +
13966 + pid_t pid;
13967 ++ struct bfq_io_cq *bic;
13968 +
13969 + /* weight-raising fields */
13970 + unsigned int raising_cur_max_time;
13971 +@@ -257,12 +260,23 @@ struct bfq_ttime {
13972 + * @icq: associated io_cq structure
13973 + * @bfqq: array of two process queues, the sync and the async
13974 + * @ttime: associated @bfq_ttime struct
13975 ++ * @raising_time_left: snapshot of the time left before weight raising ends
13976 ++ * for the sync queue associated to this process; this
13977 ++ * snapshot is taken to remember this value while the weight
13978 ++ * raising is suspended because the queue is merged with a
13979 ++ * shared queue, and is used to set @raising_cur_max_time
13980 ++ * when the queue is split from the shared queue and its
13981 ++ * weight is raised again
13982 ++ * @saved_idle_window: same purpose as the previous field for the idle window
13983 + */
13984 + struct bfq_io_cq {
13985 + struct io_cq icq; /* must be the first member */
13986 + struct bfq_queue *bfqq[2];
13987 + struct bfq_ttime ttime;
13988 + int ioprio;
13989 ++
13990 ++ unsigned int raising_time_left;
13991 ++ unsigned int saved_idle_window;
13992 + };
13993 +
13994 + /**
13995 +@@ -403,6 +417,7 @@ enum bfqq_state_flags {
13996 + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
13997 + 	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
13998 + BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */
13999 ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
14000 + };
14001 +
14002 + #define BFQ_BFQQ_FNS(name) \
14003 +@@ -430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new);
14004 + BFQ_BFQQ_FNS(coop);
14005 + BFQ_BFQQ_FNS(split_coop);
14006 + BFQ_BFQQ_FNS(some_coop_idle);
14007 ++BFQ_BFQQ_FNS(just_split);
14008 + #undef BFQ_BFQQ_FNS
14009 +
14010 + /* Logging facilities. */
14011 +--
14012 +1.8.1.4
14013 +
14014
14015 Added: genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
14016 ===================================================================
14017 --- genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch (rev 0)
14018 +++ genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch 2013-11-04 10:09:31 UTC (rev 2565)
14019 @@ -0,0 +1,362 @@
14020 +From 2e1646d06515b7dd1344db547dfcf9a4640dee8e Mon Sep 17 00:00:00 2001
14021 +From: Arianna Avanzini <avanzini.arianna@×××××.com>
14022 +Date: Wed, 11 Sep 2013 22:26:47 +0200
14023 +Subject: [PATCH] block: Switch from BFQ-v6r2 for 3.11.0 to BFQ-v6r2 for
14024 + 3.12.0-rc1
14025 +
14026 +---
14027 + block/bfq-cgroup.c | 115 +++++++++++++++++++++++++++++++----------------------
14028 + block/bfq.h | 2 +
14029 + 2 files changed, 70 insertions(+), 47 deletions(-)
14030 +
14031 +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
14032 +index bb9b851..afae4ca 100644
14033 +--- a/block/bfq-cgroup.c
14034 ++++ b/block/bfq-cgroup.c
14035 +@@ -16,9 +16,9 @@
14036 +
14037 + static DEFINE_MUTEX(bfqio_mutex);
14038 +
14039 +-static bool bfqio_is_removed(struct cgroup *cgroup)
14040 ++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
14041 + {
14042 +- return test_bit(CGRP_DEAD, &cgroup->flags);
14043 ++ return bgrp ? !bgrp->online : false;
14044 + }
14045 +
14046 + static struct bfqio_cgroup bfqio_root_cgroup = {
14047 +@@ -38,10 +38,9 @@ static inline void bfq_init_entity(struct bfq_entity *entity,
14048 + entity->sched_data = &bfqg->sched_data;
14049 + }
14050 +
14051 +-static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup)
14052 ++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
14053 + {
14054 +- return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id),
14055 +- struct bfqio_cgroup, css);
14056 ++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
14057 + }
14058 +
14059 + /*
14060 +@@ -103,20 +102,20 @@ static inline void bfq_group_set_parent(struct bfq_group *bfqg,
14061 + /**
14062 + * bfq_group_chain_alloc - allocate a chain of groups.
14063 + * @bfqd: queue descriptor.
14064 +- * @cgroup: the leaf cgroup this chain starts from.
14065 ++ * @css: the leaf cgroup_subsys_state this chain starts from.
14066 + *
14067 + * Allocate a chain of groups starting from the one belonging to
14068 + * @cgroup up to the root cgroup. Stop if a cgroup on the chain
14069 + * to the root already has an allocated group on @bfqd.
14070 + */
14071 + static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
14072 +- struct cgroup *cgroup)
14073 ++ struct cgroup_subsys_state *css)
14074 + {
14075 + struct bfqio_cgroup *bgrp;
14076 + struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
14077 +
14078 +- for (; cgroup != NULL; cgroup = cgroup->parent) {
14079 +- bgrp = cgroup_to_bfqio(cgroup);
14080 ++ for (; css != NULL; css = css->parent) {
14081 ++ bgrp = css_to_bfqio(css);
14082 +
14083 + bfqg = bfqio_lookup_group(bgrp, bfqd);
14084 + if (bfqg != NULL) {
14085 +@@ -165,7 +164,7 @@ cleanup:
14086 + /**
14087 + * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
14088 + * @bfqd: the queue descriptor.
14089 +- * @cgroup: the leaf cgroup to start from.
14090 ++ * @css: the leaf cgroup_subsys_state to start from.
14091 + * @leaf: the leaf group (to be associated to @cgroup).
14092 + *
14093 + * Try to link a chain of groups to a cgroup hierarchy, connecting the
14094 +@@ -177,7 +176,8 @@ cleanup:
14095 + * per device) while the bfqio_cgroup lock protects the list of groups
14096 + * belonging to the same cgroup.
14097 + */
14098 +-static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14099 ++static void bfq_group_chain_link(struct bfq_data *bfqd,
14100 ++ struct cgroup_subsys_state *css,
14101 + struct bfq_group *leaf)
14102 + {
14103 + struct bfqio_cgroup *bgrp;
14104 +@@ -186,8 +186,8 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14105 +
14106 + assert_spin_locked(bfqd->queue->queue_lock);
14107 +
14108 +- for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
14109 +- bgrp = cgroup_to_bfqio(cgroup);
14110 ++ for (; css != NULL && leaf != NULL; css = css->parent) {
14111 ++ bgrp = css_to_bfqio(css);
14112 + next = leaf->bfqd;
14113 +
14114 + bfqg = bfqio_lookup_group(bgrp, bfqd);
14115 +@@ -205,9 +205,9 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14116 + leaf = next;
14117 + }
14118 +
14119 +- BUG_ON(cgroup == NULL && leaf != NULL);
14120 +- if (cgroup != NULL && prev != NULL) {
14121 +- bgrp = cgroup_to_bfqio(cgroup);
14122 ++ BUG_ON(css == NULL && leaf != NULL);
14123 ++ if (css != NULL && prev != NULL) {
14124 ++ bgrp = css_to_bfqio(css);
14125 + bfqg = bfqio_lookup_group(bgrp, bfqd);
14126 + bfq_group_set_parent(prev, bfqg);
14127 + }
14128 +@@ -233,18 +233,18 @@ static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup,
14129 + * have been successful.
14130 + */
14131 + static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
14132 +- struct cgroup *cgroup)
14133 ++ struct cgroup_subsys_state *css)
14134 + {
14135 +- struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
14136 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14137 + struct bfq_group *bfqg;
14138 +
14139 + bfqg = bfqio_lookup_group(bgrp, bfqd);
14140 + if (bfqg != NULL)
14141 + return bfqg;
14142 +
14143 +- bfqg = bfq_group_chain_alloc(bfqd, cgroup);
14144 ++ bfqg = bfq_group_chain_alloc(bfqd, css);
14145 + if (bfqg != NULL)
14146 +- bfq_group_chain_link(bfqd, cgroup, bfqg);
14147 ++ bfq_group_chain_link(bfqd, css, bfqg);
14148 + else
14149 + bfqg = bfqd->root_group;
14150 +
14151 +@@ -315,8 +315,8 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14152 + * time here, at the price of slightly more complex code.
14153 + */
14154 + static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14155 +- struct bfq_io_cq *bic,
14156 +- struct cgroup *cgroup)
14157 ++ struct bfq_io_cq *bic,
14158 ++ struct cgroup_subsys_state *css)
14159 + {
14160 + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
14161 + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
14162 +@@ -324,9 +324,9 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14163 + struct bfq_group *bfqg;
14164 + struct bfqio_cgroup *bgrp;
14165 +
14166 +- bgrp = cgroup_to_bfqio(cgroup);
14167 ++ bgrp = css_to_bfqio(css);
14168 +
14169 +- bfqg = bfq_find_alloc_group(bfqd, cgroup);
14170 ++ bfqg = bfq_find_alloc_group(bfqd, css);
14171 + if (async_bfqq != NULL) {
14172 + entity = &async_bfqq->entity;
14173 +
14174 +@@ -357,14 +357,14 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
14175 + * moved into its new parent group.
14176 + */
14177 + static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
14178 +- struct cgroup *cgroup)
14179 ++ struct cgroup_subsys_state *css)
14180 + {
14181 + struct bfq_data *bfqd;
14182 + unsigned long uninitialized_var(flags);
14183 +
14184 + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags);
14185 + if (bfqd != NULL) {
14186 +- __bfq_bic_change_cgroup(bfqd, bic, cgroup);
14187 ++ __bfq_bic_change_cgroup(bfqd, bic, css);
14188 + bfq_put_bfqd_unlock(bfqd, &flags);
14189 + }
14190 + }
14191 +@@ -394,13 +394,13 @@ static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
14192 + {
14193 + struct bfq_data *bfqd = bic_to_bfqd(bic);
14194 + struct bfq_group *bfqg;
14195 +- struct cgroup *cgroup;
14196 ++ struct cgroup_subsys_state *css;
14197 +
14198 + BUG_ON(bfqd == NULL);
14199 +
14200 + rcu_read_lock();
14201 +- cgroup = task_cgroup(current, bfqio_subsys_id);
14202 +- bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup);
14203 ++ css = task_css(current, bfqio_subsys_id);
14204 ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
14205 + rcu_read_unlock();
14206 +
14207 + return bfqg;
14208 +@@ -622,17 +622,16 @@ static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
14209 + }
14210 +
14211 + #define SHOW_FUNCTION(__VAR) \
14212 +-static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \
14213 ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
14214 + struct cftype *cftype) \
14215 + { \
14216 +- struct bfqio_cgroup *bgrp; \
14217 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
14218 + u64 ret = -ENODEV; \
14219 + \
14220 + mutex_lock(&bfqio_mutex); \
14221 +- if (bfqio_is_removed(cgroup)) \
14222 ++ if (bfqio_is_removed(bgrp)) \
14223 + goto out_unlock; \
14224 + \
14225 +- bgrp = cgroup_to_bfqio(cgroup); \
14226 + spin_lock_irq(&bgrp->lock); \
14227 + ret = bgrp->__VAR; \
14228 + spin_unlock_irq(&bgrp->lock); \
14229 +@@ -648,11 +647,11 @@ SHOW_FUNCTION(ioprio_class);
14230 + #undef SHOW_FUNCTION
14231 +
14232 + #define STORE_FUNCTION(__VAR, __MIN, __MAX) \
14233 +-static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
14234 ++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
14235 + struct cftype *cftype, \
14236 + u64 val) \
14237 + { \
14238 +- struct bfqio_cgroup *bgrp; \
14239 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
14240 + struct bfq_group *bfqg; \
14241 + int ret = -EINVAL; \
14242 + \
14243 +@@ -661,12 +660,10 @@ static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \
14244 + \
14245 + ret = -ENODEV; \
14246 + mutex_lock(&bfqio_mutex); \
14247 +- if (bfqio_is_removed(cgroup)) \
14248 ++ if (bfqio_is_removed(bgrp)) \
14249 + goto out_unlock; \
14250 + ret = 0; \
14251 + \
14252 +- bgrp = cgroup_to_bfqio(cgroup); \
14253 +- \
14254 + spin_lock_irq(&bgrp->lock); \
14255 + bgrp->__VAR = (unsigned short)val; \
14256 + hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
14257 +@@ -713,11 +710,11 @@ static struct cftype bfqio_files[] = {
14258 + { }, /* terminate */
14259 + };
14260 +
14261 +-static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
14262 ++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state *parent_css)
14263 + {
14264 + struct bfqio_cgroup *bgrp;
14265 +
14266 +- if (cgroup->parent != NULL) {
14267 ++ if (parent_css != NULL) {
14268 + bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
14269 + if (bgrp == NULL)
14270 + return ERR_PTR(-ENOMEM);
14271 +@@ -740,13 +737,14 @@ static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup)
14272 + * behavior is that a group containing a task that forked using CLONE_IO
14273 + * will not be destroyed until the tasks sharing the ioc die.
14274 + */
14275 +-static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14276 ++static int bfqio_can_attach(struct cgroup_subsys_state *css,
14277 ++ struct cgroup_taskset *tset)
14278 + {
14279 + struct task_struct *task;
14280 + struct io_context *ioc;
14281 + int ret = 0;
14282 +
14283 +- cgroup_taskset_for_each(task, cgroup, tset) {
14284 ++ cgroup_taskset_for_each(task, css, tset) {
14285 + /* task_lock() is needed to avoid races with exit_io_context() */
14286 + task_lock(task);
14287 + ioc = task->io_context;
14288 +@@ -766,7 +764,8 @@ static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14289 + return ret;
14290 + }
14291 +
14292 +-static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14293 ++static void bfqio_attach(struct cgroup_subsys_state *css,
14294 ++ struct cgroup_taskset *tset)
14295 + {
14296 + struct task_struct *task;
14297 + struct io_context *ioc;
14298 +@@ -776,7 +775,7 @@ static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14299 + * IMPORTANT NOTE: The move of more than one process at a time to a
14300 + * new group has not yet been tested.
14301 + */
14302 +- cgroup_taskset_for_each(task, cgroup, tset) {
14303 ++ cgroup_taskset_for_each(task, css, tset) {
14304 + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
14305 + if (ioc) {
14306 + /*
14307 +@@ -787,16 +786,16 @@ static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset)
14308 + if (!strncmp(icq->q->elevator->type->elevator_name,
14309 + "bfq", ELV_NAME_MAX))
14310 + bfq_bic_change_cgroup(icq_to_bic(icq),
14311 +- cgroup);
14312 ++ css);
14313 + rcu_read_unlock();
14314 + put_io_context(ioc);
14315 + }
14316 + }
14317 + }
14318 +
14319 +-static void bfqio_destroy(struct cgroup *cgroup)
14320 ++static void bfqio_destroy(struct cgroup_subsys_state *css)
14321 + {
14322 +- struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup);
14323 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14324 + struct hlist_node *tmp;
14325 + struct bfq_group *bfqg;
14326 +
14327 +@@ -815,9 +814,31 @@ static void bfqio_destroy(struct cgroup *cgroup)
14328 + kfree(bgrp);
14329 + }
14330 +
14331 ++static int bfqio_css_online(struct cgroup_subsys_state *css)
14332 ++{
14333 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14334 ++
14335 ++ mutex_lock(&bfqio_mutex);
14336 ++ bgrp->online = true;
14337 ++ mutex_unlock(&bfqio_mutex);
14338 ++
14339 ++ return 0;
14340 ++}
14341 ++
14342 ++static void bfqio_css_offline(struct cgroup_subsys_state *css)
14343 ++{
14344 ++ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
14345 ++
14346 ++ mutex_lock(&bfqio_mutex);
14347 ++ bgrp->online = false;
14348 ++ mutex_unlock(&bfqio_mutex);
14349 ++}
14350 ++
14351 + struct cgroup_subsys bfqio_subsys = {
14352 + .name = "bfqio",
14353 + .css_alloc = bfqio_create,
14354 ++ .css_online = bfqio_css_online,
14355 ++ .css_offline = bfqio_css_offline,
14356 + .can_attach = bfqio_can_attach,
14357 + .attach = bfqio_attach,
14358 + .css_free = bfqio_destroy,
14359 +diff --git a/block/bfq.h b/block/bfq.h
14360 +index bb52975..885e62c 100644
14361 +--- a/block/bfq.h
14362 ++++ b/block/bfq.h
14363 +@@ -510,6 +510,7 @@ struct bfq_group {
14364 + /**
14365 + * struct bfqio_cgroup - bfq cgroup data structure.
14366 + * @css: subsystem state for bfq in the containing cgroup.
14367 ++ * @online: flag set while the cgroup is online.
14368 + * @weight: cgroup weight.
14369 + * @ioprio: cgroup ioprio.
14370 + * @ioprio_class: cgroup ioprio_class.
14371 +@@ -521,6 +522,7 @@ struct bfq_group {
14372 + */
14373 + struct bfqio_cgroup {
14374 + struct cgroup_subsys_state css;
14375 ++ bool online;
14376 +
14377 + unsigned short weight, ioprio, ioprio_class;
14378 +
14379 +--
14380 +1.8.1.4
14381 +
14382
14383
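For readers following the conversion above: the recurring pattern in this patch is that the controller-private struct embeds the subsystem state, container_of() recovers it from a cgroup_subsys_state pointer, the hierarchy is walked through the state's parent pointer rather than through struct cgroup, and removal is tracked with an online flag instead of the CGRP_DEAD bit. The following is a minimal user-space sketch of that shape, for illustration only; "mock_css" and "mock_bfqio_cgroup" are simplified stand-ins, not the real kernel structures or API.

/*
 * Illustrative user-space sketch of the css conversion pattern in this
 * patch.  All types and names are simplified stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct mock_css {
	struct mock_css *parent;	/* walked as css->parent in the patch */
};

struct mock_bfqio_cgroup {
	struct mock_css css;		/* embedded, like cgroup_subsys_state */
	bool online;			/* set/cleared by the online/offline hooks */
	unsigned short weight;
};

/* Mirrors css_to_bfqio(): container_of() guarded against a NULL css. */
static struct mock_bfqio_cgroup *css_to_bfqio(struct mock_css *css)
{
	return css ? container_of(css, struct mock_bfqio_cgroup, css) : NULL;
}

/* Mirrors bfqio_is_removed(): the online flag replaces the CGRP_DEAD test. */
static bool bfqio_is_removed(struct mock_bfqio_cgroup *bgrp)
{
	return bgrp ? !bgrp->online : false;
}

int main(void)
{
	struct mock_bfqio_cgroup root = {
		.css = { .parent = NULL }, .online = true, .weight = 100
	};
	struct mock_bfqio_cgroup child = {
		.css = { .parent = &root.css }, .online = true, .weight = 50
	};
	struct mock_css *css;

	/* Walk the hierarchy through the state's parent pointer, as the
	 * reworked chain alloc/link loops do, instead of cgroup->parent. */
	for (css = &child.css; css != NULL; css = css->parent) {
		struct mock_bfqio_cgroup *bgrp = css_to_bfqio(css);
		printf("weight=%u removed=%d\n",
		       (unsigned)bgrp->weight, bfqio_is_removed(bgrp));
	}
	return 0;
}
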
14384 Property changes on: genpatches-2.6/trunk/3.12/5000_BFQ-4-block-Switch-from-BFQ-v6r2-for-3.11.0-to-BFQ-v6r2-fo.patch
14385 ___________________________________________________________________
14386 Added: svn:executable
14387 ## -0,0 +1 ##
14388 +*
14389 \ No newline at end of property
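
As a footnote on the accessor rework in the patch above: the SHOW_FUNCTION/STORE_FUNCTION macros now generate one read and one write handler per attribute, each taking a cgroup_subsys_state pointer and converting it once with css_to_bfqio(). The sketch below reproduces only the macro-generated getter/setter shape in user space; a pthread mutex stands in for the kernel locking, "mock_bgrp" stands in for bfqio_cgroup, and the css conversion step is omitted, so this is an illustration of the pattern rather than the kernel API.

/*
 * Illustrative sketch: macro-generated per-attribute read/write handlers
 * in the style of SHOW_FUNCTION/STORE_FUNCTION.  Simplified stand-ins only.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct mock_bgrp {
	pthread_mutex_t lock;
	unsigned short weight, ioprio, ioprio_class;
};

#define SHOW_FUNCTION(__VAR)						\
static uint64_t mock_##__VAR##_read(struct mock_bgrp *bgrp)		\
{									\
	uint64_t ret;							\
	pthread_mutex_lock(&bgrp->lock);				\
	ret = bgrp->__VAR;						\
	pthread_mutex_unlock(&bgrp->lock);				\
	return ret;							\
}

#define STORE_FUNCTION(__VAR, __MIN, __MAX)				\
static int mock_##__VAR##_write(struct mock_bgrp *bgrp, uint64_t val)	\
{									\
	if (val < (__MIN) || val > (__MAX))				\
		return -1;						\
	pthread_mutex_lock(&bgrp->lock);				\
	bgrp->__VAR = (unsigned short)val;				\
	pthread_mutex_unlock(&bgrp->lock);				\
	return 0;							\
}

SHOW_FUNCTION(weight)
SHOW_FUNCTION(ioprio)
STORE_FUNCTION(weight, 1, 1000)
STORE_FUNCTION(ioprio, 0, 7)

int main(void)
{
	struct mock_bgrp bgrp = {
		.lock = PTHREAD_MUTEX_INITIALIZER, .weight = 100, .ioprio = 4
	};

	mock_weight_write(&bgrp, 200);
	printf("weight=%llu ioprio=%llu\n",
	       (unsigned long long)mock_weight_read(&bgrp),
	       (unsigned long long)mock_ioprio_read(&bgrp));
	return 0;
}

Build with, for example, cc -pthread; the point is only the macro shape, with bounds checking in the store path and the lock held around each field access, as in the generated kernel handlers.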