Gentoo Archives: gentoo-commits

From: "Mike Pagano (mpagano)" <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] linux-patches r2716 - genpatches-2.6/trunk/3.14
Date: Mon, 31 Mar 2014 12:03:23
Message-Id: 20140331120315.2817A2005C@flycatcher.gentoo.org
1 Author: mpagano
2 Date: 2014-03-31 12:03:14 +0000 (Mon, 31 Mar 2014)
3 New Revision: 2716
4
5 Removed:
6 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
7 genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
8 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
9 genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
10 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
11 genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
12 genpatches-2.6/trunk/3.14/5000_enable-additional-cpu-optimizations-for-gcc.patch
13 Modified:
14 genpatches-2.6/trunk/3.14/0000_README
15 Log:
16 Temporary removal of BFQ patches until compatible patches are committed
17
18 Modified: genpatches-2.6/trunk/3.14/0000_README
19 ===================================================================
20 --- genpatches-2.6/trunk/3.14/0000_README 2014-03-26 23:50:52 UTC (rev 2715)
21 +++ genpatches-2.6/trunk/3.14/0000_README 2014-03-31 12:03:14 UTC (rev 2716)
22 @@ -77,19 +77,3 @@
23 Patch: 4567_distro-Gentoo-Kconfig.patch
24 From: Tom Wijsman <TomWij@g.o>
25 Desc: Add Gentoo Linux support config settings and defaults.
26 -
27 -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r1-3.13.patch
28 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
29 -Desc: BFQ v7r1 patch 1 for 3.13: Build, cgroups and kconfig bits
30 -
31 -Patch: 5000_BFQ-2-block-introduce-the-v7r1-I-O-sched-for-3.13.patch1
32 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
33 -Desc: BFQ v7r1 patch 2 for 3.13: BFQ Scheduler
34 -
35 -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r1-for-3.13.0.patch
36 -From: http://algo.ing.unimo.it/people/paolo/disk_sched/
37 -Desc: BFQ v7r1 patch 3 for 3.13: Early Queue Merge (EQM)
38 -
39 -Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
40 -From: https://github.com/graysky2/kernel_gcc_patch/
41 -Desc: Kernel patch enables gcc optimizations for additional CPUs.
42
43 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
44 ===================================================================
45 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-03-26 23:50:52 UTC (rev 2715)
46 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-03-31 12:03:14 UTC (rev 2716)
47 @@ -1,104 +0,0 @@
48 -From 7f029ed2a02bea57b791c032d6242129c3372a84 Mon Sep 17 00:00:00 2001
49 -From: Paolo Valente <paolo.valente@×××××××.it>
50 -Date: Tue, 3 Sep 2013 16:50:42 +0200
51 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7-3.13
52 -
53 -Update Kconfig.iosched and do the related Makefile changes to include
54 -kernel configuration options for BFQ. Also add the bfqio controller
55 -to the cgroups subsystem.
56 -
57 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
58 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
59 ----
60 - block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
61 - block/Makefile | 1 +
62 - include/linux/cgroup_subsys.h | 4 ++++
63 - 3 files changed, 37 insertions(+)
64 -
65 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
66 -index 421bef9..8f552ba 100644
67 ---- a/block/Kconfig.iosched
68 -+++ b/block/Kconfig.iosched
69 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
70 - ---help---
71 - Enable group IO scheduling in CFQ.
72 -
73 -+config IOSCHED_BFQ
74 -+ tristate "BFQ I/O scheduler"
75 -+ default n
76 -+ ---help---
77 -+ The BFQ I/O scheduler tries to distribute bandwidth among
78 -+ all processes according to their weights.
79 -+ It aims at distributing the bandwidth as desired, independently of
80 -+ the disk parameters and with any workload. It also tries to
81 -+ guarantee low latency to interactive and soft real-time
82 -+ applications. If compiled built-in (saying Y here), BFQ can
83 -+ be configured to support hierarchical scheduling.
84 -+
85 -+config CGROUP_BFQIO
86 -+ bool "BFQ hierarchical scheduling support"
87 -+ depends on CGROUPS && IOSCHED_BFQ=y
88 -+ default n
89 -+ ---help---
90 -+ Enable hierarchical scheduling in BFQ, using the cgroups
91 -+ filesystem interface. The name of the subsystem will be
92 -+ bfqio.
93 -+
94 - choice
95 - prompt "Default I/O scheduler"
96 - default DEFAULT_CFQ
97 -@@ -52,6 +73,16 @@ choice
98 - config DEFAULT_CFQ
99 - bool "CFQ" if IOSCHED_CFQ=y
100 -
101 -+ config DEFAULT_BFQ
102 -+ bool "BFQ" if IOSCHED_BFQ=y
103 -+ help
104 -+ Selects BFQ as the default I/O scheduler which will be
105 -+ used by default for all block devices.
106 -+ The BFQ I/O scheduler aims at distributing the bandwidth
107 -+ as desired, independently of the disk parameters and with
108 -+ any workload. It also tries to guarantee low latency to
109 -+ interactive and soft real-time applications.
110 -+
111 - config DEFAULT_NOOP
112 - bool "No-op"
113 -
114 -@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
115 - string
116 - default "deadline" if DEFAULT_DEADLINE
117 - default "cfq" if DEFAULT_CFQ
118 -+ default "bfq" if DEFAULT_BFQ
119 - default "noop" if DEFAULT_NOOP
120 -
121 - endmenu
122 -diff --git a/block/Makefile b/block/Makefile
123 -index 20645e8..cbd83fb 100644
124 ---- a/block/Makefile
125 -+++ b/block/Makefile
126 -@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
127 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
128 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
129 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
130 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
131 -
132 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
133 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
134 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
135 -index b613ffd..43c5dc9 100644
136 ---- a/include/linux/cgroup_subsys.h
137 -+++ b/include/linux/cgroup_subsys.h
138 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
139 - SUBSYS(blkio)
140 - #endif
141 -
142 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
143 -+SUBSYS(bfqio)
144 -+#endif
145 -+
146 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
147 - SUBSYS(perf)
148 - #endif
149 ---
150 -1.8.5.2
151 -
152
153 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
154 ===================================================================
155 --- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-03-26 23:50:52 UTC (rev 2715)
156 +++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-03-31 12:03:14 UTC (rev 2716)
157 @@ -1,104 +0,0 @@
158 -From ae1b820a5286601aa9d5426459f8f3de658342b4 Mon Sep 17 00:00:00 2001
159 -From: Paolo Valente <paolo.valente@×××××××.it>
160 -Date: Tue, 3 Sep 2013 16:50:42 +0200
161 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r1-3.13
162 -
163 -Update Kconfig.iosched and do the related Makefile changes to include
164 -kernel configuration options for BFQ. Also add the bfqio controller
165 -to the cgroups subsystem.
166 -
167 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
168 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
169 ----
170 - block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++
171 - block/Makefile | 1 +
172 - include/linux/cgroup_subsys.h | 4 ++++
173 - 3 files changed, 37 insertions(+)
174 -
175 -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
176 -index 421bef9..8f552ba 100644
177 ---- a/block/Kconfig.iosched
178 -+++ b/block/Kconfig.iosched
179 -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED
180 - ---help---
181 - Enable group IO scheduling in CFQ.
182 -
183 -+config IOSCHED_BFQ
184 -+ tristate "BFQ I/O scheduler"
185 -+ default n
186 -+ ---help---
187 -+ The BFQ I/O scheduler tries to distribute bandwidth among
188 -+ all processes according to their weights.
189 -+ It aims at distributing the bandwidth as desired, independently of
190 -+ the disk parameters and with any workload. It also tries to
191 -+ guarantee low latency to interactive and soft real-time
192 -+ applications. If compiled built-in (saying Y here), BFQ can
193 -+ be configured to support hierarchical scheduling.
194 -+
195 -+config CGROUP_BFQIO
196 -+ bool "BFQ hierarchical scheduling support"
197 -+ depends on CGROUPS && IOSCHED_BFQ=y
198 -+ default n
199 -+ ---help---
200 -+ Enable hierarchical scheduling in BFQ, using the cgroups
201 -+ filesystem interface. The name of the subsystem will be
202 -+ bfqio.
203 -+
204 - choice
205 - prompt "Default I/O scheduler"
206 - default DEFAULT_CFQ
207 -@@ -52,6 +73,16 @@ choice
208 - config DEFAULT_CFQ
209 - bool "CFQ" if IOSCHED_CFQ=y
210 -
211 -+ config DEFAULT_BFQ
212 -+ bool "BFQ" if IOSCHED_BFQ=y
213 -+ help
214 -+ Selects BFQ as the default I/O scheduler which will be
215 -+ used by default for all block devices.
216 -+ The BFQ I/O scheduler aims at distributing the bandwidth
217 -+ as desired, independently of the disk parameters and with
218 -+ any workload. It also tries to guarantee low latency to
219 -+ interactive and soft real-time applications.
220 -+
221 - config DEFAULT_NOOP
222 - bool "No-op"
223 -
224 -@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED
225 - string
226 - default "deadline" if DEFAULT_DEADLINE
227 - default "cfq" if DEFAULT_CFQ
228 -+ default "bfq" if DEFAULT_BFQ
229 - default "noop" if DEFAULT_NOOP
230 -
231 - endmenu
232 -diff --git a/block/Makefile b/block/Makefile
233 -index 20645e8..cbd83fb 100644
234 ---- a/block/Makefile
235 -+++ b/block/Makefile
236 -@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
237 - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
238 - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
239 - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
240 -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o
241 -
242 - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
243 - obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
244 -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
245 -index b613ffd..43c5dc9 100644
246 ---- a/include/linux/cgroup_subsys.h
247 -+++ b/include/linux/cgroup_subsys.h
248 -@@ -39,6 +39,10 @@ SUBSYS(net_cls)
249 - SUBSYS(blkio)
250 - #endif
251 -
252 -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO)
253 -+SUBSYS(bfqio)
254 -+#endif
255 -+
256 - #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF)
257 - SUBSYS(perf)
258 - #endif
259 ---
260 -1.8.5.2
261 -
262
263 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
264 ===================================================================
265 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-03-26 23:50:52 UTC (rev 2715)
266 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-03-31 12:03:14 UTC (rev 2716)
267 @@ -1,6008 +0,0 @@
268 -From 3747f129106ce58fbad1b8f05cc836a6addd8588 Mon Sep 17 00:00:00 2001
269 -From: Paolo Valente <paolo.valente@×××××××.it>
270 -Date: Thu, 9 May 2013 19:10:02 +0200
271 -Subject: [PATCH 2/3] block: introduce the BFQ-v7 I/O sched for 3.13
272 -
273 -Add the BFQ-v7 I/O scheduler to 3.13.
274 -The general structure is borrowed from CFQ, as much of the code for
275 -handling I/O contexts. Over time, several useful features have been
276 -ported from CFQ as well (details in the changelog in README.BFQ). A
277 -(bfq_)queue is associated to each task doing I/O on a device, and each
278 -time a scheduling decision has to be made a queue is selected and served
279 -until it expires.
280 -
281 - - Slices are given in the service domain: tasks are assigned
282 - budgets, measured in number of sectors. Once granted the disk, a task
283 - must however consume its assigned budget within a configurable
284 - maximum time (by default, the maximum possible value of the
285 - budgets is automatically computed to comply with this timeout).
286 - This allows the desired latency vs "throughput boosting" tradeoff
287 - to be set.
288 -
289 - - Budgets are scheduled according to a variant of WF2Q+, implemented
290 - using an augmented rb-tree to take eligibility into account while
291 - preserving an O(log N) overall complexity.
292 -
293 - - A low-latency tunable is provided; if enabled, both interactive
294 - and soft real-time applications are guaranteed a very low latency.
295 -
296 - - Latency guarantees are preserved also in the presence of NCQ.
297 -
298 - - Also with flash-based devices, a high throughput is achieved
299 - while still preserving latency guarantees.
300 -
301 - - BFQ features Early Queue Merge (EQM), a sort of fusion of the
302 - cooperating-queue-merging and the preemption mechanisms present
303 - in CFQ. EQM is in fact a unified mechanism that tries to get a
304 - sequential read pattern, and hence a high throughput, with any
305 - set of processes performing interleaved I/O over a contiguous
306 - sequence of sectors.
307 -
308 - - BFQ supports full hierarchical scheduling, exporting a cgroups
309 - interface. Since each node has a full scheduler, each group can
310 - be assigned its own weight.
311 -
312 - - If the cgroups interface is not used, only I/O priorities can be
313 - assigned to processes, with ioprio values mapped to weights
314 - with the relation weight = IOPRIO_BE_NR - ioprio.
315 -
316 - - ioprio classes are served in strict priority order, i.e., lower
317 - priority queues are not served as long as there are higher
318 - priority queues. Among queues in the same class the bandwidth is
319 - distributed in proportion to the weight of each queue. A very
320 - thin extra bandwidth is however guaranteed to the Idle class, to
321 - prevent it from starving.
322 -
323 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
324 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
325 ----
326 - block/bfq-cgroup.c | 910 ++++++++++++++
327 - block/bfq-ioc.c | 36 +
328 - block/bfq-iosched.c | 3268 +++++++++++++++++++++++++++++++++++++++++++++++++++
329 - block/bfq-sched.c | 1077 +++++++++++++++++
330 - block/bfq.h | 614 ++++++++++
331 - 5 files changed, 5905 insertions(+)
332 - create mode 100644 block/bfq-cgroup.c
333 - create mode 100644 block/bfq-ioc.c
334 - create mode 100644 block/bfq-iosched.c
335 - create mode 100644 block/bfq-sched.c
336 - create mode 100644 block/bfq.h
337 -
338 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
339 -new file mode 100644
340 -index 0000000..b889acf
341 ---- /dev/null
342 -+++ b/block/bfq-cgroup.c
343 -@@ -0,0 +1,910 @@
344 -+/*
345 -+ * BFQ: CGROUPS support.
346 -+ *
347 -+ * Based on ideas and code from CFQ:
348 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
349 -+ *
350 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
351 -+ * Paolo Valente <paolo.valente@×××××××.it>
352 -+ *
353 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
354 -+ *
355 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
356 -+ */
357 -+
358 -+#ifdef CONFIG_CGROUP_BFQIO
359 -+
360 -+static DEFINE_MUTEX(bfqio_mutex);
361 -+
362 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
363 -+{
364 -+ return bgrp ? !bgrp->online : false;
365 -+}
366 -+
367 -+static struct bfqio_cgroup bfqio_root_cgroup = {
368 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
369 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
370 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
371 -+};
372 -+
373 -+static inline void bfq_init_entity(struct bfq_entity *entity,
374 -+ struct bfq_group *bfqg)
375 -+{
376 -+ entity->weight = entity->new_weight;
377 -+ entity->orig_weight = entity->new_weight;
378 -+ entity->ioprio = entity->new_ioprio;
379 -+ entity->ioprio_class = entity->new_ioprio_class;
380 -+ entity->parent = bfqg->my_entity;
381 -+ entity->sched_data = &bfqg->sched_data;
382 -+}
383 -+
384 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
385 -+{
386 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
387 -+}
388 -+
389 -+/*
390 -+ * Search the bfq_group for bfqd in the hash table (for now only a list)
391 -+ * of bgrp. Must be called under rcu_read_lock().
392 -+ */
393 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
394 -+ struct bfq_data *bfqd)
395 -+{
396 -+ struct bfq_group *bfqg;
397 -+ void *key;
398 -+
399 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
400 -+ key = rcu_dereference(bfqg->bfqd);
401 -+ if (key == bfqd)
402 -+ return bfqg;
403 -+ }
404 -+
405 -+ return NULL;
406 -+}
407 -+
408 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
409 -+ struct bfq_group *bfqg)
410 -+{
411 -+ struct bfq_entity *entity = &bfqg->entity;
412 -+
413 -+ /*
414 -+ * If the weight of the entity has never been set via the sysfs
415 -+ * interface, then bgrp->weight == 0. In this case we initialize
416 -+ * the weight from the current ioprio value. Otherwise, the group
417 -+ * weight, if set, has priority over the ioprio value.
418 -+ */
419 -+ if (bgrp->weight == 0) {
420 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
421 -+ entity->new_ioprio = bgrp->ioprio;
422 -+ } else {
423 -+ entity->new_weight = bgrp->weight;
424 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
425 -+ }
426 -+ entity->orig_weight = entity->weight = entity->new_weight;
427 -+ entity->ioprio = entity->new_ioprio;
428 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
429 -+ entity->my_sched_data = &bfqg->sched_data;
430 -+}
431 -+
432 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
433 -+ struct bfq_group *parent)
434 -+{
435 -+ struct bfq_entity *entity;
436 -+
437 -+ BUG_ON(parent == NULL);
438 -+ BUG_ON(bfqg == NULL);
439 -+
440 -+ entity = &bfqg->entity;
441 -+ entity->parent = parent->my_entity;
442 -+ entity->sched_data = &parent->sched_data;
443 -+}
444 -+
445 -+/**
446 -+ * bfq_group_chain_alloc - allocate a chain of groups.
447 -+ * @bfqd: queue descriptor.
448 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
449 -+ *
450 -+ * Allocate a chain of groups starting from the one belonging to
451 -+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
452 -+ * to the root has already an allocated group on @bfqd.
453 -+ */
454 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
455 -+ struct cgroup_subsys_state *css)
456 -+{
457 -+ struct bfqio_cgroup *bgrp;
458 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
459 -+
460 -+ for (; css != NULL; css = css->parent) {
461 -+ bgrp = css_to_bfqio(css);
462 -+
463 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
464 -+ if (bfqg != NULL) {
465 -+ /*
466 -+ * All the cgroups in the path from there to the
467 -+ * root must have a bfq_group for bfqd, so we don't
468 -+ * need any more allocations.
469 -+ */
470 -+ break;
471 -+ }
472 -+
473 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
474 -+ if (bfqg == NULL)
475 -+ goto cleanup;
476 -+
477 -+ bfq_group_init_entity(bgrp, bfqg);
478 -+ bfqg->my_entity = &bfqg->entity;
479 -+
480 -+ if (leaf == NULL) {
481 -+ leaf = bfqg;
482 -+ prev = leaf;
483 -+ } else {
484 -+ bfq_group_set_parent(prev, bfqg);
485 -+ /*
486 -+ * Build a list of allocated nodes using the bfqd
487 -+ * field, which is still unused and will be initialized
488 -+ * only after the node is connected.
489 -+ */
490 -+ prev->bfqd = bfqg;
491 -+ prev = bfqg;
492 -+ }
493 -+ }
494 -+
495 -+ return leaf;
496 -+
497 -+cleanup:
498 -+ while (leaf != NULL) {
499 -+ prev = leaf;
500 -+ leaf = leaf->bfqd;
501 -+ kfree(prev);
502 -+ }
503 -+
504 -+ return NULL;
505 -+}
506 -+
507 -+/**
508 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
509 -+ * @bfqd: the queue descriptor.
510 -+ * @css: the leaf cgroup_subsys_state to start from.
511 -+ * @leaf: the leaf group (to be associated to @cgroup).
512 -+ *
513 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
514 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
515 -+ * hierarchy that already has a group associated to @bfqd all the nodes
516 -+ * in the path to the root cgroup have one too.
517 -+ *
518 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
519 -+ * per device) while the bfqio_cgroup lock protects the list of groups
520 -+ * belonging to the same cgroup.
521 -+ */
522 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
523 -+ struct cgroup_subsys_state *css,
524 -+ struct bfq_group *leaf)
525 -+{
526 -+ struct bfqio_cgroup *bgrp;
527 -+ struct bfq_group *bfqg, *next, *prev = NULL;
528 -+ unsigned long flags;
529 -+
530 -+ assert_spin_locked(bfqd->queue->queue_lock);
531 -+
532 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
533 -+ bgrp = css_to_bfqio(css);
534 -+ next = leaf->bfqd;
535 -+
536 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
537 -+ BUG_ON(bfqg != NULL);
538 -+
539 -+ spin_lock_irqsave(&bgrp->lock, flags);
540 -+
541 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
542 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
543 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
544 -+
545 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
546 -+
547 -+ prev = leaf;
548 -+ leaf = next;
549 -+ }
550 -+
551 -+ BUG_ON(css == NULL && leaf != NULL);
552 -+ if (css != NULL && prev != NULL) {
553 -+ bgrp = css_to_bfqio(css);
554 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
555 -+ bfq_group_set_parent(prev, bfqg);
556 -+ }
557 -+}
558 -+
559 -+/**
560 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
561 -+ * @bfqd: queue descriptor.
562 -+ * @cgroup: cgroup being searched for.
563 -+ *
564 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
565 -+ * necessary. When a group is returned all the cgroups in the path
566 -+ * to the root have a group associated to @bfqd.
567 -+ *
568 -+ * If the allocation fails, return the root group: this breaks guarantees
569 -+ * but is a safe fallback. If this loss becomes a problem it can be
570 -+ * mitigated using the equivalent weight (given by the product of the
571 -+ * weights of the groups in the path from @group to the root) in the
572 -+ * root scheduler.
573 -+ *
574 -+ * We allocate all the missing nodes in the path from the leaf cgroup
575 -+ * to the root and we connect the nodes only after all the allocations
576 -+ * have been successful.
577 -+ */
578 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
579 -+ struct cgroup_subsys_state *css)
580 -+{
581 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
582 -+ struct bfq_group *bfqg;
583 -+
584 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
585 -+ if (bfqg != NULL)
586 -+ return bfqg;
587 -+
588 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
589 -+ if (bfqg != NULL)
590 -+ bfq_group_chain_link(bfqd, css, bfqg);
591 -+ else
592 -+ bfqg = bfqd->root_group;
593 -+
594 -+ return bfqg;
595 -+}
596 -+
597 -+/**
598 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
599 -+ * @bfqd: queue descriptor.
600 -+ * @bfqq: the queue to move.
601 -+ * @entity: @bfqq's entity.
602 -+ * @bfqg: the group to move to.
603 -+ *
604 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
605 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
606 -+ *
607 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
608 -+ * not disappear (for now this just means that we are called under
609 -+ * rcu_read_lock()).
610 -+ */
611 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
612 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
613 -+{
614 -+ int busy, resume;
615 -+
616 -+ busy = bfq_bfqq_busy(bfqq);
617 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
618 -+
619 -+ BUG_ON(resume && !entity->on_st);
620 -+ BUG_ON(busy && !resume && entity->on_st &&
621 -+ bfqq != bfqd->in_service_queue);
622 -+
623 -+ if (busy) {
624 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
625 -+
626 -+ if (!resume)
627 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
628 -+ else
629 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
630 -+ } else if (entity->on_st)
631 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
632 -+
633 -+ /*
634 -+ * Here we use a reference to bfqg. We don't need a refcounter
635 -+ * as the cgroup reference will not be dropped, so that its
636 -+ * destroy() callback will not be invoked.
637 -+ */
638 -+ entity->parent = bfqg->my_entity;
639 -+ entity->sched_data = &bfqg->sched_data;
640 -+
641 -+ if (busy && resume)
642 -+ bfq_activate_bfqq(bfqd, bfqq);
643 -+
644 -+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
645 -+ bfq_schedule_dispatch(bfqd);
646 -+}
647 -+
648 -+/**
649 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
650 -+ * @bfqd: the queue descriptor.
651 -+ * @bic: the bic to move.
652 -+ * @cgroup: the cgroup to move to.
653 -+ *
654 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
655 -+ * has to make sure that the reference to cgroup is valid across the call.
656 -+ *
657 -+ * NOTE: an alternative approach might have been to store the current
658 -+ * cgroup in bfqq and getting a reference to it, reducing the lookup
659 -+ * time here, at the price of slightly more complex code.
660 -+ */
661 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
662 -+ struct bfq_io_cq *bic,
663 -+ struct cgroup_subsys_state *css)
664 -+{
665 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
666 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
667 -+ struct bfq_entity *entity;
668 -+ struct bfq_group *bfqg;
669 -+ struct bfqio_cgroup *bgrp;
670 -+
671 -+ bgrp = css_to_bfqio(css);
672 -+
673 -+ bfqg = bfq_find_alloc_group(bfqd, css);
674 -+ if (async_bfqq != NULL) {
675 -+ entity = &async_bfqq->entity;
676 -+
677 -+ if (entity->sched_data != &bfqg->sched_data) {
678 -+ bic_set_bfqq(bic, NULL, 0);
679 -+ bfq_log_bfqq(bfqd, async_bfqq,
680 -+ "bic_change_group: %p %d",
681 -+ async_bfqq, atomic_read(&async_bfqq->ref));
682 -+ bfq_put_queue(async_bfqq);
683 -+ }
684 -+ }
685 -+
686 -+ if (sync_bfqq != NULL) {
687 -+ entity = &sync_bfqq->entity;
688 -+ if (entity->sched_data != &bfqg->sched_data)
689 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
690 -+ }
691 -+
692 -+ return bfqg;
693 -+}
694 -+
695 -+/**
696 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
697 -+ * @bic: the bic being migrated.
698 -+ * @cgroup: the destination cgroup.
699 -+ *
700 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
701 -+ * moved into its new parent group.
702 -+ */
703 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
704 -+ struct cgroup_subsys_state *css)
705 -+{
706 -+ struct bfq_data *bfqd;
707 -+ unsigned long uninitialized_var(flags);
708 -+
709 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
710 -+ &flags);
711 -+ if (bfqd != NULL) {
712 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
713 -+ bfq_put_bfqd_unlock(bfqd, &flags);
714 -+ }
715 -+}
716 -+
717 -+/**
718 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
719 -+ * @bic: the @bic to update.
720 -+ *
721 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
722 -+ * We need this in addition to moving bics during the cgroup attach
723 -+ * phase because the task owning @bic could be at its first disk
724 -+ * access or we may end up in the root cgroup as the result of a
725 -+ * memory allocation failure and here we try to move to the right
726 -+ * group.
727 -+ *
728 -+ * Must be called under the queue lock. It is safe to use the returned
729 -+ * value even after the rcu_read_unlock() as the migration/destruction
730 -+ * paths act under the queue lock too. IOW it is impossible to race with
731 -+ * group migration/destruction and end up with an invalid group as:
732 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
733 -+ * has started execution, as current holds a reference to it,
734 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
735 -+ * migrated to a different cgroup] its attach() callback will have
736 -+ * taken care of remove all the references to the old cgroup data.
737 -+ */
738 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
739 -+{
740 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
741 -+ struct bfq_group *bfqg;
742 -+ struct cgroup_subsys_state *css;
743 -+
744 -+ BUG_ON(bfqd == NULL);
745 -+
746 -+ rcu_read_lock();
747 -+ css = task_css(current, bfqio_subsys_id);
748 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
749 -+ rcu_read_unlock();
750 -+
751 -+ return bfqg;
752 -+}
753 -+
754 -+/**
755 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
756 -+ * @st: the service tree being flushed.
757 -+ */
758 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
759 -+{
760 -+ struct bfq_entity *entity = st->first_idle;
761 -+
762 -+ for (; entity != NULL; entity = st->first_idle)
763 -+ __bfq_deactivate_entity(entity, 0);
764 -+}
765 -+
766 -+/**
767 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
768 -+ * @bfqd: the device data structure with the root group.
769 -+ * @entity: the entity to move.
770 -+ */
771 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
772 -+ struct bfq_entity *entity)
773 -+{
774 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
775 -+
776 -+ BUG_ON(bfqq == NULL);
777 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
778 -+ return;
779 -+}
780 -+
781 -+/**
782 -+ * bfq_reparent_active_entities - move to the root group all active entities.
783 -+ * @bfqd: the device data structure with the root group.
784 -+ * @bfqg: the group to move from.
785 -+ * @st: the service tree with the entities.
786 -+ *
787 -+ * Needs queue_lock to be taken and reference to be valid over the call.
788 -+ */
789 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
790 -+ struct bfq_group *bfqg,
791 -+ struct bfq_service_tree *st)
792 -+{
793 -+ struct rb_root *active = &st->active;
794 -+ struct bfq_entity *entity = NULL;
795 -+
796 -+ if (!RB_EMPTY_ROOT(&st->active))
797 -+ entity = bfq_entity_of(rb_first(active));
798 -+
799 -+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
800 -+ bfq_reparent_leaf_entity(bfqd, entity);
801 -+
802 -+ if (bfqg->sched_data.active_entity != NULL)
803 -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity);
804 -+
805 -+ return;
806 -+}
807 -+
808 -+/**
809 -+ * bfq_destroy_group - destroy @bfqg.
810 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
811 -+ * @bfqg: the group being destroyed.
812 -+ *
813 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
814 -+ */
815 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
816 -+{
817 -+ struct bfq_data *bfqd;
818 -+ struct bfq_service_tree *st;
819 -+ struct bfq_entity *entity = bfqg->my_entity;
820 -+ unsigned long uninitialized_var(flags);
821 -+ int i;
822 -+
823 -+ hlist_del(&bfqg->group_node);
824 -+
825 -+ /*
826 -+ * Empty all service_trees belonging to this group before deactivating
827 -+ * the group itself.
828 -+ */
829 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
830 -+ st = bfqg->sched_data.service_tree + i;
831 -+
832 -+ /*
833 -+ * The idle tree may still contain bfq_queues belonging
834 -+ * to exited tasks because they never migrated to a different
835 -+ * cgroup from the one being destroyed now. No one else
836 -+ * can access them so it's safe to act without any lock.
837 -+ */
838 -+ bfq_flush_idle_tree(st);
839 -+
840 -+ /*
841 -+ * It may happen that some queues are still active
842 -+ * (busy) upon group destruction (if the corresponding
843 -+ * processes have been forced to terminate). We move
844 -+ * all the leaf entities corresponding to these queues
845 -+ * to the root_group.
846 -+ * Also, it may happen that the group has an entity
847 -+ * under service, which is disconnected from the active
848 -+ * tree: it must be moved, too.
849 -+ * There is no need to put the sync queues, as the
850 -+ * scheduler has taken no reference.
851 -+ */
852 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
853 -+ if (bfqd != NULL) {
854 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
855 -+ bfq_put_bfqd_unlock(bfqd, &flags);
856 -+ }
857 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
858 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
859 -+ }
860 -+ BUG_ON(bfqg->sched_data.next_active != NULL);
861 -+ BUG_ON(bfqg->sched_data.active_entity != NULL);
862 -+
863 -+ /*
864 -+ * We may race with device destruction, take extra care when
865 -+ * dereferencing bfqg->bfqd.
866 -+ */
867 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
868 -+ if (bfqd != NULL) {
869 -+ hlist_del(&bfqg->bfqd_node);
870 -+ __bfq_deactivate_entity(entity, 0);
871 -+ bfq_put_async_queues(bfqd, bfqg);
872 -+ bfq_put_bfqd_unlock(bfqd, &flags);
873 -+ }
874 -+ BUG_ON(entity->tree != NULL);
875 -+
876 -+ /*
877 -+ * No need to defer the kfree() to the end of the RCU grace
878 -+ * period: we are called from the destroy() callback of our
879 -+ * cgroup, so we can be sure that no one is a) still using
880 -+ * this cgroup or b) doing lookups in it.
881 -+ */
882 -+ kfree(bfqg);
883 -+}
884 -+
885 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
886 -+{
887 -+ struct hlist_node *tmp;
888 -+ struct bfq_group *bfqg;
889 -+
890 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
891 -+ bfq_end_raising_async_queues(bfqd, bfqg);
892 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
893 -+}
894 -+
895 -+/**
896 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
897 -+ * @bfqd: the device descriptor being exited.
898 -+ *
899 -+ * When the device exits we just make sure that no lookup can return
900 -+ * the now unused group structures. They will be deallocated on cgroup
901 -+ * destruction.
902 -+ */
903 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
904 -+{
905 -+ struct hlist_node *tmp;
906 -+ struct bfq_group *bfqg;
907 -+
908 -+ bfq_log(bfqd, "disconnect_groups beginning");
909 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
910 -+ hlist_del(&bfqg->bfqd_node);
911 -+
912 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
913 -+
914 -+ /*
915 -+ * Don't remove from the group hash, just set an
916 -+ * invalid key. No lookups can race with the
917 -+ * assignment as bfqd is being destroyed; this
918 -+ * implies also that new elements cannot be added
919 -+ * to the list.
920 -+ */
921 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
922 -+
923 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
924 -+ bfqg);
925 -+ bfq_put_async_queues(bfqd, bfqg);
926 -+ }
927 -+}
928 -+
929 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
930 -+{
931 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
932 -+ struct bfq_group *bfqg = bfqd->root_group;
933 -+
934 -+ bfq_put_async_queues(bfqd, bfqg);
935 -+
936 -+ spin_lock_irq(&bgrp->lock);
937 -+ hlist_del_rcu(&bfqg->group_node);
938 -+ spin_unlock_irq(&bgrp->lock);
939 -+
940 -+ /*
941 -+ * No need to synchronize_rcu() here: since the device is gone
942 -+ * there cannot be any read-side access to its root_group.
943 -+ */
944 -+ kfree(bfqg);
945 -+}
946 -+
947 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
948 -+{
949 -+ struct bfq_group *bfqg;
950 -+ struct bfqio_cgroup *bgrp;
951 -+ int i;
952 -+
953 -+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
954 -+ if (bfqg == NULL)
955 -+ return NULL;
956 -+
957 -+ bfqg->entity.parent = NULL;
958 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
959 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
960 -+
961 -+ bgrp = &bfqio_root_cgroup;
962 -+ spin_lock_irq(&bgrp->lock);
963 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
964 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
965 -+ spin_unlock_irq(&bgrp->lock);
966 -+
967 -+ return bfqg;
968 -+}
969 -+
970 -+#define SHOW_FUNCTION(__VAR) \
971 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
972 -+ struct cftype *cftype) \
973 -+{ \
974 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
975 -+ u64 ret = -ENODEV; \
976 -+ \
977 -+ mutex_lock(&bfqio_mutex); \
978 -+ if (bfqio_is_removed(bgrp)) \
979 -+ goto out_unlock; \
980 -+ \
981 -+ spin_lock_irq(&bgrp->lock); \
982 -+ ret = bgrp->__VAR; \
983 -+ spin_unlock_irq(&bgrp->lock); \
984 -+ \
985 -+out_unlock: \
986 -+ mutex_unlock(&bfqio_mutex); \
987 -+ return ret; \
988 -+}
989 -+
990 -+SHOW_FUNCTION(weight);
991 -+SHOW_FUNCTION(ioprio);
992 -+SHOW_FUNCTION(ioprio_class);
993 -+#undef SHOW_FUNCTION
994 -+
995 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
996 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
997 -+ struct cftype *cftype, \
998 -+ u64 val) \
999 -+{ \
1000 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
1001 -+ struct bfq_group *bfqg; \
1002 -+ int ret = -EINVAL; \
1003 -+ \
1004 -+ if (val < (__MIN) || val > (__MAX)) \
1005 -+ return ret; \
1006 -+ \
1007 -+ ret = -ENODEV; \
1008 -+ mutex_lock(&bfqio_mutex); \
1009 -+ if (bfqio_is_removed(bgrp)) \
1010 -+ goto out_unlock; \
1011 -+ ret = 0; \
1012 -+ \
1013 -+ spin_lock_irq(&bgrp->lock); \
1014 -+ bgrp->__VAR = (unsigned short)val; \
1015 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
1016 -+ /* \
1017 -+ * Setting the ioprio_changed flag of the entity \
1018 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
1019 -+ * the value of the weight to its ioprio mapping. \
1020 -+ * Set the flag only if necessary. \
1021 -+ */ \
1022 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
1023 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
1024 -+ smp_wmb(); \
1025 -+ bfqg->entity.ioprio_changed = 1; \
1026 -+ } \
1027 -+ } \
1028 -+ spin_unlock_irq(&bgrp->lock); \
1029 -+ \
1030 -+out_unlock: \
1031 -+ mutex_unlock(&bfqio_mutex); \
1032 -+ return ret; \
1033 -+}
1034 -+
1035 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
1036 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
1037 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
1038 -+#undef STORE_FUNCTION
1039 -+
1040 -+static struct cftype bfqio_files[] = {
1041 -+ {
1042 -+ .name = "weight",
1043 -+ .read_u64 = bfqio_cgroup_weight_read,
1044 -+ .write_u64 = bfqio_cgroup_weight_write,
1045 -+ },
1046 -+ {
1047 -+ .name = "ioprio",
1048 -+ .read_u64 = bfqio_cgroup_ioprio_read,
1049 -+ .write_u64 = bfqio_cgroup_ioprio_write,
1050 -+ },
1051 -+ {
1052 -+ .name = "ioprio_class",
1053 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
1054 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
1055 -+ },
1056 -+ { }, /* terminate */
1057 -+};
1058 -+
1059 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
1060 -+ *parent_css)
1061 -+{
1062 -+ struct bfqio_cgroup *bgrp;
1063 -+
1064 -+ if (parent_css != NULL) {
1065 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
1066 -+ if (bgrp == NULL)
1067 -+ return ERR_PTR(-ENOMEM);
1068 -+ } else
1069 -+ bgrp = &bfqio_root_cgroup;
1070 -+
1071 -+ spin_lock_init(&bgrp->lock);
1072 -+ INIT_HLIST_HEAD(&bgrp->group_data);
1073 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
1074 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
1075 -+
1076 -+ return &bgrp->css;
1077 -+}
1078 -+
1079 -+/*
1080 -+ * We cannot support shared io contexts, as we have no means to support
1081 -+ * two tasks with the same ioc in two different groups without major rework
1082 -+ * of the main bic/bfqq data structures. For now we allow a task to change
1083 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
1084 -+ * behavior is that a group containing a task that forked using CLONE_IO
1085 -+ * will not be destroyed until the tasks sharing the ioc die.
1086 -+ */
1087 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
1088 -+ struct cgroup_taskset *tset)
1089 -+{
1090 -+ struct task_struct *task;
1091 -+ struct io_context *ioc;
1092 -+ int ret = 0;
1093 -+
1094 -+ cgroup_taskset_for_each(task, css, tset) {
1095 -+ /*
1096 -+ * task_lock() is needed to avoid races with
1097 -+ * exit_io_context()
1098 -+ */
1099 -+ task_lock(task);
1100 -+ ioc = task->io_context;
1101 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
1102 -+ /*
1103 -+ * ioc == NULL means that the task is either too young
1104 -+ * or exiting: if it has still no ioc the ioc can't be
1105 -+ * shared, if the task is exiting the attach will fail
1106 -+ * anyway, no matter what we return here.
1107 -+ */
1108 -+ ret = -EINVAL;
1109 -+ task_unlock(task);
1110 -+ if (ret)
1111 -+ break;
1112 -+ }
1113 -+
1114 -+ return ret;
1115 -+}
1116 -+
1117 -+static void bfqio_attach(struct cgroup_subsys_state *css,
1118 -+ struct cgroup_taskset *tset)
1119 -+{
1120 -+ struct task_struct *task;
1121 -+ struct io_context *ioc;
1122 -+ struct io_cq *icq;
1123 -+
1124 -+ /*
1125 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
1126 -+ * new group has not yet been tested.
1127 -+ */
1128 -+ cgroup_taskset_for_each(task, css, tset) {
1129 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
1130 -+ if (ioc) {
1131 -+ /*
1132 -+ * Handle cgroup change here.
1133 -+ */
1134 -+ rcu_read_lock();
1135 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
1136 -+ if (!strncmp(
1137 -+ icq->q->elevator->type->elevator_name,
1138 -+ "bfq", ELV_NAME_MAX))
1139 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
1140 -+ css);
1141 -+ rcu_read_unlock();
1142 -+ put_io_context(ioc);
1143 -+ }
1144 -+ }
1145 -+}
1146 -+
1147 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
1148 -+{
1149 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1150 -+ struct hlist_node *tmp;
1151 -+ struct bfq_group *bfqg;
1152 -+
1153 -+ /*
1154 -+ * Since we are destroying the cgroup, there are no more tasks
1155 -+ * referencing it, and all the RCU grace periods that may have
1156 -+ * referenced it are ended (as the destruction of the parent
1157 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
1158 -+ * anything else and we don't need any synchronization.
1159 -+ */
1160 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
1161 -+ bfq_destroy_group(bgrp, bfqg);
1162 -+
1163 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
1164 -+
1165 -+ kfree(bgrp);
1166 -+}
1167 -+
1168 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
1169 -+{
1170 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1171 -+
1172 -+ mutex_lock(&bfqio_mutex);
1173 -+ bgrp->online = true;
1174 -+ mutex_unlock(&bfqio_mutex);
1175 -+
1176 -+ return 0;
1177 -+}
1178 -+
1179 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
1180 -+{
1181 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
1182 -+
1183 -+ mutex_lock(&bfqio_mutex);
1184 -+ bgrp->online = false;
1185 -+ mutex_unlock(&bfqio_mutex);
1186 -+}
1187 -+
1188 -+struct cgroup_subsys bfqio_subsys = {
1189 -+ .name = "bfqio",
1190 -+ .css_alloc = bfqio_create,
1191 -+ .css_online = bfqio_css_online,
1192 -+ .css_offline = bfqio_css_offline,
1193 -+ .can_attach = bfqio_can_attach,
1194 -+ .attach = bfqio_attach,
1195 -+ .css_free = bfqio_destroy,
1196 -+ .subsys_id = bfqio_subsys_id,
1197 -+ .base_cftypes = bfqio_files,
1198 -+};
1199 -+#else
1200 -+static inline void bfq_init_entity(struct bfq_entity *entity,
1201 -+ struct bfq_group *bfqg)
1202 -+{
1203 -+ entity->weight = entity->new_weight;
1204 -+ entity->orig_weight = entity->new_weight;
1205 -+ entity->ioprio = entity->new_ioprio;
1206 -+ entity->ioprio_class = entity->new_ioprio_class;
1207 -+ entity->sched_data = &bfqg->sched_data;
1208 -+}
1209 -+
1210 -+static inline struct bfq_group *
1211 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
1212 -+{
1213 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
1214 -+ return bfqd->root_group;
1215 -+}
1216 -+
1217 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
1218 -+ struct bfq_queue *bfqq,
1219 -+ struct bfq_entity *entity,
1220 -+ struct bfq_group *bfqg)
1221 -+{
1222 -+}
1223 -+
1224 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
1225 -+{
1226 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
1227 -+}
1228 -+
1229 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
1230 -+{
1231 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
1232 -+}
1233 -+
1234 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
1235 -+{
1236 -+ kfree(bfqd->root_group);
1237 -+}
1238 -+
1239 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
1240 -+{
1241 -+ struct bfq_group *bfqg;
1242 -+ int i;
1243 -+
1244 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
1245 -+ if (bfqg == NULL)
1246 -+ return NULL;
1247 -+
1248 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
1249 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
1250 -+
1251 -+ return bfqg;
1252 -+}
1253 -+#endif
1254 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
1255 -new file mode 100644
1256 -index 0000000..7f6b000
1257 ---- /dev/null
1258 -+++ b/block/bfq-ioc.c
1259 -@@ -0,0 +1,36 @@
1260 -+/*
1261 -+ * BFQ: I/O context handling.
1262 -+ *
1263 -+ * Based on ideas and code from CFQ:
1264 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1265 -+ *
1266 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1267 -+ * Paolo Valente <paolo.valente@×××××××.it>
1268 -+ *
1269 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1270 -+ */
1271 -+
1272 -+/**
1273 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
1274 -+ * @icq: the iocontext queue.
1275 -+ */
1276 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
1277 -+{
1278 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
1279 -+ return container_of(icq, struct bfq_io_cq, icq);
1280 -+}
1281 -+
1282 -+/**
1283 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
1284 -+ * @bfqd: the lookup key.
1285 -+ * @ioc: the io_context of the process doing I/O.
1286 -+ *
1287 -+ * Queue lock must be held.
1288 -+ */
1289 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
1290 -+ struct io_context *ioc)
1291 -+{
1292 -+ if (ioc)
1293 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
1294 -+ return NULL;
1295 -+}
1296 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
1297 -new file mode 100644
1298 -index 0000000..7670400
1299 ---- /dev/null
1300 -+++ b/block/bfq-iosched.c
1301 -@@ -0,0 +1,3268 @@
1302 -+/*
1303 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
1304 -+ *
1305 -+ * Based on ideas and code from CFQ:
1306 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
1307 -+ *
1308 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
1309 -+ * Paolo Valente <paolo.valente@×××××××.it>
1310 -+ *
1311 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
1312 -+ *
1313 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
1314 -+ *
1315 -+ * BFQ is a proportional share disk scheduling algorithm based on the
1316 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
1317 -+ * number of sectors, to tasks instead of time slices. The disk is not granted
1318 -+ * to the in-service task for a given time slice, but until it has exhausted
1319 -+ * its assigned budget. This change from the time to the service domain allows
1320 -+ * BFQ to distribute the disk bandwidth among tasks as desired, without any
1321 -+ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
1322 -+ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
1323 -+ * their budgets (more precisely BFQ schedules queues associated to tasks).
1324 -+ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
1325 -+ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
1326 -+ * latencies to interactive and soft real-time applications.
1327 -+ *
1328 -+ * BFQ is described in [1], where also a reference to the initial, more
1329 -+ * theoretical paper on BFQ can be found. The interested reader can find in
1330 -+ * the latter paper full details on the main algorithm as well as formulas of
1331 -+ * the guarantees, plus formal proofs of all the properties. With respect to
1332 -+ * the version of BFQ presented in these papers, this implementation adds a
1333 -+ * few more heuristics, such as the one that guarantees a low latency to soft
1334 -+ * real-time applications, and a hierarchical extension based on H-WF2Q+.
1335 -+ *
1336 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
1337 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
1338 -+ * complexity derives from the one introduced with EEVDF in [3].
1339 -+ *
1340 -+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
1341 -+ * with the BFQ Disk I/O Scheduler'',
1342 -+ * Proceedings of the 5th Annual International Systems and Storage
1343 -+ * Conference (SYSTOR '12), June 2012.
1344 -+ *
1345 -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
1346 -+ *
1347 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
1348 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
1349 -+ * Oct 1997.
1350 -+ *
1351 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
1352 -+ *
1353 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
1354 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
1355 -+ * Resource Allocation,'' technical report.
1356 -+ *
1357 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
1358 -+ */
1359 -+#include <linux/module.h>
1360 -+#include <linux/slab.h>
1361 -+#include <linux/blkdev.h>
1362 -+#include <linux/cgroup.h>
1363 -+#include <linux/elevator.h>
1364 -+#include <linux/jiffies.h>
1365 -+#include <linux/rbtree.h>
1366 -+#include <linux/ioprio.h>
1367 -+#include "bfq.h"
1368 -+#include "blk.h"
1369 -+
1370 -+/* Max number of dispatches in one round of service. */
1371 -+static const int bfq_quantum = 4;
1372 -+
1373 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
1374 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
1375 -+
1376 -+/* Maximum backwards seek, in KiB. */
1377 -+static const int bfq_back_max = 16 * 1024;
1378 -+
1379 -+/* Penalty of a backwards seek, in number of sectors. */
1380 -+static const int bfq_back_penalty = 2;
1381 -+
1382 -+/* Idling period duration, in jiffies. */
1383 -+static int bfq_slice_idle = HZ / 125;
1384 -+
1385 -+/* Default maximum budget values, in sectors and number of requests. */
1386 -+static const int bfq_default_max_budget = 16 * 1024;
1387 -+static const int bfq_max_budget_async_rq = 4;
1388 -+
1389 -+/*
1390 -+ * Async to sync throughput distribution is controlled as follows:
1391 -+ * when an async request is served, the entity is charged the number
1392 -+ * of sectors of the request, multiplied by the factor below
1393 -+ */
1394 -+static const int bfq_async_charge_factor = 10;
1395 -+
1396 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
1397 -+static const int bfq_timeout_sync = HZ / 8;
1398 -+static int bfq_timeout_async = HZ / 25;
1399 -+
1400 -+struct kmem_cache *bfq_pool;
1401 -+
1402 -+/* Below this threshold (in ms), we consider thinktime immediate. */
1403 -+#define BFQ_MIN_TT 2
1404 -+
1405 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
1406 -+#define BFQ_HW_QUEUE_THRESHOLD 4
1407 -+#define BFQ_HW_QUEUE_SAMPLES 32
1408 -+
1409 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
1410 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
1411 -+
1412 -+/* Min samples used for peak rate estimation (for autotuning). */
1413 -+#define BFQ_PEAK_RATE_SAMPLES 32
1414 -+
1415 -+/* Shift used for peak rate fixed precision calculations. */
1416 -+#define BFQ_RATE_SHIFT 16
1417 -+
1418 -+/*
1419 -+ * The duration of the weight raising for interactive applications is
1420 -+ * computed automatically (as default behaviour), using the following
1421 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
1422 -+ * disk, and R and T are two reference parameters. In particular, R is
1423 -+ * the peak rate of a reference disk, and T is about the maximum time
1424 -+ * for starting popular large applications on that disk, under BFQ and
1425 -+ * while reading two files in parallel. Finally, BFQ uses two
1426 -+ * different pairs (R, T) depending on whether the disk is rotational
1427 -+ * or non-rotational.
1428 -+ */
1429 -+#define T_rot (msecs_to_jiffies(5500))
1430 -+#define T_nonrot (msecs_to_jiffies(2000))
1431 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
1432 -+#define R_rot 17415
1433 -+#define R_nonrot 34791
1434 -+
1435 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
1436 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
1437 -+
1438 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
1439 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
1440 -+
1441 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
1442 -+
1443 -+#include "bfq-ioc.c"
1444 -+#include "bfq-sched.c"
1445 -+#include "bfq-cgroup.c"
1446 -+
1447 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
1448 -+ IOPRIO_CLASS_IDLE)
1449 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
1450 -+ IOPRIO_CLASS_RT)
1451 -+
1452 -+#define bfq_sample_valid(samples) ((samples) > 80)
1453 -+
1454 -+/*
1455 -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
1456 -+ * set (in which case it could also be a direct WRITE).
1457 -+ */
1458 -+static inline int bfq_bio_sync(struct bio *bio)
1459 -+{
1460 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
1461 -+ return 1;
1462 -+
1463 -+ return 0;
1464 -+}
1465 -+
1466 -+/*
1467 -+ * Scheduler run of queue, if there are requests pending and no one in the
1468 -+ * driver that will restart queueing.
1469 -+ */
1470 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
1471 -+{
1472 -+ if (bfqd->queued != 0) {
1473 -+ bfq_log(bfqd, "schedule dispatch");
1474 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
1475 -+ }
1476 -+}
1477 -+
1478 -+/*
1479 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
1480 -+ * We choose the request that is closest to the head right now. Distance
1481 -+ * behind the head is penalized and only allowed to a certain extent.
1482 -+ */
1483 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
1484 -+ struct request *rq1,
1485 -+ struct request *rq2,
1486 -+ sector_t last)
1487 -+{
1488 -+ sector_t s1, s2, d1 = 0, d2 = 0;
1489 -+ unsigned long back_max;
1490 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
1491 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
1492 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
1493 -+
1494 -+ if (rq1 == NULL || rq1 == rq2)
1495 -+ return rq2;
1496 -+ if (rq2 == NULL)
1497 -+ return rq1;
1498 -+
1499 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
1500 -+ return rq1;
1501 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
1502 -+ return rq2;
1503 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
1504 -+ return rq1;
1505 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
1506 -+ return rq2;
1507 -+
1508 -+ s1 = blk_rq_pos(rq1);
1509 -+ s2 = blk_rq_pos(rq2);
1510 -+
1511 -+ /*
1512 -+ * By definition, 1KiB is 2 sectors.
1513 -+ */
1514 -+ back_max = bfqd->bfq_back_max * 2;
1515 -+
1516 -+ /*
1517 -+ * Strict one way elevator _except_ in the case where we allow
1518 -+ * short backward seeks which are biased as twice the cost of a
1519 -+ * similar forward seek.
1520 -+ */
1521 -+ if (s1 >= last)
1522 -+ d1 = s1 - last;
1523 -+ else if (s1 + back_max >= last)
1524 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
1525 -+ else
1526 -+ wrap |= BFQ_RQ1_WRAP;
1527 -+
1528 -+ if (s2 >= last)
1529 -+ d2 = s2 - last;
1530 -+ else if (s2 + back_max >= last)
1531 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
1532 -+ else
1533 -+ wrap |= BFQ_RQ2_WRAP;
1534 -+
1535 -+ /* Found required data */
1536 -+
1537 -+ /*
1538 -+ * By doing switch() on the bit mask "wrap" we avoid having to
1539 -+ * check two variables for all permutations: --> faster!
1540 -+ */
1541 -+ switch (wrap) {
1542 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
1543 -+ if (d1 < d2)
1544 -+ return rq1;
1545 -+ else if (d2 < d1)
1546 -+ return rq2;
1547 -+ else {
1548 -+ if (s1 >= s2)
1549 -+ return rq1;
1550 -+ else
1551 -+ return rq2;
1552 -+ }
1553 -+
1554 -+ case BFQ_RQ2_WRAP:
1555 -+ return rq1;
1556 -+ case BFQ_RQ1_WRAP:
1557 -+ return rq2;
1558 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
1559 -+ default:
1560 -+ /*
1561 -+ * Since both rqs are wrapped,
1562 -+ * start with the one that's further behind head
1563 -+ * (--> only *one* back seek required),
1564 -+ * since back seek takes more time than forward.
1565 -+ */
1566 -+ if (s1 <= s2)
1567 -+ return rq1;
1568 -+ else
1569 -+ return rq2;
1570 -+ }
1571 -+}
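
The seek-distance rule implemented by bfq_choose_req() above can be reproduced as a small standalone C program; the head position, sector numbers, back_max and back_penalty below are invented example values, not values taken from the patch:

/*
 * Minimal userspace sketch of the distance logic used by bfq_choose_req():
 * forward distances count as-is, short backward seeks (within back_max)
 * are charged back_penalty times their length, and anything farther
 * behind the head is treated as a wrap.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t effective_dist(sector_t s, sector_t last,
                               sector_t back_max,
                               unsigned long back_penalty,
                               int *wraps)
{
        *wraps = 0;
        if (s >= last)                     /* forward seek */
                return s - last;
        if (s + back_max >= last)          /* short backward seek */
                return (last - s) * back_penalty;
        *wraps = 1;                        /* too far behind the head */
        return 0;
}

int main(void)
{
        sector_t last = 10000;             /* current head position */
        sector_t back_max = 16 * 1024 * 2; /* 16 MiB expressed in sectors */
        unsigned long back_penalty = 2;    /* example penalty factor */
        int w1, w2;

        sector_t d1 = effective_dist(10100, last, back_max, back_penalty, &w1);
        sector_t d2 = effective_dist(9990, last, back_max, back_penalty, &w2);

        printf("rq1: dist %llu wrap %d\n", d1, w1);
        printf("rq2: dist %llu wrap %d\n", d2, w2);
        /* rq2 is only 10 sectors behind but is charged 20, still less than
         * rq1's 100, so the backward request would be chosen here. */
        return 0;
}
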
1572 -+
1573 -+static struct bfq_queue *
1574 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
1575 -+ sector_t sector, struct rb_node **ret_parent,
1576 -+ struct rb_node ***rb_link)
1577 -+{
1578 -+ struct rb_node **p, *parent;
1579 -+ struct bfq_queue *bfqq = NULL;
1580 -+
1581 -+ parent = NULL;
1582 -+ p = &root->rb_node;
1583 -+ while (*p) {
1584 -+ struct rb_node **n;
1585 -+
1586 -+ parent = *p;
1587 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
1588 -+
1589 -+ /*
1590 -+ * Sort strictly based on sector. Smallest to the left,
1591 -+ * largest to the right.
1592 -+ */
1593 -+ if (sector > blk_rq_pos(bfqq->next_rq))
1594 -+ n = &(*p)->rb_right;
1595 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
1596 -+ n = &(*p)->rb_left;
1597 -+ else
1598 -+ break;
1599 -+ p = n;
1600 -+ bfqq = NULL;
1601 -+ }
1602 -+
1603 -+ *ret_parent = parent;
1604 -+ if (rb_link)
1605 -+ *rb_link = p;
1606 -+
1607 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
1608 -+ (long long unsigned)sector,
1609 -+ bfqq != NULL ? bfqq->pid : 0);
1610 -+
1611 -+ return bfqq;
1612 -+}
1613 -+
1614 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
1615 -+{
1616 -+ struct rb_node **p, *parent;
1617 -+ struct bfq_queue *__bfqq;
1618 -+
1619 -+ if (bfqq->pos_root != NULL) {
1620 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1621 -+ bfqq->pos_root = NULL;
1622 -+ }
1623 -+
1624 -+ if (bfq_class_idle(bfqq))
1625 -+ return;
1626 -+ if (!bfqq->next_rq)
1627 -+ return;
1628 -+
1629 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
1630 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
1631 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
1632 -+ if (__bfqq == NULL) {
1633 -+ rb_link_node(&bfqq->pos_node, parent, p);
1634 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
1635 -+ } else
1636 -+ bfqq->pos_root = NULL;
1637 -+}
1638 -+
1639 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
1640 -+ struct bfq_queue *bfqq,
1641 -+ struct request *last)
1642 -+{
1643 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
1644 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
1645 -+ struct request *next = NULL, *prev = NULL;
1646 -+
1647 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
1648 -+
1649 -+ if (rbprev != NULL)
1650 -+ prev = rb_entry_rq(rbprev);
1651 -+
1652 -+ if (rbnext != NULL)
1653 -+ next = rb_entry_rq(rbnext);
1654 -+ else {
1655 -+ rbnext = rb_first(&bfqq->sort_list);
1656 -+ if (rbnext && rbnext != &last->rb_node)
1657 -+ next = rb_entry_rq(rbnext);
1658 -+ }
1659 -+
1660 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
1661 -+}
1662 -+
1663 -+static void bfq_del_rq_rb(struct request *rq)
1664 -+{
1665 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1666 -+ struct bfq_data *bfqd = bfqq->bfqd;
1667 -+ const int sync = rq_is_sync(rq);
1668 -+
1669 -+ BUG_ON(bfqq->queued[sync] == 0);
1670 -+ bfqq->queued[sync]--;
1671 -+ bfqd->queued--;
1672 -+
1673 -+ elv_rb_del(&bfqq->sort_list, rq);
1674 -+
1675 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
1676 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
1677 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
1678 -+ /*
1679 -+ * Remove queue from request-position tree as it is empty.
1680 -+ */
1681 -+ if (bfqq->pos_root != NULL) {
1682 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
1683 -+ bfqq->pos_root = NULL;
1684 -+ }
1685 -+ }
1686 -+}
1687 -+
1688 -+/* see the definition of bfq_async_charge_factor for details */
1689 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
1690 -+ struct bfq_queue *bfqq)
1691 -+{
1692 -+ return blk_rq_sectors(rq) *
1693 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
1694 -+ bfq_async_charge_factor));
1695 -+}
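
As a rough standalone illustration of bfq_serv_to_charge() above: sync requests (and requests of weight-raised queues) are charged their actual size in sectors, while async requests of non-raised queues are charged (1 + charge_factor) times as much. The charge factor used below is only an example value:

/* Userspace sketch of the charging rule; charge_factor is illustrative. */
#include <stdio.h>

static unsigned long serv_to_charge(unsigned long sectors, int sync,
                                    unsigned long raising_coeff,
                                    unsigned long charge_factor)
{
        return sectors * (1 + ((!sync) * (raising_coeff == 1) * charge_factor));
}

int main(void)
{
        printf("sync rq,  8 sectors        -> charge %lu\n",
               serv_to_charge(8, 1, 1, 10));
        printf("async rq, 8 sectors        -> charge %lu\n",
               serv_to_charge(8, 0, 1, 10));
        printf("async rq, raised queue     -> charge %lu\n",
               serv_to_charge(8, 0, 2, 10));
        return 0;
}
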
1696 -+
1697 -+/**
1698 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
1699 -+ * @bfqd: the device data the queue belongs to.
1700 -+ * @bfqq: the queue to update.
1701 -+ *
1702 -+ * If the first request of a queue changes we make sure that the queue
1703 -+ * has enough budget to serve at least its first request (if the
1704 -+ * request has grown). We do this because, if the queue does not have enough
1705 -+ * budget for its first request, it has to go through two dispatch
1706 -+ * rounds to actually get it dispatched.
1707 -+ */
1708 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
1709 -+ struct bfq_queue *bfqq)
1710 -+{
1711 -+ struct bfq_entity *entity = &bfqq->entity;
1712 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
1713 -+ struct request *next_rq = bfqq->next_rq;
1714 -+ unsigned long new_budget;
1715 -+
1716 -+ if (next_rq == NULL)
1717 -+ return;
1718 -+
1719 -+ if (bfqq == bfqd->in_service_queue)
1720 -+ /*
1721 -+ * In order not to break guarantees, budgets cannot be
1722 -+ * changed after an entity has been selected.
1723 -+ */
1724 -+ return;
1725 -+
1726 -+ BUG_ON(entity->tree != &st->active);
1727 -+ BUG_ON(entity == entity->sched_data->active_entity);
1728 -+
1729 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
1730 -+ bfq_serv_to_charge(next_rq, bfqq));
1731 -+ entity->budget = new_budget;
1732 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
1733 -+ bfq_activate_bfqq(bfqd, bfqq);
1734 -+}
1735 -+
1736 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
1737 -+{
1738 -+ u64 dur;
1739 -+
1740 -+ if (bfqd->bfq_raising_max_time > 0)
1741 -+ return bfqd->bfq_raising_max_time;
1742 -+
1743 -+ dur = bfqd->RT_prod;
1744 -+ do_div(dur, bfqd->peak_rate);
1745 -+
1746 -+ return dur;
1747 -+}
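
The weight-raising duration above is simply RT_prod divided by the estimated peak rate, so slower devices keep queues boosted for longer. A plain userspace rendition with invented numbers (the units and constants here are illustrative only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t rt_prod   = 5000ULL * 1000; /* example reference product */
        uint64_t fast_rate = 1000;           /* example peak rates        */
        uint64_t slow_rate = 250;

        /* dur = RT_prod / peak_rate, as in bfq_wrais_duration() */
        printf("fast device: duration %llu\n",
               (unsigned long long)(rt_prod / fast_rate));
        printf("slow device: duration %llu\n",
               (unsigned long long)(rt_prod / slow_rate));
        return 0;
}
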
1748 -+
1749 -+static void bfq_add_rq_rb(struct request *rq)
1750 -+{
1751 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1752 -+ struct bfq_entity *entity = &bfqq->entity;
1753 -+ struct bfq_data *bfqd = bfqq->bfqd;
1754 -+ struct request *next_rq, *prev;
1755 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
1756 -+ int idle_for_long_time = 0;
1757 -+
1758 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
1759 -+ bfqq->queued[rq_is_sync(rq)]++;
1760 -+ bfqd->queued++;
1761 -+
1762 -+ elv_rb_add(&bfqq->sort_list, rq);
1763 -+
1764 -+ /*
1765 -+ * Check if this request is a better next-serve candidate.
1766 -+ */
1767 -+ prev = bfqq->next_rq;
1768 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
1769 -+ BUG_ON(next_rq == NULL);
1770 -+ bfqq->next_rq = next_rq;
1771 -+
1772 -+ /*
1773 -+ * Adjust priority tree position, if next_rq changes.
1774 -+ */
1775 -+ if (prev != bfqq->next_rq)
1776 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
1777 -+
1778 -+ if (!bfq_bfqq_busy(bfqq)) {
1779 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
1780 -+ time_is_before_jiffies(bfqq->soft_rt_next_start);
1781 -+ idle_for_long_time = time_is_before_jiffies(
1782 -+ bfqq->budget_timeout +
1783 -+ bfqd->bfq_raising_min_idle_time);
1784 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
1785 -+ bfq_serv_to_charge(next_rq, bfqq));
1786 -+
1787 -+ if (!bfqd->low_latency)
1788 -+ goto add_bfqq_busy;
1789 -+
1790 -+ /*
1791 -+ * If the queue is not being boosted and has been idle
1792 -+ * for enough time, start a weight-raising period
1793 -+ */
1794 -+ if (old_raising_coeff == 1 &&
1795 -+ (idle_for_long_time || soft_rt)) {
1796 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1797 -+ if (idle_for_long_time)
1798 -+ bfqq->raising_cur_max_time =
1799 -+ bfq_wrais_duration(bfqd);
1800 -+ else
1801 -+ bfqq->raising_cur_max_time =
1802 -+ bfqd->bfq_raising_rt_max_time;
1803 -+ bfq_log_bfqq(bfqd, bfqq,
1804 -+ "wrais starting at %llu msec,"
1805 -+ "rais_max_time %u",
1806 -+ bfqq->last_rais_start_finish,
1807 -+ jiffies_to_msecs(bfqq->
1808 -+ raising_cur_max_time));
1809 -+ } else if (old_raising_coeff > 1) {
1810 -+ if (idle_for_long_time)
1811 -+ bfqq->raising_cur_max_time =
1812 -+ bfq_wrais_duration(bfqd);
1813 -+ else if (bfqq->raising_cur_max_time ==
1814 -+ bfqd->bfq_raising_rt_max_time &&
1815 -+ !soft_rt) {
1816 -+ bfqq->raising_coeff = 1;
1817 -+ bfq_log_bfqq(bfqd, bfqq,
1818 -+ "wrais ending at %llu msec,"
1819 -+ "rais_max_time %u",
1820 -+ bfqq->last_rais_start_finish,
1821 -+ jiffies_to_msecs(bfqq->
1822 -+ raising_cur_max_time));
1823 -+ } else if ((bfqq->last_rais_start_finish +
1824 -+ bfqq->raising_cur_max_time <
1825 -+ jiffies + bfqd->bfq_raising_rt_max_time) &&
1826 -+ soft_rt) {
1827 -+ /*
1828 -+ *
1829 -+ * The remaining weight-raising time is lower
1830 -+ * than bfqd->bfq_raising_rt_max_time, which
1831 -+ * means that the application is enjoying
1832 -+ * weight raising either because deemed soft rt
1833 -+ * in the near past, or because deemed
1834 -+ * interactive long ago. In both cases,
1835 -+ * resetting now the current remaining weight-
1836 -+ * raising time for the application to the
1837 -+ * weight-raising duration for soft rt
1838 -+ * applications would not cause any latency
1839 -+ * increase for the application (as the new
1840 -+ * duration would be higher than the remaining
1841 -+ * time).
1842 -+ *
1843 -+ * In addition, the application is now meeting
1844 -+ * the requirements for being deemed soft rt.
1845 -+ * In the end we can correctly and safely
1846 -+ * (re)charge the weight-raising duration for
1847 -+ * the application with the weight-raising
1848 -+ * duration for soft rt applications.
1849 -+ *
1850 -+ * In particular, doing this recharge now, i.e.,
1851 -+ * before the weight-raising period for the
1852 -+ * application finishes, reduces the probability
1853 -+ * of the following negative scenario:
1854 -+ * 1) the weight of a soft rt application is
1855 -+ * raised at startup (as for any newly
1856 -+ * created application),
1857 -+ * 2) since the application is not interactive,
1858 -+ * at a certain time weight-raising is
1859 -+ * stopped for the application,
1860 -+ * 3) at that time the application happens to
1861 -+ * still have pending requests, and hence
1862 -+ * is destined to not have a chance to be
1863 -+ * deemed soft rt before these requests are
1864 -+ * completed (see the comments to the
1865 -+ * function bfq_bfqq_softrt_next_start()
1866 -+ * for details on soft rt detection),
1867 -+ * 4) these pending requests experience a high
1868 -+ * latency because the application is not
1869 -+ * weight-raised while they are pending.
1870 -+ */
1871 -+ bfqq->last_rais_start_finish = jiffies;
1872 -+ bfqq->raising_cur_max_time =
1873 -+ bfqd->bfq_raising_rt_max_time;
1874 -+ }
1875 -+ }
1876 -+ if (old_raising_coeff != bfqq->raising_coeff)
1877 -+ entity->ioprio_changed = 1;
1878 -+add_bfqq_busy:
1879 -+ bfqq->last_idle_bklogged = jiffies;
1880 -+ bfqq->service_from_backlogged = 0;
1881 -+ bfq_clear_bfqq_softrt_update(bfqq);
1882 -+ bfq_add_bfqq_busy(bfqd, bfqq);
1883 -+ } else {
1884 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
1885 -+ !rq_is_sync(rq) &&
1886 -+ bfqq->last_rais_start_finish +
1887 -+ time_is_before_jiffies(
1888 -+ bfqd->bfq_raising_min_inter_arr_async)) {
1889 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
1890 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
1891 -+
1892 -+ bfqd->raised_busy_queues++;
1893 -+ entity->ioprio_changed = 1;
1894 -+ bfq_log_bfqq(bfqd, bfqq,
1895 -+ "non-idle wrais starting at %llu msec,"
1896 -+ "rais_max_time %u",
1897 -+ bfqq->last_rais_start_finish,
1898 -+ jiffies_to_msecs(bfqq->
1899 -+ raising_cur_max_time));
1900 -+ }
1901 -+ bfq_updated_next_req(bfqd, bfqq);
1902 -+ }
1903 -+
1904 -+ if (bfqd->low_latency &&
1905 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
1906 -+ idle_for_long_time))
1907 -+ bfqq->last_rais_start_finish = jiffies;
1908 -+}
1909 -+
1910 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
1911 -+{
1912 -+ elv_rb_del(&bfqq->sort_list, rq);
1913 -+ bfqq->queued[rq_is_sync(rq)]--;
1914 -+ bfqq->bfqd->queued--;
1915 -+ bfq_add_rq_rb(rq);
1916 -+}
1917 -+
1918 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
1919 -+ struct bio *bio)
1920 -+{
1921 -+ struct task_struct *tsk = current;
1922 -+ struct bfq_io_cq *bic;
1923 -+ struct bfq_queue *bfqq;
1924 -+
1925 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
1926 -+ if (bic == NULL)
1927 -+ return NULL;
1928 -+
1929 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
1930 -+ if (bfqq != NULL)
1931 -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
1932 -+
1933 -+ return NULL;
1934 -+}
1935 -+
1936 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
1937 -+{
1938 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1939 -+
1940 -+ bfqd->rq_in_driver++;
1941 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1942 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
1943 -+ (long long unsigned)bfqd->last_position);
1944 -+}
1945 -+
1946 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
1947 -+{
1948 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1949 -+
1950 -+ WARN_ON(bfqd->rq_in_driver == 0);
1951 -+ bfqd->rq_in_driver--;
1952 -+}
1953 -+
1954 -+static void bfq_remove_request(struct request *rq)
1955 -+{
1956 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
1957 -+ struct bfq_data *bfqd = bfqq->bfqd;
1958 -+
1959 -+ if (bfqq->next_rq == rq) {
1960 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
1961 -+ bfq_updated_next_req(bfqd, bfqq);
1962 -+ }
1963 -+
1964 -+ list_del_init(&rq->queuelist);
1965 -+ bfq_del_rq_rb(rq);
1966 -+
1967 -+ if (rq->cmd_flags & REQ_META) {
1968 -+ WARN_ON(bfqq->meta_pending == 0);
1969 -+ bfqq->meta_pending--;
1970 -+ }
1971 -+}
1972 -+
1973 -+static int bfq_merge(struct request_queue *q, struct request **req,
1974 -+ struct bio *bio)
1975 -+{
1976 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
1977 -+ struct request *__rq;
1978 -+
1979 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
1980 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
1981 -+ *req = __rq;
1982 -+ return ELEVATOR_FRONT_MERGE;
1983 -+ }
1984 -+
1985 -+ return ELEVATOR_NO_MERGE;
1986 -+}
1987 -+
1988 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
1989 -+ int type)
1990 -+{
1991 -+ if (type == ELEVATOR_FRONT_MERGE) {
1992 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
1993 -+
1994 -+ bfq_reposition_rq_rb(bfqq, req);
1995 -+ }
1996 -+}
1997 -+
1998 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
1999 -+ struct request *next)
2000 -+{
2001 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2002 -+
2003 -+ /*
2004 -+ * Reposition in fifo if next is older than rq.
2005 -+ */
2006 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
2007 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
2008 -+ list_move(&rq->queuelist, &next->queuelist);
2009 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
2010 -+ }
2011 -+
2012 -+ if (bfqq->next_rq == next)
2013 -+ bfqq->next_rq = rq;
2014 -+
2015 -+ bfq_remove_request(next);
2016 -+}
2017 -+
2018 -+/* Must be called with bfqq != NULL */
2019 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
2020 -+{
2021 -+ BUG_ON(bfqq == NULL);
2022 -+ if (bfq_bfqq_busy(bfqq))
2023 -+ bfqq->bfqd->raised_busy_queues--;
2024 -+ bfqq->raising_coeff = 1;
2025 -+ bfqq->raising_cur_max_time = 0;
2026 -+ /* Trigger a weight change on the next activation of the queue */
2027 -+ bfqq->entity.ioprio_changed = 1;
2028 -+}
2029 -+
2030 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
2031 -+ struct bfq_group *bfqg)
2032 -+{
2033 -+ int i, j;
2034 -+
2035 -+ for (i = 0; i < 2; i++)
2036 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
2037 -+ if (bfqg->async_bfqq[i][j] != NULL)
2038 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
2039 -+ if (bfqg->async_idle_bfqq != NULL)
2040 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
2041 -+}
2042 -+
2043 -+static void bfq_end_raising(struct bfq_data *bfqd)
2044 -+{
2045 -+ struct bfq_queue *bfqq;
2046 -+
2047 -+ spin_lock_irq(bfqd->queue->queue_lock);
2048 -+
2049 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
2050 -+ bfq_bfqq_end_raising(bfqq);
2051 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
2052 -+ bfq_bfqq_end_raising(bfqq);
2053 -+ bfq_end_raising_async(bfqd);
2054 -+
2055 -+ spin_unlock_irq(bfqd->queue->queue_lock);
2056 -+}
2057 -+
2058 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
2059 -+ struct bio *bio)
2060 -+{
2061 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2062 -+ struct bfq_io_cq *bic;
2063 -+ struct bfq_queue *bfqq;
2064 -+
2065 -+ /*
2066 -+ * Disallow merge of a sync bio into an async request.
2067 -+ */
2068 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
2069 -+ return 0;
2070 -+
2071 -+ /*
2072 -+ * Lookup the bfqq that this bio will be queued with. Allow
2073 -+ * merge only if rq is queued there.
2074 -+ * Queue lock is held here.
2075 -+ */
2076 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
2077 -+ if (bic == NULL)
2078 -+ return 0;
2079 -+
2080 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
2081 -+ return bfqq == RQ_BFQQ(rq);
2082 -+}
2083 -+
2084 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
2085 -+ struct bfq_queue *bfqq)
2086 -+{
2087 -+ if (bfqq != NULL) {
2088 -+ bfq_mark_bfqq_must_alloc(bfqq);
2089 -+ bfq_mark_bfqq_budget_new(bfqq);
2090 -+ bfq_clear_bfqq_fifo_expire(bfqq);
2091 -+
2092 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
2093 -+
2094 -+ bfq_log_bfqq(bfqd, bfqq,
2095 -+ "set_in_service_queue, cur-budget = %lu",
2096 -+ bfqq->entity.budget);
2097 -+ }
2098 -+
2099 -+ bfqd->in_service_queue = bfqq;
2100 -+}
2101 -+
2102 -+/*
2103 -+ * Get and set a new queue for service.
2104 -+ */
2105 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
2106 -+ struct bfq_queue *bfqq)
2107 -+{
2108 -+ if (!bfqq)
2109 -+ bfqq = bfq_get_next_queue(bfqd);
2110 -+ else
2111 -+ bfq_get_next_queue_forced(bfqd, bfqq);
2112 -+
2113 -+ __bfq_set_in_service_queue(bfqd, bfqq);
2114 -+ return bfqq;
2115 -+}
2116 -+
2117 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
2118 -+ struct request *rq)
2119 -+{
2120 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
2121 -+ return blk_rq_pos(rq) - bfqd->last_position;
2122 -+ else
2123 -+ return bfqd->last_position - blk_rq_pos(rq);
2124 -+}
2125 -+
2126 -+/*
2127 -+ * Return true if bfqq has no request pending and rq is close enough to
2128 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
2129 -+ * bfqq->next_rq
2130 -+ */
2131 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
2132 -+{
2133 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
2134 -+}
2135 -+
2136 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
2137 -+{
2138 -+ struct rb_root *root = &bfqd->rq_pos_tree;
2139 -+ struct rb_node *parent, *node;
2140 -+ struct bfq_queue *__bfqq;
2141 -+ sector_t sector = bfqd->last_position;
2142 -+
2143 -+ if (RB_EMPTY_ROOT(root))
2144 -+ return NULL;
2145 -+
2146 -+ /*
2147 -+ * First, if we find a request starting at the end of the last
2148 -+ * request, choose it.
2149 -+ */
2150 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
2151 -+ if (__bfqq != NULL)
2152 -+ return __bfqq;
2153 -+
2154 -+ /*
2155 -+ * If the exact sector wasn't found, the parent of the NULL leaf
2156 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
2157 -+ * position).
2158 -+ */
2159 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
2160 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2161 -+ return __bfqq;
2162 -+
2163 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
2164 -+ node = rb_next(&__bfqq->pos_node);
2165 -+ else
2166 -+ node = rb_prev(&__bfqq->pos_node);
2167 -+ if (node == NULL)
2168 -+ return NULL;
2169 -+
2170 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
2171 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
2172 -+ return __bfqq;
2173 -+
2174 -+ return NULL;
2175 -+}
2176 -+
2177 -+/*
2178 -+ * bfqd - obvious
2179 -+ * cur_bfqq - passed in so that we don't decide that the current queue
2180 -+ * is closely cooperating with itself.
2181 -+ *
2182 -+ * We are assuming that cur_bfqq has dispatched at least one request,
2183 -+ * and that bfqd->last_position reflects a position on the disk associated
2184 -+ * with the I/O issued by cur_bfqq.
2185 -+ */
2186 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
2187 -+ struct bfq_queue *cur_bfqq)
2188 -+{
2189 -+ struct bfq_queue *bfqq;
2190 -+
2191 -+ if (bfq_class_idle(cur_bfqq))
2192 -+ return NULL;
2193 -+ if (!bfq_bfqq_sync(cur_bfqq))
2194 -+ return NULL;
2195 -+ if (BFQQ_SEEKY(cur_bfqq))
2196 -+ return NULL;
2197 -+
2198 -+ /* If device has only one backlogged bfq_queue, don't search. */
2199 -+ if (bfqd->busy_queues == 1)
2200 -+ return NULL;
2201 -+
2202 -+ /*
2203 -+ * We should notice if some of the queues are cooperating, e.g.
2204 -+ * working closely on the same area of the disk. In that case,
2205 -+ * we can group them together and not waste time idling.
2206 -+ */
2207 -+ bfqq = bfqq_close(bfqd);
2208 -+ if (bfqq == NULL || bfqq == cur_bfqq)
2209 -+ return NULL;
2210 -+
2211 -+ /*
2212 -+ * Do not merge queues from different bfq_groups.
2213 -+ */
2214 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
2215 -+ return NULL;
2216 -+
2217 -+ /*
2218 -+ * It only makes sense to merge sync queues.
2219 -+ */
2220 -+ if (!bfq_bfqq_sync(bfqq))
2221 -+ return NULL;
2222 -+ if (BFQQ_SEEKY(bfqq))
2223 -+ return NULL;
2224 -+
2225 -+ /*
2226 -+ * Do not merge queues of different priority classes.
2227 -+ */
2228 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
2229 -+ return NULL;
2230 -+
2231 -+ return bfqq;
2232 -+}
2233 -+
2234 -+/*
2235 -+ * If enough samples have been computed, return the current max budget
2236 -+ * stored in bfqd, which is dynamically updated according to the
2237 -+ * estimated disk peak rate; otherwise return the default max budget
2238 -+ */
2239 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
2240 -+{
2241 -+ if (bfqd->budgets_assigned < 194)
2242 -+ return bfq_default_max_budget;
2243 -+ else
2244 -+ return bfqd->bfq_max_budget;
2245 -+}
2246 -+
2247 -+/*
2248 -+ * Return min budget, which is a fraction of the current or default
2249 -+ * max budget (trying with 1/32)
2250 -+ */
2251 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
2252 -+{
2253 -+ if (bfqd->budgets_assigned < 194)
2254 -+ return bfq_default_max_budget / 32;
2255 -+ else
2256 -+ return bfqd->bfq_max_budget / 32;
2257 -+}
2258 -+
2259 -+/*
2260 -+ * Decides whether idling should be done for given device and
2261 -+ * given in-service queue.
2262 -+ */
2263 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
2264 -+ struct bfq_queue *in_service_bfqq)
2265 -+{
2266 -+ if (in_service_bfqq == NULL)
2267 -+ return false;
2268 -+ /*
2269 -+ * If the device is an SSD it has no seek penalty; disable idling, but
2270 -+ * do so only if:
2271 -+ * - the device supports queuing, otherwise we would still have
2272 -+ * a problem with sync vs async workloads;
2273 -+ * - the queue is not weight-raised, to preserve guarantees.
2274 -+ */
2275 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
2276 -+ in_service_bfqq->raising_coeff == 1);
2277 -+}
2278 -+
2279 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
2280 -+{
2281 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2282 -+ struct bfq_io_cq *bic;
2283 -+ unsigned long sl;
2284 -+
2285 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
2286 -+
2287 -+ /* Tasks have exited, don't wait. */
2288 -+ bic = bfqd->in_service_bic;
2289 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
2290 -+ return;
2291 -+
2292 -+ bfq_mark_bfqq_wait_request(bfqq);
2293 -+
2294 -+ /*
2295 -+ * We don't want to idle for seeks, but we do want to allow
2296 -+ * fair distribution of slice time for a process doing back-to-back
2297 -+ * seeks. So allow a little bit of time for it to submit a new rq.
2298 -+ *
2299 -+ * To prevent processes with (partly) seeky workloads from
2300 -+ * being too ill-treated, grant them a small fraction of the
2301 -+ * assigned budget before reducing the waiting time to
2302 -+ * BFQ_MIN_TT. This happened to help reduce latency.
2303 -+ */
2304 -+ sl = bfqd->bfq_slice_idle;
2305 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
2306 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
2307 -+ bfqq->raising_coeff == 1)
2308 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
2309 -+ else if (bfqq->raising_coeff > 1)
2310 -+ sl = sl * 3;
2311 -+ bfqd->last_idling_start = ktime_get();
2312 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
2313 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
2314 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
2315 -+}
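
A compact sketch of the idle-slice selection performed above: start from slice_idle, clamp it down to a small think-time value for seeky queues that have already consumed a fair share of their budget, and stretch it for weight-raised queues. The function below works in plain milliseconds and uses made-up constants rather than the patch's jiffies-based values:

#include <stdio.h>

static unsigned idle_slice_ms(unsigned slice_idle_ms, unsigned min_tt_ms,
                              int seeky, int used_fair_share,
                              unsigned raising_coeff)
{
        unsigned sl = slice_idle_ms;

        if (seeky && used_fair_share && raising_coeff == 1)
                sl = sl < min_tt_ms ? sl : min_tt_ms;  /* min(sl, BFQ_MIN_TT) */
        else if (raising_coeff > 1)
                sl = sl * 3;                           /* stretched for raised queues */
        return sl;
}

int main(void)
{
        printf("plain sync queue:    %u ms\n", idle_slice_ms(8, 2, 0, 0, 1));
        printf("seeky, budget used:  %u ms\n", idle_slice_ms(8, 2, 1, 1, 1));
        printf("weight-raised queue: %u ms\n", idle_slice_ms(8, 2, 0, 0, 3));
        return 0;
}
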
2316 -+
2317 -+/*
2318 -+ * Set the maximum time for the in-service queue to consume its
2319 -+ * budget. This prevents seeky processes from lowering the disk
2320 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
2321 -+ */
2322 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
2323 -+{
2324 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
2325 -+ unsigned int timeout_coeff;
2326 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
2327 -+ timeout_coeff = 1;
2328 -+ else
2329 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
2330 -+
2331 -+ bfqd->last_budget_start = ktime_get();
2332 -+
2333 -+ bfq_clear_bfqq_budget_new(bfqq);
2334 -+ bfqq->budget_timeout = jiffies +
2335 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
2336 -+
2337 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
2338 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
2339 -+ timeout_coeff));
2340 -+}
2341 -+
2342 -+/*
2343 -+ * Move request from internal lists to the request queue dispatch list.
2344 -+ */
2345 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
2346 -+{
2347 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
2348 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
2349 -+
2350 -+ bfq_remove_request(rq);
2351 -+ bfqq->dispatched++;
2352 -+ elv_dispatch_sort(q, rq);
2353 -+
2354 -+ if (bfq_bfqq_sync(bfqq))
2355 -+ bfqd->sync_flight++;
2356 -+}
2357 -+
2358 -+/*
2359 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
2360 -+ */
2361 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
2362 -+{
2363 -+ struct request *rq = NULL;
2364 -+
2365 -+ if (bfq_bfqq_fifo_expire(bfqq))
2366 -+ return NULL;
2367 -+
2368 -+ bfq_mark_bfqq_fifo_expire(bfqq);
2369 -+
2370 -+ if (list_empty(&bfqq->fifo))
2371 -+ return NULL;
2372 -+
2373 -+ rq = rq_entry_fifo(bfqq->fifo.next);
2374 -+
2375 -+ if (time_before(jiffies, rq_fifo_time(rq)))
2376 -+ return NULL;
2377 -+
2378 -+ return rq;
2379 -+}
2380 -+
2381 -+/*
2382 -+ * Must be called with the queue_lock held.
2383 -+ */
2384 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
2385 -+{
2386 -+ int process_refs, io_refs;
2387 -+
2388 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
2389 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
2390 -+ BUG_ON(process_refs < 0);
2391 -+ return process_refs;
2392 -+}
2393 -+
2394 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
2395 -+{
2396 -+ int process_refs, new_process_refs;
2397 -+ struct bfq_queue *__bfqq;
2398 -+
2399 -+ /*
2400 -+ * If there are no process references on the new_bfqq, then it is
2401 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
2402 -+ * may have dropped their last reference (not just their last process
2403 -+ * reference).
2404 -+ */
2405 -+ if (!bfqq_process_refs(new_bfqq))
2406 -+ return;
2407 -+
2408 -+ /* Avoid a circular list and skip interim queue merges. */
2409 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
2410 -+ if (__bfqq == bfqq)
2411 -+ return;
2412 -+ new_bfqq = __bfqq;
2413 -+ }
2414 -+
2415 -+ process_refs = bfqq_process_refs(bfqq);
2416 -+ new_process_refs = bfqq_process_refs(new_bfqq);
2417 -+ /*
2418 -+ * If the process for the bfqq has gone away, there is no
2419 -+ * sense in merging the queues.
2420 -+ */
2421 -+ if (process_refs == 0 || new_process_refs == 0)
2422 -+ return;
2423 -+
2424 -+ /*
2425 -+ * Merge in the direction of the lesser amount of work.
2426 -+ */
2427 -+ if (new_process_refs >= process_refs) {
2428 -+ bfqq->new_bfqq = new_bfqq;
2429 -+ atomic_add(process_refs, &new_bfqq->ref);
2430 -+ } else {
2431 -+ new_bfqq->new_bfqq = bfqq;
2432 -+ atomic_add(new_process_refs, &bfqq->ref);
2433 -+ }
2434 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
2435 -+ new_bfqq->pid);
2436 -+}
2437 -+
2438 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
2439 -+{
2440 -+ struct bfq_entity *entity = &bfqq->entity;
2441 -+ return entity->budget - entity->service;
2442 -+}
2443 -+
2444 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
2445 -+{
2446 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2447 -+
2448 -+ __bfq_bfqd_reset_in_service(bfqd);
2449 -+
2450 -+ /*
2451 -+ * If this bfqq is shared between multiple processes, check
2452 -+ * to make sure that those processes are still issuing I/Os
2453 -+ * within the mean seek distance. If not, it may be time to
2454 -+ * break the queues apart again.
2455 -+ */
2456 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
2457 -+ bfq_mark_bfqq_split_coop(bfqq);
2458 -+
2459 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
2460 -+ /*
2461 -+ * overloading the budget_timeout field to store the time
2462 -+ * at which the queue was left with no backlog; used by
2463 -+ * the weight-raising mechanism
2464 -+ */
2465 -+ bfqq->budget_timeout = jiffies;
2466 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
2467 -+ } else {
2468 -+ bfq_activate_bfqq(bfqd, bfqq);
2469 -+ /*
2470 -+ * Resort priority tree of potential close cooperators.
2471 -+ */
2472 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
2473 -+ }
2474 -+}
2475 -+
2476 -+/**
2477 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
2478 -+ * @bfqd: device data.
2479 -+ * @bfqq: queue to update.
2480 -+ * @reason: reason for expiration.
2481 -+ *
2482 -+ * Handle the feedback on @bfqq budget. See the body for detailed
2483 -+ * comments.
2484 -+ */
2485 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
2486 -+ struct bfq_queue *bfqq,
2487 -+ enum bfqq_expiration reason)
2488 -+{
2489 -+ struct request *next_rq;
2490 -+ unsigned long budget, min_budget;
2491 -+
2492 -+ budget = bfqq->max_budget;
2493 -+ min_budget = bfq_min_budget(bfqd);
2494 -+
2495 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2496 -+
2497 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
2498 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
2499 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
2500 -+ budget, bfq_min_budget(bfqd));
2501 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
2502 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
2503 -+
2504 -+ if (bfq_bfqq_sync(bfqq)) {
2505 -+ switch (reason) {
2506 -+ /*
2507 -+ * Caveat: in all the following cases we trade latency
2508 -+ * for throughput.
2509 -+ */
2510 -+ case BFQ_BFQQ_TOO_IDLE:
2511 -+ /*
2512 -+ * This is the only case where we may reduce
2513 -+ * the budget: if there are no requests of the
2514 -+ * process still waiting for completion, then
2515 -+ * we assume (tentatively) that the timer has
2516 -+ * expired because the batch of requests of
2517 -+ * the process could have been served with a
2518 -+ * smaller budget. Hence, betting that
2519 -+ * process will behave in the same way when it
2520 -+ * becomes backlogged again, we reduce its
2521 -+ * next budget. As long as we guess right,
2522 -+ * this budget cut reduces the latency
2523 -+ * experienced by the process.
2524 -+ *
2525 -+ * However, if there are still outstanding
2526 -+ * requests, then the process may have not yet
2527 -+ * issued its next request just because it is
2528 -+ * still waiting for the completion of some of
2529 -+ * the still outstanding ones. So in this
2530 -+ * subcase we do not reduce its budget, on the
2531 -+ * contrary we increase it to possibly boost
2532 -+ * the throughput, as discussed in the
2533 -+ * comments to the BUDGET_TIMEOUT case.
2534 -+ */
2535 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
2536 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2537 -+ else {
2538 -+ if (budget > 5 * min_budget)
2539 -+ budget -= 4 * min_budget;
2540 -+ else
2541 -+ budget = min_budget;
2542 -+ }
2543 -+ break;
2544 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
2545 -+ /*
2546 -+ * We double the budget here because: 1) it
2547 -+ * gives the chance to boost the throughput if
2548 -+ * this is not a seeky process (which may have
2549 -+ * bumped into this timeout because of, e.g.,
2550 -+ * ZBR), 2) together with charge_full_budget
2551 -+ * it helps give seeky processes higher
2552 -+ * timestamps, and hence be served less
2553 -+ * frequently.
2554 -+ */
2555 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
2556 -+ break;
2557 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
2558 -+ /*
2559 -+ * The process still has backlog, and did not
2560 -+ * let either the budget timeout or the disk
2561 -+ * idling timeout expire. Hence it is not
2562 -+ * seeky, has a short thinktime and may be
2563 -+ * happy with a higher budget too. So
2564 -+ * definitely increase the budget of this good
2565 -+ * candidate to boost the disk throughput.
2566 -+ */
2567 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
2568 -+ break;
2569 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
2570 -+ /*
2571 -+ * Leave the budget unchanged.
2572 -+ */
2573 -+ default:
2574 -+ return;
2575 -+ }
2576 -+ } else /* async queue */
2577 -+ /* async queues always get the maximum possible budget
2578 -+ * (their ability to dispatch is limited by
2579 -+ * @bfqd->bfq_max_budget_async_rq).
2580 -+ */
2581 -+ budget = bfqd->bfq_max_budget;
2582 -+
2583 -+ bfqq->max_budget = budget;
2584 -+
2585 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
2586 -+ bfqq->max_budget > bfqd->bfq_max_budget)
2587 -+ bfqq->max_budget = bfqd->bfq_max_budget;
2588 -+
2589 -+ /*
2590 -+ * Make sure that we have enough budget for the next request.
2591 -+ * Since the finish time of the bfqq must be kept in sync with
2592 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
2593 -+ * update.
2594 -+ */
2595 -+ next_rq = bfqq->next_rq;
2596 -+ if (next_rq != NULL)
2597 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
2598 -+ bfq_serv_to_charge(next_rq, bfqq));
2599 -+ else
2600 -+ bfqq->entity.budget = bfqq->max_budget;
2601 -+
2602 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
2603 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
2604 -+ bfqq->entity.budget);
2605 -+}
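
The budget feedback applied above to sync queues can be summarised in a few lines; the max/min budgets and the starting budget below are invented, and the BFQ_BFQQ_NO_MORE_REQUESTS case (budget left unchanged) is omitted for brevity:

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED };

static unsigned long recalc(unsigned long budget, unsigned long max_budget,
                            unsigned long min_budget, int dispatched,
                            enum reason r)
{
        switch (r) {
        case TOO_IDLE:
                if (dispatched > 0)             /* still outstanding reqs */
                        return budget * 2 < max_budget ? budget * 2 : max_budget;
                return budget > 5 * min_budget ? budget - 4 * min_budget
                                               : min_budget;
        case BUDGET_TIMEOUT:
                return budget * 2 < max_budget ? budget * 2 : max_budget;
        case BUDGET_EXHAUSTED:
                return budget * 4 < max_budget ? budget * 4 : max_budget;
        }
        return budget;
}

int main(void)
{
        unsigned long max = 16384, min = max / 32;

        printf("too idle, no reqs in flight: %lu -> %lu\n", 4096UL,
               recalc(4096, max, min, 0, TOO_IDLE));
        printf("budget timeout:              %lu -> %lu\n", 4096UL,
               recalc(4096, max, min, 0, BUDGET_TIMEOUT));
        printf("budget exhausted:            %lu -> %lu\n", 4096UL,
               recalc(4096, max, min, 0, BUDGET_EXHAUSTED));
        return 0;
}
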
2606 -+
2607 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
2608 -+{
2609 -+ unsigned long max_budget;
2610 -+
2611 -+ /*
2612 -+ * The max_budget calculated when autotuning is equal to the
2613 -+ * number of sectors transferred in timeout_sync at the
2614 -+ * estimated peak rate.
2615 -+ */
2616 -+ max_budget = (unsigned long)(peak_rate * 1000 *
2617 -+ timeout >> BFQ_RATE_SHIFT);
2618 -+
2619 -+ return max_budget;
2620 -+}
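
A worked example of the autotuned budget above, i.e. the number of sectors transferable in timeout_sync at the estimated peak rate. The fixed-point shift and all numbers below are assumptions chosen for illustration, not the constants defined elsewhere in the patch:

#include <stdio.h>
#include <stdint.h>

#define EX_RATE_SHIFT 16   /* assumed fixed-point shift */

int main(void)
{
        /* peak rate stored as sectors/usec << EX_RATE_SHIFT (~100 MB/s here) */
        uint64_t peak_rate  = (uint64_t)(0.2 * (1 << EX_RATE_SHIFT));
        uint64_t timeout_ms = 125;   /* assumed sync budget timeout */

        uint64_t max_budget = (peak_rate * 1000 * timeout_ms) >> EX_RATE_SHIFT;
        printf("autotuned max budget: %llu sectors (~%llu KiB)\n",
               (unsigned long long)max_budget,
               (unsigned long long)(max_budget / 2));
        return 0;
}
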
2621 -+
2622 -+/*
2623 -+ * In addition to updating the peak rate, checks whether the process
2624 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
2625 -+ * to the budget timeout, to reduce the amount of service provided to
2626 -+ * seeky processes, and hence reduce their chances to lower the
2627 -+ * throughput. See the code for more details.
2628 -+ */
2629 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
2630 -+ int compensate, enum bfqq_expiration reason)
2631 -+{
2632 -+ u64 bw, usecs, expected, timeout;
2633 -+ ktime_t delta;
2634 -+ int update = 0;
2635 -+
2636 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
2637 -+ return 0;
2638 -+
2639 -+ if (compensate)
2640 -+ delta = bfqd->last_idling_start;
2641 -+ else
2642 -+ delta = ktime_get();
2643 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
2644 -+ usecs = ktime_to_us(delta);
2645 -+
2646 -+ /* Don't trust short/unrealistic values. */
2647 -+ if (usecs < 100 || usecs >= LONG_MAX)
2648 -+ return 0;
2649 -+
2650 -+ /*
2651 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
2652 -+ * value to store the peak rate, in sectors per usec in fixed
2653 -+ * point math. We do so to have enough precision in the estimate
2654 -+ * and to avoid overflows.
2655 -+ */
2656 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
2657 -+ do_div(bw, (unsigned long)usecs);
2658 -+
2659 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
2660 -+
2661 -+ /*
2662 -+ * Use only long (> 20ms) intervals to filter out spikes for
2663 -+ * the peak rate estimation.
2664 -+ */
2665 -+ if (usecs > 20000) {
2666 -+ if (bw > bfqd->peak_rate ||
2667 -+ (!BFQQ_SEEKY(bfqq) &&
2668 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
2669 -+ bfq_log(bfqd, "measured bw =%llu", bw);
2670 -+ /*
2671 -+ * To smooth oscillations use a low-pass filter with
2672 -+ * alpha=7/8, i.e.,
2673 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
2674 -+ */
2675 -+ do_div(bw, 8);
2676 -+ if (bw == 0)
2677 -+ return 0;
2678 -+ bfqd->peak_rate *= 7;
2679 -+ do_div(bfqd->peak_rate, 8);
2680 -+ bfqd->peak_rate += bw;
2681 -+ update = 1;
2682 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
2683 -+ }
2684 -+
2685 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
2686 -+
2687 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
2688 -+ bfqd->peak_rate_samples++;
2689 -+
2690 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
2691 -+ update && bfqd->bfq_user_max_budget == 0) {
2692 -+ bfqd->bfq_max_budget =
2693 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
2694 -+ bfq_log(bfqd, "new max_budget=%lu",
2695 -+ bfqd->bfq_max_budget);
2696 -+ }
2697 -+ }
2698 -+
2699 -+ /*
2700 -+ * If the process has been served for too short a time
2701 -+ * interval to let its possible sequential accesses prevail over
2702 -+ * the initial seek time needed to move the disk head to the
2703 -+ * first sector it requested, then give the process a chance
2704 -+ * and for the moment return false.
2705 -+ */
2706 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
2707 -+ return 0;
2708 -+
2709 -+ /*
2710 -+ * A process is considered ``slow'' (i.e., seeky, so that we
2711 -+ * cannot treat it fairly in the service domain, as it would
2712 -+ * slow down the other processes too much) if, when a slice
2713 -+ * ends for whatever reason, it has received service at a
2714 -+ * rate that would not be high enough to complete the budget
2715 -+ * before the budget timeout expiration.
2716 -+ */
2717 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
2718 -+
2719 -+ /*
2720 -+ * Caveat: processes doing IO in the slower disk zones will
2721 -+ * tend to be slow(er) even if not seeky. And the estimated
2722 -+ * peak rate will actually be an average over the disk
2723 -+ * surface. Hence, to not be too harsh with unlucky processes,
2724 -+ * we keep a budget/3 margin of safety before declaring a
2725 -+ * process slow.
2726 -+ */
2727 -+ return expected > (4 * bfqq->entity.budget) / 3;
2728 -+}
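
The low-pass filter mentioned in the comments above (new_rate = 7/8 * old_rate + 1/8 * bw) can be checked in isolation; both rates below are invented fixed-point sample values, and the rest of bfq_update_peak_rate() is not modelled:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t peak_rate = 20000;   /* current estimate  */
        uint64_t bw        = 28000;   /* newly measured bw */

        /* new_rate = (7/8) * old_rate + (1/8) * bw */
        bw /= 8;
        peak_rate *= 7;
        peak_rate /= 8;
        peak_rate += bw;

        printf("smoothed peak rate: %llu\n", (unsigned long long)peak_rate);
        return 0;
}
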
2729 -+
2730 -+/*
2731 -+ * To be deemed as soft real-time, an application must meet two requirements.
2732 -+ * The first is that the application must not require an average bandwidth
2733 -+ * higher than the approximate bandwidth required to playback or record a
2734 -+ * compressed high-definition video.
2735 -+ * The next function is invoked on the completion of the last request of a
2736 -+ * batch, to compute the next-start time instant, soft_rt_next_start, such
2737 -+ * that, if the next request of the application does not arrive before
2738 -+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
2739 -+ *
2740 -+ * The second requirement is that the request pattern of the application is
2741 -+ * isochronous, i.e., that, after issuing a request or a batch of requests, the
2742 -+ * application stops for a while, then issues a new batch, and so on. For this
2743 -+ * reason the next function is invoked to compute soft_rt_next_start only for
2744 -+ * applications that meet this requirement, whereas soft_rt_next_start is set
2745 -+ * to infinity for applications that do not.
2746 -+ *
2747 -+ * Unfortunately, even a greedy application may happen to behave in an
2748 -+ * isochronous way if several processes are competing for the CPUs. In fact,
2749 -+ * in this scenario the application stops issuing requests while the CPUs are
2750 -+ * busy serving other processes, then restarts, then stops again for a while,
2751 -+ * and so on. In addition, if the disk achieves a low enough throughput with
2752 -+ * the request pattern issued by the application, then the above bandwidth
2753 -+ * requirement may happen to be met too. To prevent such a greedy application
2754 -+ * from being deemed soft real-time, a further rule is used in the computation
2755 -+ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
2756 -+ * time plus the maximum time for which the arrival of a request is waited
2757 -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
2758 -+ * filters out greedy applications, as the latter issue instead their next
2759 -+ * request as soon as possible after the last one has been completed (in
2760 -+ * contrast, when a batch of requests is completed, a soft real-time
2761 -+ * application spends some time processing data).
2762 -+ *
2763 -+ * Actually, the last filter may easily generate false positives if: only
2764 -+ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
2765 -+ * both of the following two cases occur:
2766 -+ * 1) HZ is so low that the duration of a jiffie is comparable to or higher
2767 -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
2768 -+ * HZ=100.
2769 -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
2770 -+ * for a while, then suddenly 'jump' by several units to recover the lost
2771 -+ * increments. This seems to happen, e.g., inside virtual machines.
2772 -+ * To address this issue, we do not use as a reference time interval just
2773 -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
2774 -+ * particular we add the minimum number of jiffies for which the filter seems
2775 -+ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
2776 -+ */
2777 -+static inline u64 bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
2778 -+ struct bfq_queue *bfqq)
2779 -+{
2780 -+ return max(bfqq->last_idle_bklogged +
2781 -+ HZ * bfqq->service_from_backlogged /
2782 -+ bfqd->bfq_raising_max_softrt_rate,
2783 -+ (u64)jiffies + bfqq->bfqd->bfq_slice_idle + 4);
2784 -+}
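
A standalone rendition of the soft real-time gate above: the next request must not arrive before the backlogged service would fit under bfq_raising_max_softrt_rate, nor before a few jiffies past the idle slice. HZ and all the values below are assumptions chosen for illustration:

#include <stdio.h>
#include <stdint.h>

#define EX_HZ 250   /* assumed tick rate */

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

int main(void)
{
        uint64_t jiffies_now        = 100000;  /* pretend current time        */
        uint64_t last_idle_bklogged = 99900;   /* when the backlog started    */
        uint64_t service            = 2048;    /* sectors served since then   */
        uint64_t max_softrt_rate    = 7000;    /* sectors/sec threshold       */
        uint64_t slice_idle         = 2;       /* jiffies                     */

        uint64_t next_start = max_u64(
                last_idle_bklogged + EX_HZ * service / max_softrt_rate,
                jiffies_now + slice_idle + 4);

        printf("soft_rt_next_start = %llu (now = %llu)\n",
               (unsigned long long)next_start,
               (unsigned long long)jiffies_now);
        return 0;
}
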
2785 -+
2786 -+/**
2787 -+ * bfq_bfqq_expire - expire a queue.
2788 -+ * @bfqd: device owning the queue.
2789 -+ * @bfqq: the queue to expire.
2790 -+ * @compensate: if true, compensate for the time spent idling.
2791 -+ * @reason: the reason causing the expiration.
2792 -+ *
2793 -+ *
2794 -+ * If the process associated to the queue is slow (i.e., seeky), or in
2795 -+ * case of budget timeout, or, finally, if it is async, we
2796 -+ * artificially charge it an entire budget (independently of the
2797 -+ * actual service it received). As a consequence, the queue will get
2798 -+ * higher timestamps than the correct ones upon reactivation, and
2799 -+ * hence it will be rescheduled as if it had received more service
2800 -+ * than what it actually received. In the end, this class of processes
2801 -+ * will receive less service in proportion to how slowly they consume
2802 -+ * their budgets (and hence how seriously they tend to lower the
2803 -+ * throughput).
2804 -+ *
2805 -+ * In contrast, when a queue expires because it has been idling for
2806 -+ * too much or because it exhausted its budget, we do not touch the
2807 -+ * amount of service it has received. Hence when the queue will be
2808 -+ * reactivated and its timestamps updated, the latter will be in sync
2809 -+ * with the actual service received by the queue until expiration.
2810 -+ *
2811 -+ * Charging a full budget to the first type of queues and the exact
2812 -+ * service to the others has the effect of using the WF2Q+ policy to
2813 -+ * schedule the former on a timeslice basis, without violating the
2814 -+ * service domain guarantees of the latter.
2815 -+ */
2816 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
2817 -+ struct bfq_queue *bfqq,
2818 -+ int compensate,
2819 -+ enum bfqq_expiration reason)
2820 -+{
2821 -+ int slow;
2822 -+ BUG_ON(bfqq != bfqd->in_service_queue);
2823 -+
2824 -+ /* Update disk peak rate for autotuning and check whether the
2825 -+ * process is slow (see bfq_update_peak_rate).
2826 -+ */
2827 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
2828 -+
2829 -+ /*
2830 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
2831 -+ * and async queues, to favor sequential sync workloads.
2832 -+ *
2833 -+ * Processes doing IO in the slower disk zones will tend to be
2834 -+ * slow(er) even if not seeky. Hence, since the estimated peak
2835 -+ * rate is actually an average over the disk surface, these
2836 -+ * processes may timeout just for bad luck. To avoid punishing
2837 -+ * them we do not charge a full budget to a process that
2838 -+ * succeeded in consuming at least 2/3 of its budget.
2839 -+ */
2840 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
2841 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
2842 -+ bfq_bfqq_charge_full_budget(bfqq);
2843 -+
2844 -+ bfqq->service_from_backlogged += bfqq->entity.service;
2845 -+
2846 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
2847 -+ bfqq->last_rais_start_finish = jiffies;
2848 -+
2849 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) {
2850 -+ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
2851 -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) {
2852 -+ /*
2853 -+ * If we get here, then the request pattern is
2854 -+ * isochronous (see the comments to the function
2855 -+ * bfq_bfqq_softrt_next_start()). However, if the
2856 -+ * queue still has in-flight requests, then it is
2857 -+ * better to postpone the computation of next_start
2858 -+ * to the next request completion. In fact, if we
2859 -+ * computed it now, then the application might pass
2860 -+ * the greedy-application filter improperly, because
2861 -+ * the arrival time of its next request may happen to be
2862 -+ * later than (jiffies + bfqq->bfqd->bfq_slice_idle)
2863 -+ * not because the application is truly soft real-
2864 -+ * time, but just because the application is currently
2865 -+ * waiting for the completion of some request before
2866 -+ * issuing, as quickly as possible, its next request.
2867 -+ */
2868 -+ if (bfqq->dispatched > 0) {
2869 -+ bfqq->soft_rt_next_start = -1;
2870 -+ bfq_mark_bfqq_softrt_update(bfqq);
2871 -+ } else
2872 -+ bfqq->soft_rt_next_start =
2873 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
2874 -+ } else
2875 -+ bfqq->soft_rt_next_start = -1; /* infinity */
2876 -+ }
2877 -+
2878 -+ bfq_log_bfqq(bfqd, bfqq,
2879 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
2880 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
2881 -+
2882 -+ /* Increase, decrease or leave budget unchanged according to reason */
2883 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
2884 -+ __bfq_bfqq_expire(bfqd, bfqq);
2885 -+}
2886 -+
2887 -+/*
2888 -+ * Budget timeout is not implemented through a dedicated timer, but
2889 -+ * just checked on request arrivals and completions, as well as on
2890 -+ * idle timer expirations.
2891 -+ */
2892 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
2893 -+{
2894 -+ if (bfq_bfqq_budget_new(bfqq))
2895 -+ return 0;
2896 -+
2897 -+ if (time_before(jiffies, bfqq->budget_timeout))
2898 -+ return 0;
2899 -+
2900 -+ return 1;
2901 -+}
2902 -+
2903 -+/*
2904 -+ * If we expire a queue that is waiting for the arrival of a new
2905 -+ * request, we may prevent the fictitious timestamp backshifting that
2906 -+ * allows the guarantees of the queue to be preserved (see [1] for
2907 -+ * this tricky aspect). Hence we return true only if this condition
2908 -+ * does not hold, or if the queue is slow enough to deserve only to be
2909 -+ * kicked off for preserving a high throughput.
2910 -+*/
2911 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
2912 -+{
2913 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
2914 -+ "may_budget_timeout: wr %d left %d timeout %d",
2915 -+ bfq_bfqq_wait_request(bfqq),
2916 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
2917 -+ bfq_bfqq_budget_timeout(bfqq));
2918 -+
2919 -+ return (!bfq_bfqq_wait_request(bfqq) ||
2920 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
2921 -+ &&
2922 -+ bfq_bfqq_budget_timeout(bfqq);
2923 -+}
2924 -+
2925 -+/*
2926 -+ * For weight-raised queues issuing sync requests, idling is always performed,
2927 -+ * as this is instrumental in guaranteeing a high fraction of the throughput
2928 -+ * to these queues, and hence in guaranteeing a lower latency for their
2929 -+ * requests. See [1] for details.
2930 -+ *
2931 -+ * For non-weight-raised queues, idling is instead disabled if the device is
2932 -+ * NCQ-enabled and non-rotational, as this boosts the throughput on such
2933 -+ * devices.
2934 -+ */
2935 -+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
2936 -+{
2937 -+ struct bfq_data *bfqd = bfqq->bfqd;
2938 -+
2939 -+ return bfq_bfqq_sync(bfqq) && (
2940 -+ bfqq->raising_coeff > 1 ||
2941 -+ (bfq_bfqq_idle_window(bfqq) &&
2942 -+ !(bfqd->hw_tag &&
2943 -+ (blk_queue_nonrot(bfqd->queue) ||
2944 -+ /*
2945 -+ * If there are weight-raised busy queues, then do not idle
2946 -+ * the disk for a sync non-weight-raised queue, and hence
2947 -+ * expire the queue immediately if empty. Combined with the
2948 -+ * timestamping rules of BFQ (see [1] for details), this
2949 -+ * causes sync non-weight-raised queues to get a lower
2950 -+ * fraction of the disk throughput, and hence reduces the rate
2951 -+ * at which the processes associated to these queues ask for
2952 -+ * requests from the request pool.
2953 -+ *
2954 -+ * This is beneficial for weight-raised processes, when the
2955 -+ * system operates in request-pool saturation conditions
2956 -+ * (e.g., in the presence of write hogs). In fact, if
2957 -+ * non-weight-raised processes ask for requests at a lower
2958 -+ * rate, then weight-raised processes have a higher
2959 -+ * probability to get a request from the pool immediately
2960 -+ * (or at least soon) when they need one. Hence they have a
2961 -+ * higher probability to actually get a fraction of the disk
2962 -+ * throughput proportional to their high weight. This is
2963 -+ * especially true with NCQ-enabled drives, which enqueue
2964 -+ * several requests in advance and further reorder
2965 -+ * internally-queued requests.
2966 -+ *
2967 -+ * Mistreating non-weight-raised queues in the above-described
2968 -+ * way, when there are busy weight-raised queues, seems to
2969 -+ * mitigate starvation problems in the presence of heavy write
2970 -+ * workloads and NCQ, and hence to guarantee a higher
2971 -+ * application and system responsiveness in these hostile
2972 -+ * scenarios.
2973 -+ */
2974 -+ bfqd->raised_busy_queues > 0)
2975 -+ )
2976 -+ )
2977 -+ );
2978 -+}
2979 -+
2980 -+/*
2981 -+ * If the in-service queue is empty, but it is sync and either of the following
2982 -+ * conditions holds, then: 1) the queue must remain in service and cannot be
2983 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
2984 -+ * of a new request for the queue. The conditions are:
2985 -+ * - the device is rotational and not performing NCQ, and the queue has its
2986 -+ * idle window set (in this case, waiting for a new request for the queue
2987 -+ * is likely to boost the disk throughput);
2988 -+ * - the queue is weight-raised (waiting for the request is necessary to
2989 -+ * provide the queue with fairness and latency guarantees, see [1] for
2990 -+ * details).
2991 -+ */
2992 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
2993 -+{
2994 -+ struct bfq_data *bfqd = bfqq->bfqd;
2995 -+
2996 -+ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
2997 -+ bfq_bfqq_must_not_expire(bfqq) &&
2998 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
2999 -+}
3000 -+
3001 -+/*
3002 -+ * Select a queue for service. If we have a current queue in service,
3003 -+ * check whether to continue servicing it, or retrieve and set a new one.
3004 -+ */
3005 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
3006 -+{
3007 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3008 -+ struct request *next_rq;
3009 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
3010 -+
3011 -+ bfqq = bfqd->in_service_queue;
3012 -+ if (bfqq == NULL)
3013 -+ goto new_queue;
3014 -+
3015 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
3016 -+
3017 -+ /*
3018 -+ * If another queue has a request waiting within our mean seek
3019 -+ * distance, let it run. The expire code will check for close
3020 -+ * cooperators and put the close queue at the front of the
3021 -+ * service tree. If possible, merge the expiring queue with the
3022 -+ * new bfqq.
3023 -+ */
3024 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
3025 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
3026 -+ bfq_setup_merge(bfqq, new_bfqq);
3027 -+
3028 -+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
3029 -+ !timer_pending(&bfqd->idle_slice_timer) &&
3030 -+ !bfq_bfqq_must_idle(bfqq))
3031 -+ goto expire;
3032 -+
3033 -+ next_rq = bfqq->next_rq;
3034 -+ /*
3035 -+ * If bfqq has requests queued and it has enough budget left to
3036 -+ * serve them, keep the queue, otherwise expire it.
3037 -+ */
3038 -+ if (next_rq != NULL) {
3039 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
3040 -+ bfq_bfqq_budget_left(bfqq)) {
3041 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
3042 -+ goto expire;
3043 -+ } else {
3044 -+ /*
3045 -+ * The idle timer may be pending because we may not
3046 -+ * disable disk idling even when a new request arrives
3047 -+ */
3048 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
3049 -+ /*
3050 -+ * If we get here: 1) at least one new request
3051 -+ * has arrived but we have not disabled the
3052 -+ * timer because the request was too small,
3053 -+ * 2) then the block layer has unplugged the
3054 -+ * device, causing the dispatch to be invoked.
3055 -+ *
3056 -+ * Since the device is unplugged, now the
3057 -+ * requests are probably large enough to
3058 -+ * provide a reasonable throughput.
3059 -+ * So we disable idling.
3060 -+ */
3061 -+ bfq_clear_bfqq_wait_request(bfqq);
3062 -+ del_timer(&bfqd->idle_slice_timer);
3063 -+ }
3064 -+ if (new_bfqq == NULL)
3065 -+ goto keep_queue;
3066 -+ else
3067 -+ goto expire;
3068 -+ }
3069 -+ }
3070 -+
3071 -+ /*
3072 -+ * No requests pending. If the in-service queue has no cooperator and
3073 -+ * still has requests in flight (possibly waiting for a completion)
3074 -+ * or is idling for a new request, then keep it.
3075 -+ */
3076 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
3077 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
3078 -+ bfqq = NULL;
3079 -+ goto keep_queue;
3080 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
3081 -+ /*
3082 -+ * Expiring the queue because there is a close cooperator,
3083 -+ * cancel timer.
3084 -+ */
3085 -+ bfq_clear_bfqq_wait_request(bfqq);
3086 -+ del_timer(&bfqd->idle_slice_timer);
3087 -+ }
3088 -+
3089 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
3090 -+expire:
3091 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
3092 -+new_queue:
3093 -+ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
3094 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
3095 -+ bfqq != NULL ? bfqq->pid : 0);
3096 -+keep_queue:
3097 -+ return bfqq;
3098 -+}
3099 -+
3100 -+static void bfq_update_raising_data(struct bfq_data *bfqd,
3101 -+ struct bfq_queue *bfqq)
3102 -+{
3103 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
3104 -+ struct bfq_entity *entity = &bfqq->entity;
3105 -+
3106 -+ bfq_log_bfqq(bfqd, bfqq,
3107 -+ "raising period dur %u/%u msec, "
3108 -+ "old raising coeff %u, w %d(%d)",
3109 -+ jiffies_to_msecs(jiffies -
3110 -+ bfqq->last_rais_start_finish),
3111 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
3112 -+ bfqq->raising_coeff,
3113 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
3114 -+
3115 -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
3116 -+ entity->orig_weight * bfqq->raising_coeff);
3117 -+ if (entity->ioprio_changed)
3118 -+ bfq_log_bfqq(bfqd, bfqq,
3119 -+ "WARN: pending prio change");
3120 -+ /*
3121 -+ * If too much time has elapsed from the beginning
3122 -+ * of this weight-raising, stop it.
3123 -+ */
3124 -+ if (jiffies - bfqq->last_rais_start_finish >
3125 -+ bfqq->raising_cur_max_time) {
3126 -+ bfqq->last_rais_start_finish = jiffies;
3127 -+ bfq_log_bfqq(bfqd, bfqq,
3128 -+ "wrais ending at %llu msec, "
3129 -+ "rais_max_time %u",
3130 -+ bfqq->last_rais_start_finish,
3131 -+ jiffies_to_msecs(bfqq->
3132 -+ raising_cur_max_time));
3133 -+ bfq_bfqq_end_raising(bfqq);
3134 -+ __bfq_entity_update_weight_prio(
3135 -+ bfq_entity_service_tree(entity),
3136 -+ entity);
3137 -+ }
3138 -+ }
3139 -+}
3140 -+
3141 -+/*
3142 -+ * Dispatch one request from bfqq, moving it to the request queue
3143 -+ * dispatch list.
3144 -+ */
3145 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
3146 -+ struct bfq_queue *bfqq)
3147 -+{
3148 -+ int dispatched = 0;
3149 -+ struct request *rq;
3150 -+ unsigned long service_to_charge;
3151 -+
3152 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
3153 -+
3154 -+ /* Follow expired path, else get first next available. */
3155 -+ rq = bfq_check_fifo(bfqq);
3156 -+ if (rq == NULL)
3157 -+ rq = bfqq->next_rq;
3158 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
3159 -+
3160 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
3161 -+ /*
3162 -+ * This may happen if the next rq is chosen
3163 -+ * in fifo order instead of sector order.
3164 -+ * The budget is properly dimensioned
3165 -+ * to be always sufficient to serve the next request
3166 -+ * only if it is chosen in sector order. The reason is
3167 -+ * that it would be quite inefficient and of little use
3168 -+ * to always make sure that the budget is large enough
3169 -+ * to serve even the possible next rq in fifo order.
3170 -+ * In fact, requests are seldom served in fifo order.
3171 -+ *
3172 -+ * Expire the queue for budget exhaustion, and
3173 -+ * make sure that the next act_budget is enough
3174 -+ * to serve the next request, even if it comes
3175 -+ * from the fifo expired path.
3176 -+ */
3177 -+ bfqq->next_rq = rq;
3178 -+ /*
3179 -+ * Since this dispatch failed, make sure that
3180 -+ * a new one will be performed
3181 -+ */
3182 -+ if (!bfqd->rq_in_driver)
3183 -+ bfq_schedule_dispatch(bfqd);
3184 -+ goto expire;
3185 -+ }
3186 -+
3187 -+ /* Finally, insert request into driver dispatch list. */
3188 -+ bfq_bfqq_served(bfqq, service_to_charge);
3189 -+ bfq_dispatch_insert(bfqd->queue, rq);
3190 -+
3191 -+ bfq_update_raising_data(bfqd, bfqq);
3192 -+
3193 -+ bfq_log_bfqq(bfqd, bfqq,
3194 -+ "dispatched %u sec req (%llu), budg left %lu",
3195 -+ blk_rq_sectors(rq),
3196 -+ (long long unsigned)blk_rq_pos(rq),
3197 -+ bfq_bfqq_budget_left(bfqq));
3198 -+
3199 -+ dispatched++;
3200 -+
3201 -+ if (bfqd->in_service_bic == NULL) {
3202 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
3203 -+ bfqd->in_service_bic = RQ_BIC(rq);
3204 -+ }
3205 -+
3206 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
3207 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
3208 -+ bfq_class_idle(bfqq)))
3209 -+ goto expire;
3210 -+
3211 -+ return dispatched;
3212 -+
3213 -+expire:
3214 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
3215 -+ return dispatched;
3216 -+}
3217 -+
3218 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
3219 -+{
3220 -+ int dispatched = 0;
3221 -+
3222 -+ while (bfqq->next_rq != NULL) {
3223 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
3224 -+ dispatched++;
3225 -+ }
3226 -+
3227 -+ BUG_ON(!list_empty(&bfqq->fifo));
3228 -+ return dispatched;
3229 -+}
3230 -+
3231 -+/*
3232 -+ * Drain our current requests. Used for barriers and when switching
3233 -+ * io schedulers on-the-fly.
3234 -+ */
3235 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
3236 -+{
3237 -+ struct bfq_queue *bfqq, *n;
3238 -+ struct bfq_service_tree *st;
3239 -+ int dispatched = 0;
3240 -+
3241 -+ bfqq = bfqd->in_service_queue;
3242 -+ if (bfqq != NULL)
3243 -+ __bfq_bfqq_expire(bfqd, bfqq);
3244 -+
3245 -+ /*
3246 -+ * Loop through classes, and be careful to leave the scheduler
3247 -+ * in a consistent state, as feedback mechanisms and vtime
3248 -+ * updates cannot be disabled during the process.
3249 -+ */
3250 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
3251 -+ st = bfq_entity_service_tree(&bfqq->entity);
3252 -+
3253 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
3254 -+ bfqq->max_budget = bfq_max_budget(bfqd);
3255 -+
3256 -+ bfq_forget_idle(st);
3257 -+ }
3258 -+
3259 -+ BUG_ON(bfqd->busy_queues != 0);
3260 -+
3261 -+ return dispatched;
3262 -+}
3263 -+
3264 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
3265 -+{
3266 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3267 -+ struct bfq_queue *bfqq;
3268 -+ int max_dispatch;
3269 -+
3270 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
3271 -+ if (bfqd->busy_queues == 0)
3272 -+ return 0;
3273 -+
3274 -+ if (unlikely(force))
3275 -+ return bfq_forced_dispatch(bfqd);
3276 -+
3277 -+ bfqq = bfq_select_queue(bfqd);
3278 -+ if (bfqq == NULL)
3279 -+ return 0;
3280 -+
3281 -+ max_dispatch = bfqd->bfq_quantum;
3282 -+ if (bfq_class_idle(bfqq))
3283 -+ max_dispatch = 1;
3284 -+
3285 -+ if (!bfq_bfqq_sync(bfqq))
3286 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
3287 -+
3288 -+ if (bfqq->dispatched >= max_dispatch) {
3289 -+ if (bfqd->busy_queues > 1)
3290 -+ return 0;
3291 -+ if (bfqq->dispatched >= 4 * max_dispatch)
3292 -+ return 0;
3293 -+ }
3294 -+
3295 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
3296 -+ return 0;
3297 -+
3298 -+ bfq_clear_bfqq_wait_request(bfqq);
3299 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
3300 -+
3301 -+ if (!bfq_dispatch_request(bfqd, bfqq))
3302 -+ return 0;
3303 -+
3304 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
3305 -+ bfqq->pid, max_dispatch);
3306 -+
3307 -+ return 1;
3308 -+}
3309 -+
3310 -+/*
3311 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
3312 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
3313 -+ *
3314 -+ * Queue lock must be held here.
3315 -+ */
3316 -+static void bfq_put_queue(struct bfq_queue *bfqq)
3317 -+{
3318 -+ struct bfq_data *bfqd = bfqq->bfqd;
3319 -+
3320 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
3321 -+
3322 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
3323 -+ atomic_read(&bfqq->ref));
3324 -+ if (!atomic_dec_and_test(&bfqq->ref))
3325 -+ return;
3326 -+
3327 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
3328 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
3329 -+ BUG_ON(bfqq->entity.tree != NULL);
3330 -+ BUG_ON(bfq_bfqq_busy(bfqq));
3331 -+ BUG_ON(bfqd->in_service_queue == bfqq);
3332 -+
3333 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
3334 -+
3335 -+ kmem_cache_free(bfq_pool, bfqq);
3336 -+}
3337 -+
3338 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
3339 -+{
3340 -+ struct bfq_queue *__bfqq, *next;
3341 -+
3342 -+ /*
3343 -+ * If this queue was scheduled to merge with another queue, be
3344 -+ * sure to drop the reference taken on that queue (and others in
3345 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
3346 -+ */
3347 -+ __bfqq = bfqq->new_bfqq;
3348 -+ while (__bfqq) {
3349 -+ if (__bfqq == bfqq) {
3350 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
3351 -+ break;
3352 -+ }
3353 -+ next = __bfqq->new_bfqq;
3354 -+ bfq_put_queue(__bfqq);
3355 -+ __bfqq = next;
3356 -+ }
3357 -+}
3358 -+
3359 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
3360 -+{
3361 -+ if (bfqq == bfqd->in_service_queue) {
3362 -+ __bfq_bfqq_expire(bfqd, bfqq);
3363 -+ bfq_schedule_dispatch(bfqd);
3364 -+ }
3365 -+
3366 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
3367 -+ atomic_read(&bfqq->ref));
3368 -+
3369 -+ bfq_put_cooperator(bfqq);
3370 -+
3371 -+ bfq_put_queue(bfqq);
3372 -+}
3373 -+
3374 -+static void bfq_init_icq(struct io_cq *icq)
3375 -+{
3376 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3377 -+
3378 -+ bic->ttime.last_end_request = jiffies;
3379 -+}
3380 -+
3381 -+static void bfq_exit_icq(struct io_cq *icq)
3382 -+{
3383 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
3384 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
3385 -+
3386 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
3387 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
3388 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
3389 -+ }
3390 -+
3391 -+ if (bic->bfqq[BLK_RW_SYNC]) {
3392 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
3393 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
3394 -+ }
3395 -+}
3396 -+
3397 -+/*
3398 -+ * Update the entity prio values; note that the new values will not
3399 -+ * be used until the next (re)activation.
3400 -+ */
3401 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
3402 -+{
3403 -+ struct task_struct *tsk = current;
3404 -+ int ioprio_class;
3405 -+
3406 -+ if (!bfq_bfqq_prio_changed(bfqq))
3407 -+ return;
3408 -+
3409 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3410 -+ switch (ioprio_class) {
3411 -+ default:
3412 -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
3413 -+ "bfq: bad prio %x\n", ioprio_class);
3414 -+ case IOPRIO_CLASS_NONE:
3415 -+ /*
3416 -+ * No prio set, inherit CPU scheduling settings.
3417 -+ */
3418 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
3419 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
3420 -+ break;
3421 -+ case IOPRIO_CLASS_RT:
3422 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3423 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
3424 -+ break;
3425 -+ case IOPRIO_CLASS_BE:
3426 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3427 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
3428 -+ break;
3429 -+ case IOPRIO_CLASS_IDLE:
3430 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
3431 -+ bfqq->entity.new_ioprio = 7;
3432 -+ bfq_clear_bfqq_idle_window(bfqq);
3433 -+ break;
3434 -+ }
3435 -+
3436 -+ bfqq->entity.ioprio_changed = 1;
3437 -+
3438 -+ /*
3439 -+ * Keep track of original prio settings in case we have to temporarily
3440 -+ * elevate the priority of this queue.
3441 -+ */
3442 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
3443 -+ bfq_clear_bfqq_prio_changed(bfqq);
3444 -+}
3445 -+
3446 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
3447 -+{
3448 -+ struct bfq_data *bfqd;
3449 -+ struct bfq_queue *bfqq, *new_bfqq;
3450 -+ struct bfq_group *bfqg;
3451 -+ unsigned long uninitialized_var(flags);
3452 -+ int ioprio = bic->icq.ioc->ioprio;
3453 -+
3454 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
3455 -+ &flags);
3456 -+ /*
3457 -+ * This condition may trigger on a newly created bic; be sure to drop
3458 -+ * the lock before returning.
3459 -+ */
3460 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
3461 -+ goto out;
3462 -+
3463 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
3464 -+ if (bfqq != NULL) {
3465 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
3466 -+ sched_data);
3467 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
3468 -+ GFP_ATOMIC);
3469 -+ if (new_bfqq != NULL) {
3470 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
3471 -+ bfq_log_bfqq(bfqd, bfqq,
3472 -+ "changed_ioprio: bfqq %p %d",
3473 -+ bfqq, atomic_read(&bfqq->ref));
3474 -+ bfq_put_queue(bfqq);
3475 -+ }
3476 -+ }
3477 -+
3478 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
3479 -+ if (bfqq != NULL)
3480 -+ bfq_mark_bfqq_prio_changed(bfqq);
3481 -+
3482 -+ bic->ioprio = ioprio;
3483 -+
3484 -+out:
3485 -+ bfq_put_bfqd_unlock(bfqd, &flags);
3486 -+}
3487 -+
3488 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3489 -+ pid_t pid, int is_sync)
3490 -+{
3491 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
3492 -+ INIT_LIST_HEAD(&bfqq->fifo);
3493 -+
3494 -+ atomic_set(&bfqq->ref, 0);
3495 -+ bfqq->bfqd = bfqd;
3496 -+
3497 -+ bfq_mark_bfqq_prio_changed(bfqq);
3498 -+
3499 -+ if (is_sync) {
3500 -+ if (!bfq_class_idle(bfqq))
3501 -+ bfq_mark_bfqq_idle_window(bfqq);
3502 -+ bfq_mark_bfqq_sync(bfqq);
3503 -+ }
3504 -+
3505 -+ /* Tentative initial value to trade off between thr and lat */
3506 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
3507 -+ bfqq->pid = pid;
3508 -+
3509 -+ bfqq->raising_coeff = 1;
3510 -+ bfqq->last_rais_start_finish = 0;
3511 -+ bfqq->soft_rt_next_start = -1;
3512 -+}
3513 -+
3514 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
3515 -+ struct bfq_group *bfqg,
3516 -+ int is_sync,
3517 -+ struct bfq_io_cq *bic,
3518 -+ gfp_t gfp_mask)
3519 -+{
3520 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
3521 -+
3522 -+retry:
3523 -+ /* bic always exists here */
3524 -+ bfqq = bic_to_bfqq(bic, is_sync);
3525 -+
3526 -+ /*
3527 -+ * Always try a new alloc if we fall back to the OOM bfqq
3528 -+ * originally, since it should just be a temporary situation.
3529 -+ */
3530 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3531 -+ bfqq = NULL;
3532 -+ if (new_bfqq != NULL) {
3533 -+ bfqq = new_bfqq;
3534 -+ new_bfqq = NULL;
3535 -+ } else if (gfp_mask & __GFP_WAIT) {
3536 -+ spin_unlock_irq(bfqd->queue->queue_lock);
3537 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
3538 -+ gfp_mask | __GFP_ZERO,
3539 -+ bfqd->queue->node);
3540 -+ spin_lock_irq(bfqd->queue->queue_lock);
3541 -+ if (new_bfqq != NULL)
3542 -+ goto retry;
3543 -+ } else {
3544 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
3545 -+ gfp_mask | __GFP_ZERO,
3546 -+ bfqd->queue->node);
3547 -+ }
3548 -+
3549 -+ if (bfqq != NULL) {
3550 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
3551 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
3552 -+ } else {
3553 -+ bfqq = &bfqd->oom_bfqq;
3554 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
3555 -+ }
3556 -+
3557 -+ bfq_init_prio_data(bfqq, bic);
3558 -+ bfq_init_entity(&bfqq->entity, bfqg);
3559 -+ }
3560 -+
3561 -+ if (new_bfqq != NULL)
3562 -+ kmem_cache_free(bfq_pool, new_bfqq);
3563 -+
3564 -+ return bfqq;
3565 -+}
3566 -+
3567 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
3568 -+ struct bfq_group *bfqg,
3569 -+ int ioprio_class, int ioprio)
3570 -+{
3571 -+ switch (ioprio_class) {
3572 -+ case IOPRIO_CLASS_RT:
3573 -+ return &bfqg->async_bfqq[0][ioprio];
3574 -+ case IOPRIO_CLASS_NONE:
3575 -+ ioprio = IOPRIO_NORM;
3576 -+ /* fall through */
3577 -+ case IOPRIO_CLASS_BE:
3578 -+ return &bfqg->async_bfqq[1][ioprio];
3579 -+ case IOPRIO_CLASS_IDLE:
3580 -+ return &bfqg->async_idle_bfqq;
3581 -+ default:
3582 -+ BUG();
3583 -+ }
3584 -+}
3585 -+
3586 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
3587 -+ struct bfq_group *bfqg, int is_sync,
3588 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
3589 -+{
3590 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
3591 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
3592 -+ struct bfq_queue **async_bfqq = NULL;
3593 -+ struct bfq_queue *bfqq = NULL;
3594 -+
3595 -+ if (!is_sync) {
3596 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
3597 -+ ioprio);
3598 -+ bfqq = *async_bfqq;
3599 -+ }
3600 -+
3601 -+ if (bfqq == NULL)
3602 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3603 -+
3604 -+ /*
3605 -+ * Pin the queue now that it's allocated; scheduler exit will prune it.
3606 -+ */
3607 -+ if (!is_sync && *async_bfqq == NULL) {
3608 -+ atomic_inc(&bfqq->ref);
3609 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
3610 -+ bfqq, atomic_read(&bfqq->ref));
3611 -+ *async_bfqq = bfqq;
3612 -+ }
3613 -+
3614 -+ atomic_inc(&bfqq->ref);
3615 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
3616 -+ atomic_read(&bfqq->ref));
3617 -+ return bfqq;
3618 -+}
3619 -+
3620 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
3621 -+ struct bfq_io_cq *bic)
3622 -+{
3623 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
3624 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
3625 -+
3626 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
3627 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
3628 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
3629 -+ bic->ttime.ttime_samples;
3630 -+}
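
The think-time update above is a fixed-point exponentially weighted moving average: the sample count and the running total are both kept scaled by 256, each update retains 7/8 of the previous value, and the mean divides the two with +128 (half of the 256 scale) added for rounding. The following user-space sketch mirrors that arithmetic; it is illustrative only, not part of the patch, and the function and variable names are invented here.

  #include <stdio.h>

  /* Illustrative mirror of the 7/8-decay, 256-scaled EWMA used above. */
  static unsigned long ttime_samples, ttime_total;

  static void ttime_sample(unsigned long ttime)
  {
          ttime_samples = (7 * ttime_samples + 256) / 8;
          ttime_total   = (7 * ttime_total + 256 * ttime) / 8;
  }

  int main(void)
  {
          unsigned long samples[] = { 4, 4, 12, 4 };
          unsigned int i;

          for (i = 0; i < 4; i++)
                  ttime_sample(samples[i]);

          /* Same rounding as the scheduler: +128 is half of the 256 scale. */
          printf("mean think time: %lu\n",
                 (ttime_total + 128) / ttime_samples);
          return 0;
  }
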
3631 -+
3632 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
3633 -+ struct bfq_queue *bfqq,
3634 -+ struct request *rq)
3635 -+{
3636 -+ sector_t sdist;
3637 -+ u64 total;
3638 -+
3639 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
3640 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
3641 -+ else
3642 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
3643 -+
3644 -+ /*
3645 -+ * Don't allow the seek distance to get too large from the
3646 -+ * odd fragment, pagein, etc.
3647 -+ */
3648 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
3649 -+ sdist = 0;
3650 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
3651 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
3652 -+ else
3653 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
3654 -+
3655 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
3656 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
3657 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
3658 -+ do_div(total, bfqq->seek_samples);
3659 -+ bfqq->seek_mean = (sector_t)total;
3660 -+
3661 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
3662 -+ (u64)bfqq->seek_mean);
3663 -+}
3664 -+
3665 -+/*
3666 -+ * Disable idle window if the process thinks too long or seeks so much that
3667 -+ * it doesn't matter.
3668 -+ */
3669 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
3670 -+ struct bfq_queue *bfqq,
3671 -+ struct bfq_io_cq *bic)
3672 -+{
3673 -+ int enable_idle;
3674 -+
3675 -+ /* Don't idle for async or idle io prio class. */
3676 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
3677 -+ return;
3678 -+
3679 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
3680 -+
3681 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
3682 -+ bfqd->bfq_slice_idle == 0 ||
3683 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
3684 -+ bfqq->raising_coeff == 1))
3685 -+ enable_idle = 0;
3686 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
3687 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
3688 -+ bfqq->raising_coeff == 1)
3689 -+ enable_idle = 0;
3690 -+ else
3691 -+ enable_idle = 1;
3692 -+ }
3693 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
3694 -+ enable_idle);
3695 -+
3696 -+ if (enable_idle)
3697 -+ bfq_mark_bfqq_idle_window(bfqq);
3698 -+ else
3699 -+ bfq_clear_bfqq_idle_window(bfqq);
3700 -+}
3701 -+
3702 -+/*
3703 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
3704 -+ * something we should do about it.
3705 -+ */
3706 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
3707 -+ struct request *rq)
3708 -+{
3709 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
3710 -+
3711 -+ if (rq->cmd_flags & REQ_META)
3712 -+ bfqq->meta_pending++;
3713 -+
3714 -+ bfq_update_io_thinktime(bfqd, bic);
3715 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
3716 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
3717 -+ !BFQQ_SEEKY(bfqq))
3718 -+ bfq_update_idle_window(bfqd, bfqq, bic);
3719 -+
3720 -+ bfq_log_bfqq(bfqd, bfqq,
3721 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
3722 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
3723 -+ (long long unsigned)bfqq->seek_mean);
3724 -+
3725 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3726 -+
3727 -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
3728 -+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
3729 -+ blk_rq_sectors(rq) < 32;
3730 -+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
3731 -+
3732 -+ /*
3733 -+ * There is just this request queued: if the request
3734 -+ * is small and the queue is not to be expired, then
3735 -+ * just exit.
3736 -+ *
3737 -+ * In this way, if the disk is being idled to wait for
3738 -+ * a new request from the in-service queue, we avoid
3739 -+ * unplugging the device and committing the disk to serve
3740 -+ * just a small request. Instead, we wait for
3741 -+ * the block layer to decide when to unplug the device:
3742 -+ * hopefully, new requests will be merged to this one
3743 -+ * quickly, then the device will be unplugged and
3744 -+ * larger requests will be dispatched.
3745 -+ */
3746 -+ if (small_req && !budget_timeout)
3747 -+ return;
3748 -+
3749 -+ /*
3750 -+ * A large enough request arrived, or the queue is to
3751 -+ * be expired: in both cases disk idling is to be
3752 -+ * stopped, so clear wait_request flag and reset
3753 -+ * timer.
3754 -+ */
3755 -+ bfq_clear_bfqq_wait_request(bfqq);
3756 -+ del_timer(&bfqd->idle_slice_timer);
3757 -+
3758 -+ /*
3759 -+ * The queue is not empty, because a new request just
3760 -+ * arrived. Hence we can safely expire the queue, in
3761 -+ * case of budget timeout, without risking that the
3762 -+ * timestamps of the queue are not updated correctly.
3763 -+ * See [1] for more details.
3764 -+ */
3765 -+ if (budget_timeout)
3766 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3767 -+
3768 -+ /*
3769 -+ * Let the request rip immediately, or let a new queue be
3770 -+ * selected if bfqq has just been expired.
3771 -+ */
3772 -+ __blk_run_queue(bfqd->queue);
3773 -+ }
3774 -+}
3775 -+
3776 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
3777 -+{
3778 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3779 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3780 -+
3781 -+ assert_spin_locked(bfqd->queue->queue_lock);
3782 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
3783 -+
3784 -+ bfq_add_rq_rb(rq);
3785 -+
3786 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
3787 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
3788 -+
3789 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
3790 -+}
3791 -+
3792 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
3793 -+{
3794 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
3795 -+ bfqd->rq_in_driver);
3796 -+
3797 -+ if (bfqd->hw_tag == 1)
3798 -+ return;
3799 -+
3800 -+ /*
3801 -+ * This sample is valid if the number of outstanding requests
3802 -+ * is large enough to allow a queueing behavior. Note that the
3803 -+ * sum is not exact, as it's not taking into account deactivated
3804 -+ * requests.
3805 -+ */
3806 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
3807 -+ return;
3808 -+
3809 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
3810 -+ return;
3811 -+
3812 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
3813 -+ bfqd->max_rq_in_driver = 0;
3814 -+ bfqd->hw_tag_samples = 0;
3815 -+}
3816 -+
3817 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
3818 -+{
3819 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3820 -+ struct bfq_data *bfqd = bfqq->bfqd;
3821 -+ const int sync = rq_is_sync(rq);
3822 -+
3823 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
3824 -+ blk_rq_sectors(rq), sync);
3825 -+
3826 -+ bfq_update_hw_tag(bfqd);
3827 -+
3828 -+ WARN_ON(!bfqd->rq_in_driver);
3829 -+ WARN_ON(!bfqq->dispatched);
3830 -+ bfqd->rq_in_driver--;
3831 -+ bfqq->dispatched--;
3832 -+
3833 -+ if (bfq_bfqq_sync(bfqq))
3834 -+ bfqd->sync_flight--;
3835 -+
3836 -+ if (sync)
3837 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
3838 -+
3839 -+ /*
3840 -+ * The computation of softrt_next_start was scheduled for the next
3841 -+ * request completion: it is now time to compute it.
3842 -+ */
3843 -+ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
3844 -+ bfqq->soft_rt_next_start =
3845 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
3846 -+
3847 -+ /*
3848 -+ * If this is the in-service queue, check if it needs to be expired,
3849 -+ * or if we want to idle in case it has no pending requests.
3850 -+ */
3851 -+ if (bfqd->in_service_queue == bfqq) {
3852 -+ if (bfq_bfqq_budget_new(bfqq))
3853 -+ bfq_set_budget_timeout(bfqd);
3854 -+
3855 -+ if (bfq_bfqq_must_idle(bfqq)) {
3856 -+ bfq_arm_slice_timer(bfqd);
3857 -+ goto out;
3858 -+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
3859 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
3860 -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
3861 -+ (bfqq->dispatched == 0 ||
3862 -+ !bfq_bfqq_must_not_expire(bfqq)))
3863 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
3864 -+ BFQ_BFQQ_NO_MORE_REQUESTS);
3865 -+ }
3866 -+
3867 -+ if (!bfqd->rq_in_driver)
3868 -+ bfq_schedule_dispatch(bfqd);
3869 -+
3870 -+out:
3871 -+ return;
3872 -+}
3873 -+
3874 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
3875 -+{
3876 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
3877 -+ bfq_clear_bfqq_must_alloc(bfqq);
3878 -+ return ELV_MQUEUE_MUST;
3879 -+ }
3880 -+
3881 -+ return ELV_MQUEUE_MAY;
3882 -+}
3883 -+
3884 -+static int bfq_may_queue(struct request_queue *q, int rw)
3885 -+{
3886 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3887 -+ struct task_struct *tsk = current;
3888 -+ struct bfq_io_cq *bic;
3889 -+ struct bfq_queue *bfqq;
3890 -+
3891 -+ /*
3892 -+ * Don't force setup of a queue from here, as a call to may_queue
3893 -+ * does not necessarily imply that a request actually will be queued.
3894 -+ * So just look up a possibly existing queue, or return 'may queue'
3895 -+ * if that fails.
3896 -+ */
3897 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
3898 -+ if (bic == NULL)
3899 -+ return ELV_MQUEUE_MAY;
3900 -+
3901 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
3902 -+ if (bfqq != NULL) {
3903 -+ bfq_init_prio_data(bfqq, bic);
3904 -+
3905 -+ return __bfq_may_queue(bfqq);
3906 -+ }
3907 -+
3908 -+ return ELV_MQUEUE_MAY;
3909 -+}
3910 -+
3911 -+/*
3912 -+ * Queue lock held here.
3913 -+ */
3914 -+static void bfq_put_request(struct request *rq)
3915 -+{
3916 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
3917 -+
3918 -+ if (bfqq != NULL) {
3919 -+ const int rw = rq_data_dir(rq);
3920 -+
3921 -+ BUG_ON(!bfqq->allocated[rw]);
3922 -+ bfqq->allocated[rw]--;
3923 -+
3924 -+ rq->elv.priv[0] = NULL;
3925 -+ rq->elv.priv[1] = NULL;
3926 -+
3927 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
3928 -+ bfqq, atomic_read(&bfqq->ref));
3929 -+ bfq_put_queue(bfqq);
3930 -+ }
3931 -+}
3932 -+
3933 -+static struct bfq_queue *
3934 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
3935 -+ struct bfq_queue *bfqq)
3936 -+{
3937 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
3938 -+ (long unsigned)bfqq->new_bfqq->pid);
3939 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
3940 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
3941 -+ bfq_put_queue(bfqq);
3942 -+ return bic_to_bfqq(bic, 1);
3943 -+}
3944 -+
3945 -+/*
3946 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
3947 -+ * was the last process referring to said bfqq.
3948 -+ */
3949 -+static struct bfq_queue *
3950 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
3951 -+{
3952 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
3953 -+ if (bfqq_process_refs(bfqq) == 1) {
3954 -+ bfqq->pid = current->pid;
3955 -+ bfq_clear_bfqq_coop(bfqq);
3956 -+ bfq_clear_bfqq_split_coop(bfqq);
3957 -+ return bfqq;
3958 -+ }
3959 -+
3960 -+ bic_set_bfqq(bic, NULL, 1);
3961 -+
3962 -+ bfq_put_cooperator(bfqq);
3963 -+
3964 -+ bfq_put_queue(bfqq);
3965 -+ return NULL;
3966 -+}
3967 -+
3968 -+/*
3969 -+ * Allocate bfq data structures associated with this request.
3970 -+ */
3971 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
3972 -+ struct bio *bio, gfp_t gfp_mask)
3973 -+{
3974 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
3975 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
3976 -+ const int rw = rq_data_dir(rq);
3977 -+ const int is_sync = rq_is_sync(rq);
3978 -+ struct bfq_queue *bfqq;
3979 -+ struct bfq_group *bfqg;
3980 -+ unsigned long flags;
3981 -+
3982 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
3983 -+
3984 -+ bfq_changed_ioprio(bic);
3985 -+
3986 -+ spin_lock_irqsave(q->queue_lock, flags);
3987 -+
3988 -+ if (bic == NULL)
3989 -+ goto queue_fail;
3990 -+
3991 -+ bfqg = bfq_bic_update_cgroup(bic);
3992 -+
3993 -+new_queue:
3994 -+ bfqq = bic_to_bfqq(bic, is_sync);
3995 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
3996 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
3997 -+ bic_set_bfqq(bic, bfqq, is_sync);
3998 -+ } else {
3999 -+ /*
4000 -+ * If the queue was seeky for too long, break it apart.
4001 -+ */
4002 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
4003 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
4004 -+ bfqq = bfq_split_bfqq(bic, bfqq);
4005 -+ if (!bfqq)
4006 -+ goto new_queue;
4007 -+ }
4008 -+
4009 -+ /*
4010 -+ * Check to see if this queue is scheduled to merge with
4011 -+ * another closely cooperating queue. The merging of queues
4012 -+ * happens here as it must be done in process context.
4013 -+ * The reference on new_bfqq was taken in merge_bfqqs.
4014 -+ */
4015 -+ if (bfqq->new_bfqq != NULL)
4016 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
4017 -+ }
4018 -+
4019 -+ bfqq->allocated[rw]++;
4020 -+ atomic_inc(&bfqq->ref);
4021 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
4022 -+ atomic_read(&bfqq->ref));
4023 -+
4024 -+ rq->elv.priv[0] = bic;
4025 -+ rq->elv.priv[1] = bfqq;
4026 -+
4027 -+ spin_unlock_irqrestore(q->queue_lock, flags);
4028 -+
4029 -+ return 0;
4030 -+
4031 -+queue_fail:
4032 -+ bfq_schedule_dispatch(bfqd);
4033 -+ spin_unlock_irqrestore(q->queue_lock, flags);
4034 -+
4035 -+ return 1;
4036 -+}
4037 -+
4038 -+static void bfq_kick_queue(struct work_struct *work)
4039 -+{
4040 -+ struct bfq_data *bfqd =
4041 -+ container_of(work, struct bfq_data, unplug_work);
4042 -+ struct request_queue *q = bfqd->queue;
4043 -+
4044 -+ spin_lock_irq(q->queue_lock);
4045 -+ __blk_run_queue(q);
4046 -+ spin_unlock_irq(q->queue_lock);
4047 -+}
4048 -+
4049 -+/*
4050 -+ * Handler of the expiration of the timer running if the in-service queue
4051 -+ * is idling inside its time slice.
4052 -+ */
4053 -+static void bfq_idle_slice_timer(unsigned long data)
4054 -+{
4055 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
4056 -+ struct bfq_queue *bfqq;
4057 -+ unsigned long flags;
4058 -+ enum bfqq_expiration reason;
4059 -+
4060 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
4061 -+
4062 -+ bfqq = bfqd->in_service_queue;
4063 -+ /*
4064 -+ * Theoretical race here: the in-service queue can be NULL or different
4065 -+ * from the queue that was idling if the timer handler spins on
4066 -+ * the queue_lock and a new request arrives for the current
4067 -+ * queue and there is a full dispatch cycle that changes the
4068 -+ * in-service queue. This is unlikely to happen, but in the worst case
4069 -+ * we just expire a queue too early.
4070 -+ */
4071 -+ if (bfqq != NULL) {
4072 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
4073 -+ if (bfq_bfqq_budget_timeout(bfqq))
4074 -+ /*
4075 -+ * Also here the queue can be safely expired
4076 -+ * for budget timeout without wasting
4077 -+ * guarantees
4078 -+ */
4079 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
4080 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
4081 -+ /*
4082 -+ * The queue may not be empty upon timer expiration,
4083 -+ * because we may not disable the timer when the first
4084 -+ * request of the in-service queue arrives during
4085 -+ * disk idling
4086 -+ */
4087 -+ reason = BFQ_BFQQ_TOO_IDLE;
4088 -+ else
4089 -+ goto schedule_dispatch;
4090 -+
4091 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
4092 -+ }
4093 -+
4094 -+schedule_dispatch:
4095 -+ bfq_schedule_dispatch(bfqd);
4096 -+
4097 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
4098 -+}
4099 -+
4100 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
4101 -+{
4102 -+ del_timer_sync(&bfqd->idle_slice_timer);
4103 -+ cancel_work_sync(&bfqd->unplug_work);
4104 -+}
4105 -+
4106 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
4107 -+ struct bfq_queue **bfqq_ptr)
4108 -+{
4109 -+ struct bfq_group *root_group = bfqd->root_group;
4110 -+ struct bfq_queue *bfqq = *bfqq_ptr;
4111 -+
4112 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
4113 -+ if (bfqq != NULL) {
4114 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
4115 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
4116 -+ bfqq, atomic_read(&bfqq->ref));
4117 -+ bfq_put_queue(bfqq);
4118 -+ *bfqq_ptr = NULL;
4119 -+ }
4120 -+}
4121 -+
4122 -+/*
4123 -+ * Release all the bfqg references to its async queues. If we are
4124 -+ * deallocating the group, these queues may still contain requests, so
4125 -+ * we reparent them to the root cgroup (i.e., the only one that will
4126 -+ * exist for sure until all the requests on a device are gone).
4127 -+ */
4128 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
4129 -+{
4130 -+ int i, j;
4131 -+
4132 -+ for (i = 0; i < 2; i++)
4133 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
4134 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
4135 -+
4136 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
4137 -+}
4138 -+
4139 -+static void bfq_exit_queue(struct elevator_queue *e)
4140 -+{
4141 -+ struct bfq_data *bfqd = e->elevator_data;
4142 -+ struct request_queue *q = bfqd->queue;
4143 -+ struct bfq_queue *bfqq, *n;
4144 -+
4145 -+ bfq_shutdown_timer_wq(bfqd);
4146 -+
4147 -+ spin_lock_irq(q->queue_lock);
4148 -+
4149 -+ BUG_ON(bfqd->in_service_queue != NULL);
4150 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
4151 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
4152 -+
4153 -+ bfq_disconnect_groups(bfqd);
4154 -+ spin_unlock_irq(q->queue_lock);
4155 -+
4156 -+ bfq_shutdown_timer_wq(bfqd);
4157 -+
4158 -+ synchronize_rcu();
4159 -+
4160 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
4161 -+
4162 -+ bfq_free_root_group(bfqd);
4163 -+ kfree(bfqd);
4164 -+}
4165 -+
4166 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
4167 -+{
4168 -+ struct bfq_group *bfqg;
4169 -+ struct bfq_data *bfqd;
4170 -+ struct elevator_queue *eq;
4171 -+
4172 -+ eq = elevator_alloc(q, e);
4173 -+ if (eq == NULL)
4174 -+ return -ENOMEM;
4175 -+
4176 -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
4177 -+ if (bfqd == NULL) {
4178 -+ kobject_put(&eq->kobj);
4179 -+ return -ENOMEM;
4180 -+ }
4181 -+ eq->elevator_data = bfqd;
4182 -+
4183 -+ /*
4184 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
4185 -+ * Grab a permanent reference to it, so that the normal code flow
4186 -+ * will not attempt to free it.
4187 -+ */
4188 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
4189 -+ atomic_inc(&bfqd->oom_bfqq.ref);
4190 -+
4191 -+ bfqd->queue = q;
4192 -+
4193 -+ spin_lock_irq(q->queue_lock);
4194 -+ q->elevator = eq;
4195 -+ spin_unlock_irq(q->queue_lock);
4196 -+
4197 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
4198 -+ if (bfqg == NULL) {
4199 -+ kfree(bfqd);
4200 -+ kobject_put(&eq->kobj);
4201 -+ return -ENOMEM;
4202 -+ }
4203 -+
4204 -+ bfqd->root_group = bfqg;
4205 -+
4206 -+ init_timer(&bfqd->idle_slice_timer);
4207 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
4208 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
4209 -+
4210 -+ bfqd->rq_pos_tree = RB_ROOT;
4211 -+
4212 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
4213 -+
4214 -+ INIT_LIST_HEAD(&bfqd->active_list);
4215 -+ INIT_LIST_HEAD(&bfqd->idle_list);
4216 -+
4217 -+ bfqd->hw_tag = -1;
4218 -+
4219 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
4220 -+
4221 -+ bfqd->bfq_quantum = bfq_quantum;
4222 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
4223 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
4224 -+ bfqd->bfq_back_max = bfq_back_max;
4225 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
4226 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
4227 -+ bfqd->bfq_class_idle_last_service = 0;
4228 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
4229 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
4230 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
4231 -+
4232 -+ bfqd->low_latency = true;
4233 -+
4234 -+ bfqd->bfq_raising_coeff = 20;
4235 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
4236 -+ bfqd->bfq_raising_max_time = 0;
4237 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
4238 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
4239 -+ bfqd->bfq_raising_max_softrt_rate = 7000; /*
4240 -+ * Approximate rate required
4241 -+ * to playback or record a
4242 -+ * high-definition compressed
4243 -+ * video.
4244 -+ */
4245 -+ bfqd->raised_busy_queues = 0;
4246 -+
4247 -+ /* Initially estimate the device's peak rate as the reference rate */
4248 -+ if (blk_queue_nonrot(bfqd->queue)) {
4249 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
4250 -+ bfqd->peak_rate = R_nonrot;
4251 -+ } else {
4252 -+ bfqd->RT_prod = R_rot * T_rot;
4253 -+ bfqd->peak_rate = R_rot;
4254 -+ }
4255 -+
4256 -+ return 0;
4257 -+}
4258 -+
4259 -+static void bfq_slab_kill(void)
4260 -+{
4261 -+ if (bfq_pool != NULL)
4262 -+ kmem_cache_destroy(bfq_pool);
4263 -+}
4264 -+
4265 -+static int __init bfq_slab_setup(void)
4266 -+{
4267 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
4268 -+ if (bfq_pool == NULL)
4269 -+ return -ENOMEM;
4270 -+ return 0;
4271 -+}
4272 -+
4273 -+static ssize_t bfq_var_show(unsigned int var, char *page)
4274 -+{
4275 -+ return sprintf(page, "%d\n", var);
4276 -+}
4277 -+
4278 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
4279 -+{
4280 -+ unsigned long new_val;
4281 -+ int ret = kstrtoul(page, 10, &new_val);
4282 -+
4283 -+ if (ret == 0)
4284 -+ *var = new_val;
4285 -+
4286 -+ return count;
4287 -+}
4288 -+
4289 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
4290 -+{
4291 -+ struct bfq_data *bfqd = e->elevator_data;
4292 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
4293 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
4294 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
4295 -+}
4296 -+
4297 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
4298 -+{
4299 -+ struct bfq_queue *bfqq;
4300 -+ struct bfq_data *bfqd = e->elevator_data;
4301 -+ ssize_t num_char = 0;
4302 -+
4303 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
4304 -+ bfqd->queued);
4305 -+
4306 -+ spin_lock_irq(bfqd->queue->queue_lock);
4307 -+
4308 -+ num_char += sprintf(page + num_char, "Active:\n");
4309 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
4310 -+ num_char += sprintf(page + num_char,
4311 -+ "pid%d: weight %hu, nr_queued %d %d,"
4312 -+ " dur %d/%u\n",
4313 -+ bfqq->pid,
4314 -+ bfqq->entity.weight,
4315 -+ bfqq->queued[0],
4316 -+ bfqq->queued[1],
4317 -+ jiffies_to_msecs(jiffies -
4318 -+ bfqq->last_rais_start_finish),
4319 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4320 -+ }
4321 -+
4322 -+ num_char += sprintf(page + num_char, "Idle:\n");
4323 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
4324 -+ num_char += sprintf(page + num_char,
4325 -+ "pid%d: weight %hu, dur %d/%u\n",
4326 -+ bfqq->pid,
4327 -+ bfqq->entity.weight,
4328 -+ jiffies_to_msecs(jiffies -
4329 -+ bfqq->last_rais_start_finish),
4330 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
4331 -+ }
4332 -+
4333 -+ spin_unlock_irq(bfqd->queue->queue_lock);
4334 -+
4335 -+ return num_char;
4336 -+}
4337 -+
4338 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
4339 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
4340 -+{ \
4341 -+ struct bfq_data *bfqd = e->elevator_data; \
4342 -+ unsigned int __data = __VAR; \
4343 -+ if (__CONV) \
4344 -+ __data = jiffies_to_msecs(__data); \
4345 -+ return bfq_var_show(__data, (page)); \
4346 -+}
4347 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
4348 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
4349 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
4350 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
4351 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
4352 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
4353 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
4354 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
4355 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
4356 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
4357 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
4358 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
4359 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
4360 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
4361 -+ 1);
4362 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
4363 -+ bfqd->bfq_raising_min_inter_arr_async,
4364 -+ 1);
4365 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
4366 -+ bfqd->bfq_raising_max_softrt_rate, 0);
4367 -+#undef SHOW_FUNCTION
4368 -+
4369 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
4370 -+static ssize_t \
4371 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
4372 -+{ \
4373 -+ struct bfq_data *bfqd = e->elevator_data; \
4374 -+ unsigned long uninitialized_var(__data); \
4375 -+ int ret = bfq_var_store(&__data, (page), count); \
4376 -+ if (__data < (MIN)) \
4377 -+ __data = (MIN); \
4378 -+ else if (__data > (MAX)) \
4379 -+ __data = (MAX); \
4380 -+ if (__CONV) \
4381 -+ *(__PTR) = msecs_to_jiffies(__data); \
4382 -+ else \
4383 -+ *(__PTR) = __data; \
4384 -+ return ret; \
4385 -+}
4386 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
4387 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
4388 -+ INT_MAX, 1);
4389 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
4390 -+ INT_MAX, 1);
4391 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
4392 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
4393 -+ INT_MAX, 0);
4394 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
4395 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
4396 -+ 1, INT_MAX, 0);
4397 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
4398 -+ INT_MAX, 1);
4399 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
4400 -+ INT_MAX, 0);
4401 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
4402 -+ INT_MAX, 1);
4403 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
4404 -+ INT_MAX, 1);
4405 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
4406 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
4407 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
4408 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
4409 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
4410 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
4411 -+#undef STORE_FUNCTION
4412 -+
4413 -+/* do nothing for the moment */
4414 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
4415 -+ const char *page, size_t count)
4416 -+{
4417 -+ return count;
4418 -+}
4419 -+
4420 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
4421 -+{
4422 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
4423 -+
4424 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
4425 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
4426 -+ else
4427 -+ return bfq_default_max_budget;
4428 -+}
4429 -+
4430 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
4431 -+ const char *page, size_t count)
4432 -+{
4433 -+ struct bfq_data *bfqd = e->elevator_data;
4434 -+ unsigned long uninitialized_var(__data);
4435 -+ int ret = bfq_var_store(&__data, (page), count);
4436 -+
4437 -+ if (__data == 0)
4438 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4439 -+ else {
4440 -+ if (__data > INT_MAX)
4441 -+ __data = INT_MAX;
4442 -+ bfqd->bfq_max_budget = __data;
4443 -+ }
4444 -+
4445 -+ bfqd->bfq_user_max_budget = __data;
4446 -+
4447 -+ return ret;
4448 -+}
4449 -+
4450 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
4451 -+ const char *page, size_t count)
4452 -+{
4453 -+ struct bfq_data *bfqd = e->elevator_data;
4454 -+ unsigned long uninitialized_var(__data);
4455 -+ int ret = bfq_var_store(&__data, (page), count);
4456 -+
4457 -+ if (__data < 1)
4458 -+ __data = 1;
4459 -+ else if (__data > INT_MAX)
4460 -+ __data = INT_MAX;
4461 -+
4462 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
4463 -+ if (bfqd->bfq_user_max_budget == 0)
4464 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
4465 -+
4466 -+ return ret;
4467 -+}
4468 -+
4469 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
4470 -+ const char *page, size_t count)
4471 -+{
4472 -+ struct bfq_data *bfqd = e->elevator_data;
4473 -+ unsigned long uninitialized_var(__data);
4474 -+ int ret = bfq_var_store(&__data, (page), count);
4475 -+
4476 -+ if (__data > 1)
4477 -+ __data = 1;
4478 -+ if (__data == 0 && bfqd->low_latency != 0)
4479 -+ bfq_end_raising(bfqd);
4480 -+ bfqd->low_latency = __data;
4481 -+
4482 -+ return ret;
4483 -+}
4484 -+
4485 -+#define BFQ_ATTR(name) \
4486 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
4487 -+
4488 -+static struct elv_fs_entry bfq_attrs[] = {
4489 -+ BFQ_ATTR(quantum),
4490 -+ BFQ_ATTR(fifo_expire_sync),
4491 -+ BFQ_ATTR(fifo_expire_async),
4492 -+ BFQ_ATTR(back_seek_max),
4493 -+ BFQ_ATTR(back_seek_penalty),
4494 -+ BFQ_ATTR(slice_idle),
4495 -+ BFQ_ATTR(max_budget),
4496 -+ BFQ_ATTR(max_budget_async_rq),
4497 -+ BFQ_ATTR(timeout_sync),
4498 -+ BFQ_ATTR(timeout_async),
4499 -+ BFQ_ATTR(low_latency),
4500 -+ BFQ_ATTR(raising_coeff),
4501 -+ BFQ_ATTR(raising_max_time),
4502 -+ BFQ_ATTR(raising_rt_max_time),
4503 -+ BFQ_ATTR(raising_min_idle_time),
4504 -+ BFQ_ATTR(raising_min_inter_arr_async),
4505 -+ BFQ_ATTR(raising_max_softrt_rate),
4506 -+ BFQ_ATTR(weights),
4507 -+ __ATTR_NULL
4508 -+};
4509 -+
4510 -+static struct elevator_type iosched_bfq = {
4511 -+ .ops = {
4512 -+ .elevator_merge_fn = bfq_merge,
4513 -+ .elevator_merged_fn = bfq_merged_request,
4514 -+ .elevator_merge_req_fn = bfq_merged_requests,
4515 -+ .elevator_allow_merge_fn = bfq_allow_merge,
4516 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
4517 -+ .elevator_add_req_fn = bfq_insert_request,
4518 -+ .elevator_activate_req_fn = bfq_activate_request,
4519 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
4520 -+ .elevator_completed_req_fn = bfq_completed_request,
4521 -+ .elevator_former_req_fn = elv_rb_former_request,
4522 -+ .elevator_latter_req_fn = elv_rb_latter_request,
4523 -+ .elevator_init_icq_fn = bfq_init_icq,
4524 -+ .elevator_exit_icq_fn = bfq_exit_icq,
4525 -+ .elevator_set_req_fn = bfq_set_request,
4526 -+ .elevator_put_req_fn = bfq_put_request,
4527 -+ .elevator_may_queue_fn = bfq_may_queue,
4528 -+ .elevator_init_fn = bfq_init_queue,
4529 -+ .elevator_exit_fn = bfq_exit_queue,
4530 -+ },
4531 -+ .icq_size = sizeof(struct bfq_io_cq),
4532 -+ .icq_align = __alignof__(struct bfq_io_cq),
4533 -+ .elevator_attrs = bfq_attrs,
4534 -+ .elevator_name = "bfq",
4535 -+ .elevator_owner = THIS_MODULE,
4536 -+};
4537 -+
4538 -+static int __init bfq_init(void)
4539 -+{
4540 -+ /*
4541 -+ * Can be 0 on HZ < 1000 setups.
4542 -+ */
4543 -+ if (bfq_slice_idle == 0)
4544 -+ bfq_slice_idle = 1;
4545 -+
4546 -+ if (bfq_timeout_async == 0)
4547 -+ bfq_timeout_async = 1;
4548 -+
4549 -+ if (bfq_slab_setup())
4550 -+ return -ENOMEM;
4551 -+
4552 -+ elv_register(&iosched_bfq);
4553 -+ printk(KERN_INFO "BFQ I/O-scheduler version: v7\n");
4554 -+
4555 -+ return 0;
4556 -+}
4557 -+
4558 -+static void __exit bfq_exit(void)
4559 -+{
4560 -+ elv_unregister(&iosched_bfq);
4561 -+ bfq_slab_kill();
4562 -+}
4563 -+
4564 -+module_init(bfq_init);
4565 -+module_exit(bfq_exit);
4566 -+
4567 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
4568 -+MODULE_LICENSE("GPL");
4569 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
4570 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
4571 -new file mode 100644
4572 -index 0000000..30df81c
4573 ---- /dev/null
4574 -+++ b/block/bfq-sched.c
4575 -@@ -0,0 +1,1077 @@
4576 -+/*
4577 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
4578 -+ *
4579 -+ * Based on ideas and code from CFQ:
4580 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
4581 -+ *
4582 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
4583 -+ * Paolo Valente <paolo.valente@×××××××.it>
4584 -+ *
4585 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
4586 -+ */
4587 -+
4588 -+#ifdef CONFIG_CGROUP_BFQIO
4589 -+#define for_each_entity(entity) \
4590 -+ for (; entity != NULL; entity = entity->parent)
4591 -+
4592 -+#define for_each_entity_safe(entity, parent) \
4593 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
4594 -+
4595 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
4596 -+ int extract,
4597 -+ struct bfq_data *bfqd);
4598 -+
4599 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4600 -+{
4601 -+ struct bfq_entity *bfqg_entity;
4602 -+ struct bfq_group *bfqg;
4603 -+ struct bfq_sched_data *group_sd;
4604 -+
4605 -+ BUG_ON(next_active == NULL);
4606 -+
4607 -+ group_sd = next_active->sched_data;
4608 -+
4609 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
4610 -+ /*
4611 -+ * bfq_group's my_entity field is not NULL only if the group
4612 -+ * is not the root group. We must not touch the root entity
4613 -+ * as it must never become an active entity.
4614 -+ */
4615 -+ bfqg_entity = bfqg->my_entity;
4616 -+ if (bfqg_entity != NULL)
4617 -+ bfqg_entity->budget = next_active->budget;
4618 -+}
4619 -+
4620 -+static int bfq_update_next_active(struct bfq_sched_data *sd)
4621 -+{
4622 -+ struct bfq_entity *next_active;
4623 -+
4624 -+ if (sd->active_entity != NULL)
4625 -+ /* will update/requeue at the end of service */
4626 -+ return 0;
4627 -+
4628 -+ /*
4629 -+ * NOTE: this can be improved in many ways, such as returning
4630 -+ * 1 (and thus propagating upwards the update) only when the
4631 -+ * budget changes, or caching the bfqq that will be scheduled
4632 -+ * next from this subtree. By now we worry more about
4633 -+ * next from this subtree. For now we worry more about
4634 -+ */
4635 -+ next_active = bfq_lookup_next_entity(sd, 0, NULL);
4636 -+ sd->next_active = next_active;
4637 -+
4638 -+ if (next_active != NULL)
4639 -+ bfq_update_budget(next_active);
4640 -+
4641 -+ return 1;
4642 -+}
4643 -+
4644 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4645 -+ struct bfq_entity *entity)
4646 -+{
4647 -+ BUG_ON(sd->next_active != entity);
4648 -+}
4649 -+#else
4650 -+#define for_each_entity(entity) \
4651 -+ for (; entity != NULL; entity = NULL)
4652 -+
4653 -+#define for_each_entity_safe(entity, parent) \
4654 -+ for (parent = NULL; entity != NULL; entity = parent)
4655 -+
4656 -+static inline int bfq_update_next_active(struct bfq_sched_data *sd)
4657 -+{
4658 -+ return 0;
4659 -+}
4660 -+
4661 -+static inline void bfq_check_next_active(struct bfq_sched_data *sd,
4662 -+ struct bfq_entity *entity)
4663 -+{
4664 -+}
4665 -+
4666 -+static inline void bfq_update_budget(struct bfq_entity *next_active)
4667 -+{
4668 -+}
4669 -+#endif
4670 -+
4671 -+/*
4672 -+ * Shift for timestamp calculations. This actually limits the maximum
4673 -+ * service allowed in one timestamp delta (small shift values increase it),
4674 -+ * the maximum total weight that can be used for the queues in the system
4675 -+ * (big shift values increase it), and the period of virtual time wraparounds.
4676 -+ */
4677 -+#define WFQ_SERVICE_SHIFT 22
4678 -+
4679 -+/**
4680 -+ * bfq_gt - compare two timestamps.
4681 -+ * @a: first ts.
4682 -+ * @b: second ts.
4683 -+ *
4684 -+ * Return @a > @b, dealing with wrapping correctly.
4685 -+ */
4686 -+static inline int bfq_gt(u64 a, u64 b)
4687 -+{
4688 -+ return (s64)(a - b) > 0;
4689 -+}
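
bfq_gt() uses the standard wrap-safe comparison: subtracting two unsigned 64-bit timestamps and interpreting the difference as signed gives the correct ordering as long as the two values are less than 2^63 apart, even across a wraparound of the virtual clock. A small standalone illustration (the values below are made up, not taken from the patch):

  #include <stdint.h>
  #include <stdio.h>

  /* Same test as bfq_gt(): true if a is later than b, wrap-safe. */
  static int ts_after(uint64_t a, uint64_t b)
  {
          return (int64_t)(a - b) > 0;
  }

  int main(void)
  {
          uint64_t before_wrap = UINT64_MAX - 5;  /* just before the clock wraps */
          uint64_t after_wrap  = 10;              /* shortly after the wrap */

          /* A naive "after_wrap > before_wrap" is false; the signed
           * difference still orders the two timestamps correctly. */
          printf("after_wrap later? %d\n", ts_after(after_wrap, before_wrap));
          return 0;
  }
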
4690 -+
4691 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
4692 -+{
4693 -+ struct bfq_queue *bfqq = NULL;
4694 -+
4695 -+ BUG_ON(entity == NULL);
4696 -+
4697 -+ if (entity->my_sched_data == NULL)
4698 -+ bfqq = container_of(entity, struct bfq_queue, entity);
4699 -+
4700 -+ return bfqq;
4701 -+}
4702 -+
4703 -+
4704 -+/**
4705 -+ * bfq_delta - map service into the virtual time domain.
4706 -+ * @service: amount of service.
4707 -+ * @weight: scale factor (weight of an entity or weight sum).
4708 -+ */
4709 -+static inline u64 bfq_delta(unsigned long service,
4710 -+ unsigned long weight)
4711 -+{
4712 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
4713 -+
4714 -+ do_div(d, weight);
4715 -+ return d;
4716 -+}
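
bfq_delta() above scales the service by 2^WFQ_SERVICE_SHIFT before dividing by the weight, so the integer division keeps 22 fractional bits and a heavier entity accumulates virtual time more slowly for the same amount of service. A rough user-space counterpart, with made-up numbers purely for illustration:

  #include <stdint.h>
  #include <stdio.h>

  #define WFQ_SERVICE_SHIFT 22

  /* User-space mirror of bfq_delta(); the kernel uses do_div() instead. */
  static uint64_t vtime_delta(unsigned long service, unsigned long weight)
  {
          return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
  }

  int main(void)
  {
          /* Equal service charged to two weights: the entity with twice the
           * weight sees half the virtual-time advance. */
          printf("weight 100: %llu\n",
                 (unsigned long long)vtime_delta(8192, 100));
          printf("weight 200: %llu\n",
                 (unsigned long long)vtime_delta(8192, 200));
          return 0;
  }
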
4717 -+
4718 -+/**
4719 -+ * bfq_calc_finish - assign the finish time to an entity.
4720 -+ * @entity: the entity to act upon.
4721 -+ * @service: the service to be charged to the entity.
4722 -+ */
4723 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
4724 -+ unsigned long service)
4725 -+{
4726 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4727 -+
4728 -+ BUG_ON(entity->weight == 0);
4729 -+
4730 -+ entity->finish = entity->start +
4731 -+ bfq_delta(service, entity->weight);
4732 -+
4733 -+ if (bfqq != NULL) {
4734 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4735 -+ "calc_finish: serv %lu, w %d",
4736 -+ service, entity->weight);
4737 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
4738 -+ "calc_finish: start %llu, finish %llu, delta %llu",
4739 -+ entity->start, entity->finish,
4740 -+ bfq_delta(service, entity->weight));
4741 -+ }
4742 -+}
4743 -+
4744 -+/**
4745 -+ * bfq_entity_of - get an entity from a node.
4746 -+ * @node: the node field of the entity.
4747 -+ *
4748 -+ * Convert a node pointer to the relative entity. This is used only
4749 -+ * to simplify the logic of some functions and not as the generic
4750 -+ * conversion mechanism because, e.g., in the tree walking functions,
4751 -+ * the check for a %NULL value would be redundant.
4752 -+ */
4753 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
4754 -+{
4755 -+ struct bfq_entity *entity = NULL;
4756 -+
4757 -+ if (node != NULL)
4758 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
4759 -+
4760 -+ return entity;
4761 -+}
4762 -+
4763 -+/**
4764 -+ * bfq_extract - remove an entity from a tree.
4765 -+ * @root: the tree root.
4766 -+ * @entity: the entity to remove.
4767 -+ */
4768 -+static inline void bfq_extract(struct rb_root *root,
4769 -+ struct bfq_entity *entity)
4770 -+{
4771 -+ BUG_ON(entity->tree != root);
4772 -+
4773 -+ entity->tree = NULL;
4774 -+ rb_erase(&entity->rb_node, root);
4775 -+}
4776 -+
4777 -+/**
4778 -+ * bfq_idle_extract - extract an entity from the idle tree.
4779 -+ * @st: the service tree of the owning @entity.
4780 -+ * @entity: the entity being removed.
4781 -+ */
4782 -+static void bfq_idle_extract(struct bfq_service_tree *st,
4783 -+ struct bfq_entity *entity)
4784 -+{
4785 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4786 -+ struct rb_node *next;
4787 -+
4788 -+ BUG_ON(entity->tree != &st->idle);
4789 -+
4790 -+ if (entity == st->first_idle) {
4791 -+ next = rb_next(&entity->rb_node);
4792 -+ st->first_idle = bfq_entity_of(next);
4793 -+ }
4794 -+
4795 -+ if (entity == st->last_idle) {
4796 -+ next = rb_prev(&entity->rb_node);
4797 -+ st->last_idle = bfq_entity_of(next);
4798 -+ }
4799 -+
4800 -+ bfq_extract(&st->idle, entity);
4801 -+
4802 -+ if (bfqq != NULL)
4803 -+ list_del(&bfqq->bfqq_list);
4804 -+}
4805 -+
4806 -+/**
4807 -+ * bfq_insert - generic tree insertion.
4808 -+ * @root: tree root.
4809 -+ * @entity: entity to insert.
4810 -+ *
4811 -+ * This is used for the idle and the active tree, since they are both
4812 -+ * ordered by finish time.
4813 -+ */
4814 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
4815 -+{
4816 -+ struct bfq_entity *entry;
4817 -+ struct rb_node **node = &root->rb_node;
4818 -+ struct rb_node *parent = NULL;
4819 -+
4820 -+ BUG_ON(entity->tree != NULL);
4821 -+
4822 -+ while (*node != NULL) {
4823 -+ parent = *node;
4824 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
4825 -+
4826 -+ if (bfq_gt(entry->finish, entity->finish))
4827 -+ node = &parent->rb_left;
4828 -+ else
4829 -+ node = &parent->rb_right;
4830 -+ }
4831 -+
4832 -+ rb_link_node(&entity->rb_node, parent, node);
4833 -+ rb_insert_color(&entity->rb_node, root);
4834 -+
4835 -+ entity->tree = root;
4836 -+}
4837 -+
4838 -+/**
4839 -+ * bfq_update_min - update the min_start field of an entity.
4840 -+ * @entity: the entity to update.
4841 -+ * @node: one of its children.
4842 -+ *
4843 -+ * This function is called when @entity may store an invalid value for
4844 -+ * min_start due to updates to the active tree. The function assumes
4845 -+ * that the subtree rooted at @node (which may be its left or its right
4846 -+ * child) has a valid min_start value.
4847 -+ */
4848 -+static inline void bfq_update_min(struct bfq_entity *entity,
4849 -+ struct rb_node *node)
4850 -+{
4851 -+ struct bfq_entity *child;
4852 -+
4853 -+ if (node != NULL) {
4854 -+ child = rb_entry(node, struct bfq_entity, rb_node);
4855 -+ if (bfq_gt(entity->min_start, child->min_start))
4856 -+ entity->min_start = child->min_start;
4857 -+ }
4858 -+}
4859 -+
4860 -+/**
4861 -+ * bfq_update_active_node - recalculate min_start.
4862 -+ * @node: the node to update.
4863 -+ *
4864 -+ * @node may have changed position or one of its children may have moved;
4865 -+ * this function updates its min_start value. The left and right subtrees
4866 -+ * are assumed to hold a correct min_start value.
4867 -+ */
4868 -+static inline void bfq_update_active_node(struct rb_node *node)
4869 -+{
4870 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
4871 -+
4872 -+ entity->min_start = entity->start;
4873 -+ bfq_update_min(entity, node->rb_right);
4874 -+ bfq_update_min(entity, node->rb_left);
4875 -+}
4876 -+
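For reference, the invariant maintained above can be sketched outside the rb-tree machinery roughly as follows; the node type is a simplified, hypothetical stand-in, and plain < replaces the bfq_gt() timestamp comparison used by the real code:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for a bfq_entity linked into the active rb-tree. */
struct example_node {
	uint64_t start;
	uint64_t min_start;
	struct example_node *left, *right;
};

/* min_start = min(start, left->min_start, right->min_start), as above. */
static void example_update_min_start(struct example_node *n)
{
	n->min_start = n->start;
	if (n->left && n->left->min_start < n->min_start)
		n->min_start = n->left->min_start;
	if (n->right && n->right->min_start < n->min_start)
		n->min_start = n->right->min_start;
}

int main(void)
{
	struct example_node l = { .start = 5, .min_start = 5 };
	struct example_node r = { .start = 9, .min_start = 9 };
	struct example_node root = { .start = 7, .left = &l, .right = &r };

	example_update_min_start(&root);
	printf("root min_start = %llu\n",	/* prints 5 */
	       (unsigned long long)root.min_start);
	return 0;
}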
4877 -+/**
4878 -+ * bfq_update_active_tree - update min_start for the whole active tree.
4879 -+ * @node: the starting node.
4880 -+ *
4881 -+ * @node must be the deepest modified node after an update. This function
4882 -+ * updates its min_start using the values held by its children, assuming
4883 -+ * that they did not change, and then updates all the nodes that may have
4884 -+ * changed in the path to the root. The only nodes that may have changed
4885 -+ * are the ones in the path or their siblings.
4886 -+ */
4887 -+static void bfq_update_active_tree(struct rb_node *node)
4888 -+{
4889 -+ struct rb_node *parent;
4890 -+
4891 -+up:
4892 -+ bfq_update_active_node(node);
4893 -+
4894 -+ parent = rb_parent(node);
4895 -+ if (parent == NULL)
4896 -+ return;
4897 -+
4898 -+ if (node == parent->rb_left && parent->rb_right != NULL)
4899 -+ bfq_update_active_node(parent->rb_right);
4900 -+ else if (parent->rb_left != NULL)
4901 -+ bfq_update_active_node(parent->rb_left);
4902 -+
4903 -+ node = parent;
4904 -+ goto up;
4905 -+}
4906 -+
4907 -+/**
4908 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
4909 -+ * @st: the service tree of the entity.
4910 -+ * @entity: the entity being inserted.
4911 -+ *
4912 -+ * The active tree is ordered by finish time, but an extra key is kept
4913 -+ * for each node, containing the minimum value for the start times of
4914 -+ * its children (and the node itself), so it's possible to search for
4915 -+ * the eligible node with the lowest finish time in logarithmic time.
4916 -+ */
4917 -+static void bfq_active_insert(struct bfq_service_tree *st,
4918 -+ struct bfq_entity *entity)
4919 -+{
4920 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4921 -+ struct rb_node *node = &entity->rb_node;
4922 -+
4923 -+ bfq_insert(&st->active, entity);
4924 -+
4925 -+ if (node->rb_left != NULL)
4926 -+ node = node->rb_left;
4927 -+ else if (node->rb_right != NULL)
4928 -+ node = node->rb_right;
4929 -+
4930 -+ bfq_update_active_tree(node);
4931 -+
4932 -+ if (bfqq != NULL)
4933 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
4934 -+}
4935 -+
4936 -+/**
4937 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
4938 -+ * @ioprio: the ioprio value to convert.
4939 -+ */
4940 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
4941 -+{
4942 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
4943 -+ return IOPRIO_BE_NR - ioprio;
4944 -+}
4945 -+
4946 -+/**
4947 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
4948 -+ * @weight: the weight value to convert.
4949 -+ *
4950 -+ * To preserve as much as possible the old ioprio-only user interface,
4951 -+ * 0 is used as an escape ioprio value for weights (numerically) equal to
4952 -+ * or larger than IOPRIO_BE_NR.
4953 -+ */
4954 -+static unsigned short bfq_weight_to_ioprio(int weight)
4955 -+{
4956 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
4957 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
4958 -+}
4959 -+
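For reference, with IOPRIO_BE_NR assumed to be 8 (its usual value in kernels of this era), the two conversions above behave as in this small userspace sketch:

#include <stdio.h>

#define EXAMPLE_IOPRIO_BE_NR 8	/* assumed to match IOPRIO_BE_NR */

static unsigned short example_ioprio_to_weight(int ioprio)
{
	return EXAMPLE_IOPRIO_BE_NR - ioprio;	/* ioprio 0..7 -> weight 8..1 */
}

static unsigned short example_weight_to_ioprio(int weight)
{
	/* Weights >= EXAMPLE_IOPRIO_BE_NR collapse to the escape ioprio 0. */
	int ioprio = EXAMPLE_IOPRIO_BE_NR - weight;

	return ioprio < 0 ? 0 : ioprio;
}

int main(void)
{
	printf("ioprio 0 -> weight %u\n", example_ioprio_to_weight(0));	/* 8 */
	printf("ioprio 7 -> weight %u\n", example_ioprio_to_weight(7));	/* 1 */
	printf("weight 100 -> ioprio %u\n", example_weight_to_ioprio(100));	/* 0 */
	return 0;
}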
4960 -+static inline void bfq_get_entity(struct bfq_entity *entity)
4961 -+{
4962 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
4963 -+ struct bfq_sched_data *sd;
4964 -+
4965 -+ if (bfqq != NULL) {
4966 -+ sd = entity->sched_data;
4967 -+ atomic_inc(&bfqq->ref);
4968 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
4969 -+ bfqq, atomic_read(&bfqq->ref));
4970 -+ }
4971 -+}
4972 -+
4973 -+/**
4974 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
4975 -+ * @node: the node being removed.
4976 -+ *
4977 -+ * Do the first step of an extraction in an rb tree, looking for the
4978 -+ * node that will replace @node, and returning the deepest node that
4979 -+ * the following modifications to the tree can touch. If @node is the
4980 -+ * last node in the tree return %NULL.
4981 -+ */
4982 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
4983 -+{
4984 -+ struct rb_node *deepest;
4985 -+
4986 -+ if (node->rb_right == NULL && node->rb_left == NULL)
4987 -+ deepest = rb_parent(node);
4988 -+ else if (node->rb_right == NULL)
4989 -+ deepest = node->rb_left;
4990 -+ else if (node->rb_left == NULL)
4991 -+ deepest = node->rb_right;
4992 -+ else {
4993 -+ deepest = rb_next(node);
4994 -+ if (deepest->rb_right != NULL)
4995 -+ deepest = deepest->rb_right;
4996 -+ else if (rb_parent(deepest) != node)
4997 -+ deepest = rb_parent(deepest);
4998 -+ }
4999 -+
5000 -+ return deepest;
5001 -+}
5002 -+
5003 -+/**
5004 -+ * bfq_active_extract - remove an entity from the active tree.
5005 -+ * @st: the service_tree containing the tree.
5006 -+ * @entity: the entity being removed.
5007 -+ */
5008 -+static void bfq_active_extract(struct bfq_service_tree *st,
5009 -+ struct bfq_entity *entity)
5010 -+{
5011 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5012 -+ struct rb_node *node;
5013 -+
5014 -+ node = bfq_find_deepest(&entity->rb_node);
5015 -+ bfq_extract(&st->active, entity);
5016 -+
5017 -+ if (node != NULL)
5018 -+ bfq_update_active_tree(node);
5019 -+
5020 -+ if (bfqq != NULL)
5021 -+ list_del(&bfqq->bfqq_list);
5022 -+}
5023 -+
5024 -+/**
5025 -+ * bfq_idle_insert - insert an entity into the idle tree.
5026 -+ * @st: the service tree containing the tree.
5027 -+ * @entity: the entity to insert.
5028 -+ */
5029 -+static void bfq_idle_insert(struct bfq_service_tree *st,
5030 -+ struct bfq_entity *entity)
5031 -+{
5032 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5033 -+ struct bfq_entity *first_idle = st->first_idle;
5034 -+ struct bfq_entity *last_idle = st->last_idle;
5035 -+
5036 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
5037 -+ st->first_idle = entity;
5038 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
5039 -+ st->last_idle = entity;
5040 -+
5041 -+ bfq_insert(&st->idle, entity);
5042 -+
5043 -+ if (bfqq != NULL)
5044 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
5045 -+}
5046 -+
5047 -+/**
5048 -+ * bfq_forget_entity - remove an entity from the wfq trees.
5049 -+ * @st: the service tree.
5050 -+ * @entity: the entity being removed.
5051 -+ *
5052 -+ * Update the device status and forget everything about @entity, putting
5053 -+ * the device reference to it, if it is a queue. Entities belonging to
5054 -+ * groups are not refcounted.
5055 -+ */
5056 -+static void bfq_forget_entity(struct bfq_service_tree *st,
5057 -+ struct bfq_entity *entity)
5058 -+{
5059 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5060 -+ struct bfq_sched_data *sd;
5061 -+
5062 -+ BUG_ON(!entity->on_st);
5063 -+
5064 -+ entity->on_st = 0;
5065 -+ st->wsum -= entity->weight;
5066 -+ if (bfqq != NULL) {
5067 -+ sd = entity->sched_data;
5068 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
5069 -+ bfqq, atomic_read(&bfqq->ref));
5070 -+ bfq_put_queue(bfqq);
5071 -+ }
5072 -+}
5073 -+
5074 -+/**
5075 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
5076 -+ * @st: service tree for the entity.
5077 -+ * @entity: the entity being released.
5078 -+ */
5079 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
5080 -+ struct bfq_entity *entity)
5081 -+{
5082 -+ bfq_idle_extract(st, entity);
5083 -+ bfq_forget_entity(st, entity);
5084 -+}
5085 -+
5086 -+/**
5087 -+ * bfq_forget_idle - update the idle tree if necessary.
5088 -+ * @st: the service tree to act upon.
5089 -+ *
5090 -+ * To preserve the global O(log N) complexity we only remove one entry here;
5091 -+ * as the idle tree will not grow indefinitely this can be done safely.
5092 -+ */
5093 -+static void bfq_forget_idle(struct bfq_service_tree *st)
5094 -+{
5095 -+ struct bfq_entity *first_idle = st->first_idle;
5096 -+ struct bfq_entity *last_idle = st->last_idle;
5097 -+
5098 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
5099 -+ !bfq_gt(last_idle->finish, st->vtime)) {
5100 -+ /*
5101 -+ * Forget the whole idle tree, increasing the vtime past
5102 -+ * the last finish time of idle entities.
5103 -+ */
5104 -+ st->vtime = last_idle->finish;
5105 -+ }
5106 -+
5107 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
5108 -+ bfq_put_idle_entity(st, first_idle);
5109 -+}
5110 -+
5111 -+static struct bfq_service_tree *
5112 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
5113 -+ struct bfq_entity *entity)
5114 -+{
5115 -+ struct bfq_service_tree *new_st = old_st;
5116 -+
5117 -+ if (entity->ioprio_changed) {
5118 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
5119 -+
5120 -+ BUG_ON(old_st->wsum < entity->weight);
5121 -+ old_st->wsum -= entity->weight;
5122 -+
5123 -+ if (entity->new_weight != entity->orig_weight) {
5124 -+ entity->orig_weight = entity->new_weight;
5125 -+ entity->ioprio =
5126 -+ bfq_weight_to_ioprio(entity->orig_weight);
5127 -+ } else if (entity->new_ioprio != entity->ioprio) {
5128 -+ entity->ioprio = entity->new_ioprio;
5129 -+ entity->orig_weight =
5130 -+ bfq_ioprio_to_weight(entity->ioprio);
5131 -+ } else
5132 -+ entity->new_weight = entity->orig_weight =
5133 -+ bfq_ioprio_to_weight(entity->ioprio);
5134 -+
5135 -+ entity->ioprio_class = entity->new_ioprio_class;
5136 -+ entity->ioprio_changed = 0;
5137 -+
5138 -+ /*
5139 -+ * NOTE: here we may be changing the weight too early,
5140 -+ * this will cause unfairness. The correct approach
5141 -+ * would have required additional complexity to defer
5142 -+ * weight changes to the proper time instants (i.e.,
5143 -+ * when entity->finish <= old_st->vtime).
5144 -+ */
5145 -+ new_st = bfq_entity_service_tree(entity);
5146 -+ entity->weight = entity->orig_weight *
5147 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
5148 -+ new_st->wsum += entity->weight;
5149 -+
5150 -+ if (new_st != old_st)
5151 -+ entity->start = new_st->vtime;
5152 -+ }
5153 -+
5154 -+ return new_st;
5155 -+}
5156 -+
5157 -+/**
5158 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
5159 -+ * @bfqq: the queue being served.
5160 -+ * @served: bytes to transfer.
5161 -+ *
5162 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
5163 -+ * are synchronized every time a new bfqq is selected for service. For now,
5164 -+ * we keep it this way to better check consistency.
5165 -+ */
5166 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
5167 -+{
5168 -+ struct bfq_entity *entity = &bfqq->entity;
5169 -+ struct bfq_service_tree *st;
5170 -+
5171 -+ for_each_entity(entity) {
5172 -+ st = bfq_entity_service_tree(entity);
5173 -+
5174 -+ entity->service += served;
5175 -+ BUG_ON(entity->service > entity->budget);
5176 -+ BUG_ON(st->wsum == 0);
5177 -+
5178 -+ st->vtime += bfq_delta(served, st->wsum);
5179 -+ bfq_forget_idle(st);
5180 -+ }
5181 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
5182 -+}
5183 -+
5184 -+/**
5185 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
5186 -+ * @bfqq: the queue that needs a service update.
5187 -+ *
5188 -+ * When it's not possible to be fair in the service domain, because
5189 -+ * a queue is not consuming its budget fast enough (the meaning of
5190 -+ * fast depends on the timeout parameter), we charge it a full
5191 -+ * budget. In this way we should obtain a sort of time-domain
5192 -+ * fairness among all the seeky/slow queues.
5193 -+ */
5194 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
5195 -+{
5196 -+ struct bfq_entity *entity = &bfqq->entity;
5197 -+
5198 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
5199 -+
5200 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
5201 -+}
5202 -+
5203 -+/**
5204 -+ * __bfq_activate_entity - activate an entity.
5205 -+ * @entity: the entity being activated.
5206 -+ *
5207 -+ * Called whenever an entity is activated, i.e., it is not active and one
5208 -+ * of its children receives a new request, or has to be reactivated due to
5209 -+ * budget exhaustion. It uses the current budget of the entity (and the
5210 -+ * service received if @entity is active) to calculate its
5211 -+ * timestamps.
5212 -+ */
5213 -+static void __bfq_activate_entity(struct bfq_entity *entity)
5214 -+{
5215 -+ struct bfq_sched_data *sd = entity->sched_data;
5216 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5217 -+
5218 -+ if (entity == sd->active_entity) {
5219 -+ BUG_ON(entity->tree != NULL);
5220 -+ /*
5221 -+ * If we are requeueing the current entity we have
5222 -+ * to take care not to charge it for service it has
5223 -+ * not received.
5224 -+ */
5225 -+ bfq_calc_finish(entity, entity->service);
5226 -+ entity->start = entity->finish;
5227 -+ sd->active_entity = NULL;
5228 -+ } else if (entity->tree == &st->active) {
5229 -+ /*
5230 -+ * Requeueing an entity due to a change of some
5231 -+ * next_active entity below it. We reuse the old
5232 -+ * start time.
5233 -+ */
5234 -+ bfq_active_extract(st, entity);
5235 -+ } else if (entity->tree == &st->idle) {
5236 -+ /*
5237 -+ * Must be on the idle tree, bfq_idle_extract() will
5238 -+ * check for that.
5239 -+ */
5240 -+ bfq_idle_extract(st, entity);
5241 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
5242 -+ st->vtime : entity->finish;
5243 -+ } else {
5244 -+ /*
5245 -+ * The finish time of the entity may be invalid, and
5246 -+ * it is in the past for sure, otherwise the queue
5247 -+ * would have been on the idle tree.
5248 -+ */
5249 -+ entity->start = st->vtime;
5250 -+ st->wsum += entity->weight;
5251 -+ bfq_get_entity(entity);
5252 -+
5253 -+ BUG_ON(entity->on_st);
5254 -+ entity->on_st = 1;
5255 -+ }
5256 -+
5257 -+ st = __bfq_entity_update_weight_prio(st, entity);
5258 -+ bfq_calc_finish(entity, entity->budget);
5259 -+ bfq_active_insert(st, entity);
5260 -+}
5261 -+
5262 -+/**
5263 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
5264 -+ * @entity: the entity to activate.
5265 -+ *
5266 -+ * Activate @entity and all the entities on the path from it to the root.
5267 -+ */
5268 -+static void bfq_activate_entity(struct bfq_entity *entity)
5269 -+{
5270 -+ struct bfq_sched_data *sd;
5271 -+
5272 -+ for_each_entity(entity) {
5273 -+ __bfq_activate_entity(entity);
5274 -+
5275 -+ sd = entity->sched_data;
5276 -+ if (!bfq_update_next_active(sd))
5277 -+ /*
5278 -+ * No need to propagate the activation to the
5279 -+ * upper entities, as they will be updated when
5280 -+ * the active entity is rescheduled.
5281 -+ */
5282 -+ break;
5283 -+ }
5284 -+}
5285 -+
5286 -+/**
5287 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
5288 -+ * @entity: the entity to deactivate.
5289 -+ * @requeue: if false, the entity will not be put into the idle tree.
5290 -+ *
5291 -+ * Deactivate an entity, independently of its previous state. If the
5292 -+ * entity was not on a service tree, just return; otherwise, if it is on
5293 -+ * any scheduler tree, extract it from that tree and, if necessary
5294 -+ * and if the caller specified @requeue, put it on the idle tree.
5295 -+ *
5296 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
5297 -+ * if the entity was under service or if it was the next_active for
5298 -+ * its sched_data; return %0 otherwise.
5299 -+ */
5300 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5301 -+{
5302 -+ struct bfq_sched_data *sd = entity->sched_data;
5303 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
5304 -+ int was_active = entity == sd->active_entity;
5305 -+ int ret = 0;
5306 -+
5307 -+ if (!entity->on_st)
5308 -+ return 0;
5309 -+
5310 -+ BUG_ON(was_active && entity->tree != NULL);
5311 -+
5312 -+ if (was_active) {
5313 -+ bfq_calc_finish(entity, entity->service);
5314 -+ sd->active_entity = NULL;
5315 -+ } else if (entity->tree == &st->active)
5316 -+ bfq_active_extract(st, entity);
5317 -+ else if (entity->tree == &st->idle)
5318 -+ bfq_idle_extract(st, entity);
5319 -+ else if (entity->tree != NULL)
5320 -+ BUG();
5321 -+
5322 -+ if (was_active || sd->next_active == entity)
5323 -+ ret = bfq_update_next_active(sd);
5324 -+
5325 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
5326 -+ bfq_forget_entity(st, entity);
5327 -+ else
5328 -+ bfq_idle_insert(st, entity);
5329 -+
5330 -+ BUG_ON(sd->active_entity == entity);
5331 -+ BUG_ON(sd->next_active == entity);
5332 -+
5333 -+ return ret;
5334 -+}
5335 -+
5336 -+/**
5337 -+ * bfq_deactivate_entity - deactivate an entity.
5338 -+ * @entity: the entity to deactivate.
5339 -+ * @requeue: true if the entity can be put on the idle tree
5340 -+ */
5341 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
5342 -+{
5343 -+ struct bfq_sched_data *sd;
5344 -+ struct bfq_entity *parent;
5345 -+
5346 -+ for_each_entity_safe(entity, parent) {
5347 -+ sd = entity->sched_data;
5348 -+
5349 -+ if (!__bfq_deactivate_entity(entity, requeue))
5350 -+ /*
5351 -+ * The parent entity is still backlogged, and
5352 -+ * we don't need to update it as it is still
5353 -+ * under service.
5354 -+ */
5355 -+ break;
5356 -+
5357 -+ if (sd->next_active != NULL)
5358 -+ /*
5359 -+ * The parent entity is still backlogged and
5360 -+ * the budgets on the path towards the root
5361 -+ * need to be updated.
5362 -+ */
5363 -+ goto update;
5364 -+
5365 -+ /*
5366 -+ * If we reach this point, the parent is no longer backlogged and
5367 -+ * we want to propagate the dequeue upwards.
5368 -+ */
5369 -+ requeue = 1;
5370 -+ }
5371 -+
5372 -+ return;
5373 -+
5374 -+update:
5375 -+ entity = parent;
5376 -+ for_each_entity(entity) {
5377 -+ __bfq_activate_entity(entity);
5378 -+
5379 -+ sd = entity->sched_data;
5380 -+ if (!bfq_update_next_active(sd))
5381 -+ break;
5382 -+ }
5383 -+}
5384 -+
5385 -+/**
5386 -+ * bfq_update_vtime - update vtime if necessary.
5387 -+ * @st: the service tree to act upon.
5388 -+ *
5389 -+ * If necessary update the service tree vtime to have at least one
5390 -+ * eligible entity, skipping to its start time. Assumes that the
5391 -+ * active tree of the device is not empty.
5392 -+ *
5393 -+ * NOTE: this hierarchical implementation updates vtimes quite often;
5394 -+ * we may end up with reactivated tasks getting timestamps after a
5395 -+ * vtime skip done because we needed a ->first_active entity on some
5396 -+ * intermediate node.
5397 -+ */
5398 -+static void bfq_update_vtime(struct bfq_service_tree *st)
5399 -+{
5400 -+ struct bfq_entity *entry;
5401 -+ struct rb_node *node = st->active.rb_node;
5402 -+
5403 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5404 -+ if (bfq_gt(entry->min_start, st->vtime)) {
5405 -+ st->vtime = entry->min_start;
5406 -+ bfq_forget_idle(st);
5407 -+ }
5408 -+}
5409 -+
5410 -+/**
5411 -+ * bfq_first_active - find the eligible entity with the smallest finish time
5412 -+ * @st: the service tree to select from.
5413 -+ *
5414 -+ * This function searches for the first schedulable entity, starting from
5415 -+ * the root of the tree and going to the left whenever that side contains
5416 -+ * a subtree with at least one eligible (start <= vtime) entity. The path
5417 -+ * on the right is followed only if a) the left subtree contains no eligible
5418 -+ * entities and b) no eligible entity has been found yet.
5419 -+ */
5420 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
5421 -+{
5422 -+ struct bfq_entity *entry, *first = NULL;
5423 -+ struct rb_node *node = st->active.rb_node;
5424 -+
5425 -+ while (node != NULL) {
5426 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
5427 -+left:
5428 -+ if (!bfq_gt(entry->start, st->vtime))
5429 -+ first = entry;
5430 -+
5431 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
5432 -+
5433 -+ if (node->rb_left != NULL) {
5434 -+ entry = rb_entry(node->rb_left,
5435 -+ struct bfq_entity, rb_node);
5436 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
5437 -+ node = node->rb_left;
5438 -+ goto left;
5439 -+ }
5440 -+ }
5441 -+ if (first != NULL)
5442 -+ break;
5443 -+ node = node->rb_right;
5444 -+ }
5445 -+
5446 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
5447 -+ return first;
5448 -+}
5449 -+
5450 -+/**
5451 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
5452 -+ * @st: the service tree.
5453 -+ *
5454 -+ * Update the virtual time in @st and return the first eligible entity
5455 -+ * it contains.
5456 -+ */
5457 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
5458 -+ bool force)
5459 -+{
5460 -+ struct bfq_entity *entity, *new_next_active = NULL;
5461 -+
5462 -+ if (RB_EMPTY_ROOT(&st->active))
5463 -+ return NULL;
5464 -+
5465 -+ bfq_update_vtime(st);
5466 -+ entity = bfq_first_active_entity(st);
5467 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
5468 -+
5469 -+ /*
5470 -+ * If the chosen entity does not match the sched_data's
5471 -+ * next_active and we are forcibly serving the IDLE priority
5472 -+ * class tree, bubble up the budget update.
5473 -+ */
5474 -+ if (unlikely(force && entity != entity->sched_data->next_active)) {
5475 -+ new_next_active = entity;
5476 -+ for_each_entity(new_next_active)
5477 -+ bfq_update_budget(new_next_active);
5478 -+ }
5479 -+
5480 -+ return entity;
5481 -+}
5482 -+
5483 -+/**
5484 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
5485 -+ * @sd: the sched_data.
5486 -+ * @extract: if true the returned entity will be also extracted from @sd.
5487 -+ *
5488 -+ * NOTE: since we cache the next_active entity at each level of the
5489 -+ * hierarchy, the complexity of the lookup can be decreased with
5490 -+ * absolutely no effort, by just returning the cached next_active value;
5491 -+ * we prefer to do full lookups to test the consistency of the data
5492 -+ * structures.
5493 -+ */
5494 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
5495 -+ int extract,
5496 -+ struct bfq_data *bfqd)
5497 -+{
5498 -+ struct bfq_service_tree *st = sd->service_tree;
5499 -+ struct bfq_entity *entity;
5500 -+ int i = 0;
5501 -+
5502 -+ BUG_ON(sd->active_entity != NULL);
5503 -+
5504 -+ if (bfqd != NULL &&
5505 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
5506 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
5507 -+ true);
5508 -+ if (entity != NULL) {
5509 -+ i = BFQ_IOPRIO_CLASSES - 1;
5510 -+ bfqd->bfq_class_idle_last_service = jiffies;
5511 -+ sd->next_active = entity;
5512 -+ }
5513 -+ }
5514 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
5515 -+ entity = __bfq_lookup_next_entity(st + i, false);
5516 -+ if (entity != NULL) {
5517 -+ if (extract) {
5518 -+ bfq_check_next_active(sd, entity);
5519 -+ bfq_active_extract(st + i, entity);
5520 -+ sd->active_entity = entity;
5521 -+ sd->next_active = NULL;
5522 -+ }
5523 -+ break;
5524 -+ }
5525 -+ }
5526 -+
5527 -+ return entity;
5528 -+}
5529 -+
5530 -+/*
5531 -+ * Get next queue for service.
5532 -+ */
5533 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
5534 -+{
5535 -+ struct bfq_entity *entity = NULL;
5536 -+ struct bfq_sched_data *sd;
5537 -+ struct bfq_queue *bfqq;
5538 -+
5539 -+ BUG_ON(bfqd->in_service_queue != NULL);
5540 -+
5541 -+ if (bfqd->busy_queues == 0)
5542 -+ return NULL;
5543 -+
5544 -+ sd = &bfqd->root_group->sched_data;
5545 -+ for (; sd != NULL; sd = entity->my_sched_data) {
5546 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
5547 -+ BUG_ON(entity == NULL);
5548 -+ entity->service = 0;
5549 -+ }
5550 -+
5551 -+ bfqq = bfq_entity_to_bfqq(entity);
5552 -+ BUG_ON(bfqq == NULL);
5553 -+
5554 -+ return bfqq;
5555 -+}
5556 -+
5557 -+/*
5558 -+ * Forced extraction of the given queue.
5559 -+ */
5560 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
5561 -+ struct bfq_queue *bfqq)
5562 -+{
5563 -+ struct bfq_entity *entity;
5564 -+ struct bfq_sched_data *sd;
5565 -+
5566 -+ BUG_ON(bfqd->in_service_queue != NULL);
5567 -+
5568 -+ entity = &bfqq->entity;
5569 -+ /*
5570 -+ * Bubble up extraction/update from the leaf to the root.
5571 -+ */
5572 -+ for_each_entity(entity) {
5573 -+ sd = entity->sched_data;
5574 -+ bfq_update_budget(entity);
5575 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
5576 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
5577 -+ sd->active_entity = entity;
5578 -+ sd->next_active = NULL;
5579 -+ entity->service = 0;
5580 -+ }
5581 -+
5582 -+ return;
5583 -+}
5584 -+
5585 -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
5586 -+{
5587 -+ if (bfqd->in_service_bic != NULL) {
5588 -+ put_io_context(bfqd->in_service_bic->icq.ioc);
5589 -+ bfqd->in_service_bic = NULL;
5590 -+ }
5591 -+
5592 -+ bfqd->in_service_queue = NULL;
5593 -+ del_timer(&bfqd->idle_slice_timer);
5594 -+}
5595 -+
5596 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5597 -+ int requeue)
5598 -+{
5599 -+ struct bfq_entity *entity = &bfqq->entity;
5600 -+
5601 -+ if (bfqq == bfqd->in_service_queue)
5602 -+ __bfq_bfqd_reset_in_service(bfqd);
5603 -+
5604 -+ bfq_deactivate_entity(entity, requeue);
5605 -+}
5606 -+
5607 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5608 -+{
5609 -+ struct bfq_entity *entity = &bfqq->entity;
5610 -+
5611 -+ bfq_activate_entity(entity);
5612 -+}
5613 -+
5614 -+/*
5615 -+ * Called when the bfqq no longer has requests pending; remove it from
5616 -+ * the service tree.
5617 -+ */
5618 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
5619 -+ int requeue)
5620 -+{
5621 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
5622 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
5623 -+
5624 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
5625 -+
5626 -+ bfq_clear_bfqq_busy(bfqq);
5627 -+
5628 -+ BUG_ON(bfqd->busy_queues == 0);
5629 -+ bfqd->busy_queues--;
5630 -+ if (bfqq->raising_coeff > 1)
5631 -+ bfqd->raised_busy_queues--;
5632 -+
5633 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
5634 -+}
5635 -+
5636 -+/*
5637 -+ * Called when an inactive queue receives a new request.
5638 -+ */
5639 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
5640 -+{
5641 -+ BUG_ON(bfq_bfqq_busy(bfqq));
5642 -+ BUG_ON(bfqq == bfqd->in_service_queue);
5643 -+
5644 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
5645 -+
5646 -+ bfq_activate_bfqq(bfqd, bfqq);
5647 -+
5648 -+ bfq_mark_bfqq_busy(bfqq);
5649 -+ bfqd->busy_queues++;
5650 -+ if (bfqq->raising_coeff > 1)
5651 -+ bfqd->raised_busy_queues++;
5652 -+}
5653 -diff --git a/block/bfq.h b/block/bfq.h
5654 -new file mode 100644
5655 -index 0000000..68b28e3
5656 ---- /dev/null
5657 -+++ b/block/bfq.h
5658 -@@ -0,0 +1,614 @@
5659 -+/*
5660 -+ * BFQ-v7 for 3.13.0: data structures and common function prototypes.
5661 -+ *
5662 -+ * Based on ideas and code from CFQ:
5663 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
5664 -+ *
5665 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
5666 -+ * Paolo Valente <paolo.valente@×××××××.it>
5667 -+ *
5668 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
5669 -+ */
5670 -+
5671 -+#ifndef _BFQ_H
5672 -+#define _BFQ_H
5673 -+
5674 -+#include <linux/blktrace_api.h>
5675 -+#include <linux/hrtimer.h>
5676 -+#include <linux/ioprio.h>
5677 -+#include <linux/rbtree.h>
5678 -+
5679 -+#define BFQ_IOPRIO_CLASSES 3
5680 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
5681 -+
5682 -+#define BFQ_MIN_WEIGHT 1
5683 -+#define BFQ_MAX_WEIGHT 1000
5684 -+
5685 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
5686 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
5687 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
5688 -+
5689 -+struct bfq_entity;
5690 -+
5691 -+/**
5692 -+ * struct bfq_service_tree - per ioprio_class service tree.
5693 -+ * @active: tree for active entities (i.e., those backlogged).
5694 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
5695 -+ * @first_idle: idle entity with minimum F_i.
5696 -+ * @last_idle: idle entity with maximum F_i.
5697 -+ * @vtime: scheduler virtual time.
5698 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
5699 -+ *
5700 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
5701 -+ * ioprio_class has its own independent scheduler, and so its own
5702 -+ * bfq_service_tree. All the fields are protected by the queue lock
5703 -+ * of the containing bfqd.
5704 -+ */
5705 -+struct bfq_service_tree {
5706 -+ struct rb_root active;
5707 -+ struct rb_root idle;
5708 -+
5709 -+ struct bfq_entity *first_idle;
5710 -+ struct bfq_entity *last_idle;
5711 -+
5712 -+ u64 vtime;
5713 -+ unsigned long wsum;
5714 -+};
5715 -+
5716 -+/**
5717 -+ * struct bfq_sched_data - multi-class scheduler.
5718 -+ * @active_entity: entity under service.
5719 -+ * @next_active: head-of-the-line entity in the scheduler.
5720 -+ * @service_tree: array of service trees, one per ioprio_class.
5721 -+ *
5722 -+ * bfq_sched_data is the basic scheduler queue. It supports three
5723 -+ * ioprio_classes, and can be used either as a toplevel queue or as
5724 -+ * an intermediate queue on a hierarchical setup.
5725 -+ * @next_active points to the active entity of the sched_data service
5726 -+ * trees that will be scheduled next.
5727 -+ *
5728 -+ * The supported ioprio_classes are the same as in CFQ, in descending
5729 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
5730 -+ * Requests from higher priority queues are served before all the
5731 -+ * requests from lower priority queues; among queues of the same
5732 -+ * class, requests are served according to B-WF2Q+.
5733 -+ * All the fields are protected by the queue lock of the containing bfqd.
5734 -+ */
5735 -+struct bfq_sched_data {
5736 -+ struct bfq_entity *active_entity;
5737 -+ struct bfq_entity *next_active;
5738 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
5739 -+};
5740 -+
5741 -+/**
5742 -+ * struct bfq_entity - schedulable entity.
5743 -+ * @rb_node: service_tree member.
5744 -+ * @on_st: flag, true if the entity is on a tree (either the active or
5745 -+ * the idle one of its service_tree).
5746 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
5747 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
5748 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
5749 -+ * @min_start: minimum start time of the (active) subtree rooted at
5750 -+ * this entity; used for O(log N) lookups into active trees.
5751 -+ * @service: service received during the last round of service.
5752 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
5753 -+ * @weight: weight of the queue
5754 -+ * @parent: parent entity, for hierarchical scheduling.
5755 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
5756 -+ * associated scheduler queue, %NULL on leaf nodes.
5757 -+ * @sched_data: the scheduler queue this entity belongs to.
5758 -+ * @ioprio: the ioprio in use.
5759 -+ * @new_weight: when a weight change is requested, the new weight value.
5760 -+ * @orig_weight: original weight, used to implement weight boosting
5761 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
5762 -+ * @ioprio_class: the ioprio_class in use.
5763 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
5764 -+ * ioprio_class value.
5765 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
5766 -+ * ioprio_class change.
5767 -+ *
5768 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
5769 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
5770 -+ * entity belongs to the sched_data of the parent group in the cgroup
5771 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
5772 -+ * in @my_sched_data.
5773 -+ *
5774 -+ * Each entity stores independently its priority values; this would
5775 -+ * allow different weights on different devices, but this
5776 -+ * functionality is not yet exported to userspace. Priorities and
5777 -+ * weights are updated lazily, first storing the new values into the
5778 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
5779 -+ * there is a transition in the entity state that allows the priority
5780 -+ * update to take place the effective and the requested priority
5781 -+ * values are synchronized.
5782 -+ *
5783 -+ * Unless cgroups are used, the weight value is calculated from the
5784 -+ * ioprio to export the same interface as CFQ. When dealing with
5785 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
5786 -+ * time consuming their budget and have true sequential behavior, and
5787 -+ * when there are no external factors breaking anticipation) the
5788 -+ * relative weights at each level of the cgroups hierarchy should be
5789 -+ * guaranteed. All the fields are protected by the queue lock of the
5790 -+ * containing bfqd.
5791 -+ */
5792 -+struct bfq_entity {
5793 -+ struct rb_node rb_node;
5794 -+
5795 -+ int on_st;
5796 -+
5797 -+ u64 finish;
5798 -+ u64 start;
5799 -+
5800 -+ struct rb_root *tree;
5801 -+
5802 -+ u64 min_start;
5803 -+
5804 -+ unsigned long service, budget;
5805 -+ unsigned short weight, new_weight;
5806 -+ unsigned short orig_weight;
5807 -+
5808 -+ struct bfq_entity *parent;
5809 -+
5810 -+ struct bfq_sched_data *my_sched_data;
5811 -+ struct bfq_sched_data *sched_data;
5812 -+
5813 -+ unsigned short ioprio, new_ioprio;
5814 -+ unsigned short ioprio_class, new_ioprio_class;
5815 -+
5816 -+ int ioprio_changed;
5817 -+};
5818 -+
5819 -+struct bfq_group;
5820 -+
5821 -+/**
5822 -+ * struct bfq_queue - leaf schedulable entity.
5823 -+ * @ref: reference counter.
5824 -+ * @bfqd: parent bfq_data.
5825 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
5826 -+ * one or more other queues.
5827 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
5828 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
5829 -+ * @sort_list: sorted list of pending requests.
5830 -+ * @next_rq: if fifo isn't expired, next request to serve.
5831 -+ * @queued: nr of requests queued in @sort_list.
5832 -+ * @allocated: currently allocated requests.
5833 -+ * @meta_pending: pending metadata requests.
5834 -+ * @fifo: fifo list of requests in sort_list.
5835 -+ * @entity: entity representing this queue in the scheduler.
5836 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
5837 -+ * @budget_timeout: budget expiration (in jiffies).
5838 -+ * @dispatched: number of requests on the dispatch list or inside driver.
5839 -+ * @org_ioprio: saved ioprio during boosted periods.
5840 -+ * @flags: status flags.
5841 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
5842 -+ * @seek_samples: number of seeks sampled
5843 -+ * @seek_total: sum of the distances of the seeks sampled
5844 -+ * @seek_mean: mean seek distance
5845 -+ * @last_request_pos: position of the last request enqueued
5846 -+ * @pid: pid of the process owning the queue, used for logging purposes.
5847 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
5848 -+ * @raising_cur_max_time: current max raising time for this queue
5849 -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
5850 -+ * idle to backlogged
5851 -+ * @service_from_backlogged: cumulative service received from the @bfq_queue
5852 -+ * since the last transition from idle to backlogged
5853 -+ *
5854 -+ * A bfq_queue is a leaf request queue; it can be associated with one io_context
5855 -+ * or more (if it is an async one). @cgroup holds a reference to the
5856 -+ * cgroup, to be sure that it does not disappear while a bfqq still
5857 -+ * references it (mostly to avoid races between request issuing and task
5858 -+ * migration followed by cgroup destruction).
5859 -+ * All the fields are protected by the queue lock of the containing bfqd.
5860 -+ */
5861 -+struct bfq_queue {
5862 -+ atomic_t ref;
5863 -+ struct bfq_data *bfqd;
5864 -+
5865 -+ /* fields for cooperating queues handling */
5866 -+ struct bfq_queue *new_bfqq;
5867 -+ struct rb_node pos_node;
5868 -+ struct rb_root *pos_root;
5869 -+
5870 -+ struct rb_root sort_list;
5871 -+ struct request *next_rq;
5872 -+ int queued[2];
5873 -+ int allocated[2];
5874 -+ int meta_pending;
5875 -+ struct list_head fifo;
5876 -+
5877 -+ struct bfq_entity entity;
5878 -+
5879 -+ unsigned long max_budget;
5880 -+ unsigned long budget_timeout;
5881 -+
5882 -+ int dispatched;
5883 -+
5884 -+ unsigned short org_ioprio;
5885 -+
5886 -+ unsigned int flags;
5887 -+
5888 -+ struct list_head bfqq_list;
5889 -+
5890 -+ unsigned int seek_samples;
5891 -+ u64 seek_total;
5892 -+ sector_t seek_mean;
5893 -+ sector_t last_request_pos;
5894 -+
5895 -+ pid_t pid;
5896 -+
5897 -+ /* weight-raising fields */
5898 -+ unsigned int raising_cur_max_time;
5899 -+ unsigned long soft_rt_next_start;
5900 -+ u64 last_rais_start_finish;
5901 -+ unsigned int raising_coeff;
5902 -+ u64 last_idle_bklogged;
5903 -+ unsigned long service_from_backlogged;
5904 -+};
5905 -+
5906 -+/**
5907 -+ * struct bfq_ttime - per process thinktime stats.
5908 -+ * @ttime_total: total process thinktime
5909 -+ * @ttime_samples: number of thinktime samples
5910 -+ * @ttime_mean: average process thinktime
5911 -+ */
5912 -+struct bfq_ttime {
5913 -+ unsigned long last_end_request;
5914 -+
5915 -+ unsigned long ttime_total;
5916 -+ unsigned long ttime_samples;
5917 -+ unsigned long ttime_mean;
5918 -+};
5919 -+
5920 -+/**
5921 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
5922 -+ * @icq: associated io_cq structure
5923 -+ * @bfqq: array of two process queues, the sync and the async
5924 -+ * @ttime: associated @bfq_ttime struct
5925 -+ */
5926 -+struct bfq_io_cq {
5927 -+ struct io_cq icq; /* must be the first member */
5928 -+ struct bfq_queue *bfqq[2];
5929 -+ struct bfq_ttime ttime;
5930 -+ int ioprio;
5931 -+};
5932 -+
5933 -+/**
5934 -+ * struct bfq_data - per device data structure.
5935 -+ * @queue: request queue for the managed device.
5936 -+ * @root_group: root bfq_group for the device.
5937 -+ * @rq_pos_tree: rbtree sorted by next_request position,
5938 -+ * used when determining if two or more queues
5939 -+ * have interleaving requests (see bfq_close_cooperator).
5940 -+ * @busy_queues: number of bfq_queues containing requests (including the
5941 -+ * queue under service, even if it is idling).
5942 -+ * @raised_busy_queues: number of weight-raised busy bfq_queues.
5943 -+ * @queued: number of queued requests.
5944 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
5945 -+ * @sync_flight: number of sync requests in the driver.
5946 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
5947 -+ * completed requests.
5948 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
5949 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
5950 -+ * @budgets_assigned: number of budgets assigned.
5951 -+ * @idle_slice_timer: timer set when idling for the next sequential request
5952 -+ * from the queue under service.
5953 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
5954 -+ * @in_service_queue: bfq_queue under service.
5955 -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
5956 -+ * @last_position: on-disk position of the last served request.
5957 -+ * @last_budget_start: beginning of the last budget.
5958 -+ * @last_idling_start: beginning of the last idle slice.
5959 -+ * @peak_rate: peak transfer rate observed for a budget.
5960 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
5961 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
5962 -+ * @group_list: list of all the bfq_groups active on the device.
5963 -+ * @active_list: list of all the bfq_queues active on the device.
5964 -+ * @idle_list: list of all the bfq_queues idle on the device.
5965 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
5966 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
5967 -+ * requests are served in fifo order.
5968 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
5969 -+ * @bfq_back_max: maximum allowed backward seek.
5970 -+ * @bfq_slice_idle: maximum idling time.
5971 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
5972 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
5973 -+ * async queues.
5974 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
5975 -+ * prevent seeky queues from imposing long latencies on
5976 -+ * well-behaved ones (this also implies that seeky queues cannot
5977 -+ * receive guarantees in the service domain; after a timeout
5978 -+ * they are charged for the whole allocated budget, to try
5979 -+ * to preserve a behavior reasonably fair among them, but
5980 -+ * without service-domain guarantees).
5981 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
5982 -+ * queue is multiplied
5983 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
5984 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
5985 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
5986 -+ * may be reactivated for a queue (in jiffies)
5987 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
5988 -+ * after which weight-raising may be
5989 -+ * reactivated for an already busy queue
5990 -+ * (in jiffies)
5991 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
5992 -+ * in sectors per second.
5993 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
5994 -+ * duration of the weight raising automatically
5995 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
5996 -+ *
5997 -+ * All the fields are protected by the @queue lock.
5998 -+ */
5999 -+struct bfq_data {
6000 -+ struct request_queue *queue;
6001 -+
6002 -+ struct bfq_group *root_group;
6003 -+
6004 -+ struct rb_root rq_pos_tree;
6005 -+
6006 -+ int busy_queues;
6007 -+ int raised_busy_queues;
6008 -+ int queued;
6009 -+ int rq_in_driver;
6010 -+ int sync_flight;
6011 -+
6012 -+ int max_rq_in_driver;
6013 -+ int hw_tag_samples;
6014 -+ int hw_tag;
6015 -+
6016 -+ int budgets_assigned;
6017 -+
6018 -+ struct timer_list idle_slice_timer;
6019 -+ struct work_struct unplug_work;
6020 -+
6021 -+ struct bfq_queue *in_service_queue;
6022 -+ struct bfq_io_cq *in_service_bic;
6023 -+
6024 -+ sector_t last_position;
6025 -+
6026 -+ ktime_t last_budget_start;
6027 -+ ktime_t last_idling_start;
6028 -+ int peak_rate_samples;
6029 -+ u64 peak_rate;
6030 -+ unsigned long bfq_max_budget;
6031 -+
6032 -+ struct hlist_head group_list;
6033 -+ struct list_head active_list;
6034 -+ struct list_head idle_list;
6035 -+
6036 -+ unsigned int bfq_quantum;
6037 -+ unsigned int bfq_fifo_expire[2];
6038 -+ unsigned int bfq_back_penalty;
6039 -+ unsigned int bfq_back_max;
6040 -+ unsigned int bfq_slice_idle;
6041 -+ u64 bfq_class_idle_last_service;
6042 -+
6043 -+ unsigned int bfq_user_max_budget;
6044 -+ unsigned int bfq_max_budget_async_rq;
6045 -+ unsigned int bfq_timeout[2];
6046 -+
6047 -+ bool low_latency;
6048 -+
6049 -+ /* parameters of the low_latency heuristics */
6050 -+ unsigned int bfq_raising_coeff;
6051 -+ unsigned int bfq_raising_max_time;
6052 -+ unsigned int bfq_raising_rt_max_time;
6053 -+ unsigned int bfq_raising_min_idle_time;
6054 -+ unsigned long bfq_raising_min_inter_arr_async;
6055 -+ unsigned int bfq_raising_max_softrt_rate;
6056 -+ u64 RT_prod;
6057 -+
6058 -+ struct bfq_queue oom_bfqq;
6059 -+};
6060 -+
6061 -+enum bfqq_state_flags {
6062 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
6063 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
6064 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
6065 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
6066 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
6067 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
6068 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
6069 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
6070 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
6071 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
6072 -+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
6073 -+};
6074 -+
6075 -+#define BFQ_BFQQ_FNS(name) \
6076 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
6077 -+{ \
6078 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
6079 -+} \
6080 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
6081 -+{ \
6082 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
6083 -+} \
6084 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
6085 -+{ \
6086 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
6087 -+}
6088 -+
6089 -+BFQ_BFQQ_FNS(busy);
6090 -+BFQ_BFQQ_FNS(wait_request);
6091 -+BFQ_BFQQ_FNS(must_alloc);
6092 -+BFQ_BFQQ_FNS(fifo_expire);
6093 -+BFQ_BFQQ_FNS(idle_window);
6094 -+BFQ_BFQQ_FNS(prio_changed);
6095 -+BFQ_BFQQ_FNS(sync);
6096 -+BFQ_BFQQ_FNS(budget_new);
6097 -+BFQ_BFQQ_FNS(coop);
6098 -+BFQ_BFQQ_FNS(split_coop);
6099 -+BFQ_BFQQ_FNS(softrt_update);
6100 -+#undef BFQ_BFQQ_FNS
6101 -+
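For reference, BFQ_BFQQ_FNS(busy) above expands to three inline helpers along these lines (written out by hand; the other flags follow the same pattern):

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}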
6102 -+/* Logging facilities. */
6103 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
6104 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
6105 -+
6106 -+#define bfq_log(bfqd, fmt, args...) \
6107 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
6108 -+
6109 -+/* Expiration reasons. */
6110 -+enum bfqq_expiration {
6111 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
6112 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
6113 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
6114 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
6115 -+};
6116 -+
6117 -+#ifdef CONFIG_CGROUP_BFQIO
6118 -+/**
6119 -+ * struct bfq_group - per (device, cgroup) data structure.
6120 -+ * @entity: schedulable entity to insert into the parent group sched_data.
6121 -+ * @sched_data: own sched_data, to contain child entities (they may be
6122 -+ * both bfq_queues and bfq_groups).
6123 -+ * @group_node: node to be inserted into the @group_data list of the
6124 -+ * containing cgroup's bfqio_cgroup.
6125 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
6126 -+ * of the groups active on the same device; used for cleanup.
6127 -+ * @bfqd: the bfq_data for the device this group acts upon.
6128 -+ * @async_bfqq: array of async queues for all the tasks belonging to
6129 -+ * the group, one queue per ioprio value per ioprio_class,
6130 -+ * except for the idle class that has only one queue.
6131 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
6132 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
6133 -+ * to avoid too many special cases during group creation/migration.
6134 -+ *
6135 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
6136 -+ * there is a set of bfq_groups, each one collecting the lower-level
6137 -+ * entities belonging to the group that are acting on the same device.
6138 -+ *
6139 -+ * Locking works as follows:
6140 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
6141 -+ * via RCU from its readers.
6142 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
6143 -+ * from the readers.
6144 -+ * o All the other fields are protected by the @bfqd queue lock.
6145 -+ */
6146 -+struct bfq_group {
6147 -+ struct bfq_entity entity;
6148 -+ struct bfq_sched_data sched_data;
6149 -+
6150 -+ struct hlist_node group_node;
6151 -+ struct hlist_node bfqd_node;
6152 -+
6153 -+ void *bfqd;
6154 -+
6155 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6156 -+ struct bfq_queue *async_idle_bfqq;
6157 -+
6158 -+ struct bfq_entity *my_entity;
6159 -+};
6160 -+
6161 -+/**
6162 -+ * struct bfqio_cgroup - bfq cgroup data structure.
6163 -+ * @css: subsystem state for bfq in the containing cgroup.
6164 -+ * @online: flag marked when the subsystem is inserted.
6165 -+ * @weight: cgroup weight.
6166 -+ * @ioprio: cgroup ioprio.
6167 -+ * @ioprio_class: cgroup ioprio_class.
6168 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
6169 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
6170 -+ *
6171 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
6172 -+ * @ioprio and @ioprio_class are protected by @lock.
6173 -+ */
6174 -+struct bfqio_cgroup {
6175 -+ struct cgroup_subsys_state css;
6176 -+ bool online;
6177 -+
6178 -+ unsigned short weight, ioprio, ioprio_class;
6179 -+
6180 -+ spinlock_t lock;
6181 -+ struct hlist_head group_data;
6182 -+};
6183 -+#else
6184 -+struct bfq_group {
6185 -+ struct bfq_sched_data sched_data;
6186 -+
6187 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
6188 -+ struct bfq_queue *async_idle_bfqq;
6189 -+};
6190 -+#endif
6191 -+
6192 -+static inline struct bfq_service_tree *
6193 -+bfq_entity_service_tree(struct bfq_entity *entity)
6194 -+{
6195 -+ struct bfq_sched_data *sched_data = entity->sched_data;
6196 -+ unsigned int idx = entity->ioprio_class - 1;
6197 -+
6198 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
6199 -+ BUG_ON(sched_data == NULL);
6200 -+
6201 -+ return sched_data->service_tree + idx;
6202 -+}
6203 -+
6204 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
6205 -+ int is_sync)
6206 -+{
6207 -+ return bic->bfqq[!!is_sync];
6208 -+}
6209 -+
6210 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
6211 -+ struct bfq_queue *bfqq, int is_sync)
6212 -+{
6213 -+ bic->bfqq[!!is_sync] = bfqq;
6214 -+}
6215 -+
6216 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
6217 -+{
6218 -+ return bic->icq.q->elevator->elevator_data;
6219 -+}
6220 -+
6221 -+/**
6222 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using an RCU-protected pointer.
6223 -+ * @ptr: a pointer to a bfqd.
6224 -+ * @flags: storage for the flags to be saved.
6225 -+ *
6226 -+ * This function allows bfqg->bfqd to be protected by the
6227 -+ * queue lock of the bfqd it references; the pointer is dereferenced
6228 -+ * under RCU, so the storage for bfqd is assured to be safe as long
6229 -+ * as the RCU read side critical section does not end. After the
6230 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
6231 -+ * sure that no other writer accessed it. If we raced with a writer,
6232 -+ * the function returns NULL, with the queue unlocked, otherwise it
6233 -+ * returns the dereferenced pointer, with the queue locked.
6234 -+ */
6235 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
6236 -+ unsigned long *flags)
6237 -+{
6238 -+ struct bfq_data *bfqd;
6239 -+
6240 -+ rcu_read_lock();
6241 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
6242 -+
6243 -+ if (bfqd != NULL) {
6244 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
6245 -+ if (*ptr == bfqd)
6246 -+ goto out;
6247 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6248 -+ }
6249 -+
6250 -+ bfqd = NULL;
6251 -+out:
6252 -+ rcu_read_unlock();
6253 -+ return bfqd;
6254 -+}
6255 -+
6256 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
6257 -+ unsigned long *flags)
6258 -+{
6259 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
6260 -+}
6261 -+
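For reference, the intended calling pattern for the pair above is roughly the following sketch; the caller is hypothetical, and only the NULL-on-race and locked-on-success semantics come from the comment above:

/* Hypothetical caller illustrating the lock/unlock pairing. */
static void example_locked_access(void **bfqd_ptr)	/* e.g. &bfqg->bfqd */
{
	unsigned long flags;
	struct bfq_data *bfqd = bfq_get_bfqd_locked(bfqd_ptr, &flags);

	if (bfqd == NULL)
		return;	/* raced with a writer: the queue lock is NOT held */

	/* ... work while holding bfqd->queue->queue_lock ... */

	bfq_put_bfqd_unlock(bfqd, &flags);
}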
6262 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
6263 -+static void bfq_put_queue(struct bfq_queue *bfqq);
6264 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
6265 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
6266 -+ struct bfq_group *bfqg, int is_sync,
6267 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
6268 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
6269 -+ struct bfq_group *bfqg);
6270 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
6271 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
6272 -+#endif
6273 ---
6274 -1.8.5.2
6275 -
6276
6277 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
6278 ===================================================================
6279 --- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-03-26 23:50:52 UTC (rev 2715)
6280 +++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-03-31 12:03:14 UTC (rev 2716)
6281 @@ -1,6040 +0,0 @@
6282 -From be5107dc591f7ae692ca7cceecbba72e4c174c37 Mon Sep 17 00:00:00 2001
6283 -From: Paolo Valente <paolo.valente@×××××××.it>
6284 -Date: Thu, 9 May 2013 19:10:02 +0200
6285 -Subject: [PATCH 2/3] block: introduce the BFQ-v7r1 I/O sched for 3.13
6286 -
6287 -Add the BFQ-v7r1 I/O scheduler to 3.13.
6288 -The general structure is borrowed from CFQ, as much of the code for
6289 -handling I/O contexts. Over time, several useful features have been
6290 -ported from CFQ as well (details in the changelog in README.BFQ). A
6291 -(bfq_)queue is associated to each task doing I/O on a device, and each
6292 -time a scheduling decision has to be made a queue is selected and served
6293 -until it expires.
6294 -
6295 - - Slices are given in the service domain: tasks are assigned
6296 - budgets, measured in number of sectors. Once got the disk, a task
6297 - must however consume its assigned budget within a configurable
6298 - maximum time (by default, the maximum possible value of the
6299 - budgets is automatically computed to comply with this timeout).
6300 - This allows the desired latency vs "throughput boosting" tradeoff
6301 - to be set.
6302 -
6303 - - Budgets are scheduled according to a variant of WF2Q+, implemented
6304 - using an augmented rb-tree to take eligibility into account while
6305 - preserving an O(log N) overall complexity.
6306 -
6307 - - A low-latency tunable is provided; if enabled, both interactive
6308 - and soft real-time applications are guaranteed a very low latency.
6309 -
6310 - - Latency guarantees are preserved also in the presence of NCQ.
6311 -
6312 - - Also with flash-based devices, a high throughput is achieved
6313 - while still preserving latency guarantees.
6314 -
6315 - - BFQ features Early Queue Merge (EQM), a sort of fusion of the
6316 - cooperating-queue-merging and the preemption mechanisms present
6317 - in CFQ. EQM is in fact a unified mechanism that tries to get a
6318 - sequential read pattern, and hence a high throughput, with any
6319 - set of processes performing interleaved I/O over a contiguous
6320 - sequence of sectors.
6321 -
6322 - - BFQ supports full hierarchical scheduling, exporting a cgroups
6323 - interface. Since each node has a full scheduler, each group can
6324 - be assigned its own weight.
6325 -
6326 - - If the cgroups interface is not used, only I/O priorities can be
6327 - assigned to processes, with ioprio values mapped to weights
6328 - with the relation weight = IOPRIO_BE_NR - ioprio.
6329 -
6330 - - ioprio classes are served in strict priority order, i.e., lower
6331 - priority queues are not served as long as there are higher
6332 - priority queues. Among queues in the same class the bandwidth is
6333 - distributed in proportion to the weight of each queue. A very
6334 - thin extra bandwidth is however guaranteed to the Idle class, to
6335 - prevent it from starving.
6336 -
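As a minimal, stand-alone illustration of the ioprio-to-weight relation above
(the helper below is hypothetical and only mirrors the stated formula, assuming
IOPRIO_BE_NR == 8 as in the mainline kernel):

    #include <stdio.h>

    #define IOPRIO_BE_NR 8 /* number of best-effort I/O priority levels */

    /* hypothetical helper mirroring "weight = IOPRIO_BE_NR - ioprio" */
    static int ioprio_to_weight(int ioprio)
    {
            return IOPRIO_BE_NR - ioprio;
    }

    int main(void)
    {
            int ioprio;

            /* ioprio 0 (highest priority) -> weight 8, ioprio 7 -> weight 1 */
            for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                    printf("ioprio %d -> weight %d\n",
                           ioprio, ioprio_to_weight(ioprio));
            return 0;
    }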
6337 -Signed-off-by: Paolo Valente <paolo.valente@×××××××.it>
6338 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
6339 ----
6340 - block/bfq-cgroup.c | 911 ++++++++++++++
6341 - block/bfq-ioc.c | 36 +
6342 - block/bfq-iosched.c | 3298 +++++++++++++++++++++++++++++++++++++++++++++++++++
6343 - block/bfq-sched.c | 1078 +++++++++++++++++
6344 - block/bfq.h | 614 ++++++++++
6345 - 5 files changed, 5937 insertions(+)
6346 - create mode 100644 block/bfq-cgroup.c
6347 - create mode 100644 block/bfq-ioc.c
6348 - create mode 100644 block/bfq-iosched.c
6349 - create mode 100644 block/bfq-sched.c
6350 - create mode 100644 block/bfq.h
6351 -
6352 -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
6353 -new file mode 100644
6354 -index 0000000..79a288a
6355 ---- /dev/null
6356 -+++ b/block/bfq-cgroup.c
6357 -@@ -0,0 +1,911 @@
6358 -+/*
6359 -+ * BFQ: CGROUPS support.
6360 -+ *
6361 -+ * Based on ideas and code from CFQ:
6362 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
6363 -+ *
6364 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
6365 -+ * Paolo Valente <paolo.valente@×××××××.it>
6366 -+ *
6367 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
6368 -+ *
6369 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
6370 -+ */
6371 -+
6372 -+#ifdef CONFIG_CGROUP_BFQIO
6373 -+
6374 -+static DEFINE_MUTEX(bfqio_mutex);
6375 -+
6376 -+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
6377 -+{
6378 -+ return bgrp ? !bgrp->online : false;
6379 -+}
6380 -+
6381 -+static struct bfqio_cgroup bfqio_root_cgroup = {
6382 -+ .weight = BFQ_DEFAULT_GRP_WEIGHT,
6383 -+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO,
6384 -+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS,
6385 -+};
6386 -+
6387 -+static inline void bfq_init_entity(struct bfq_entity *entity,
6388 -+ struct bfq_group *bfqg)
6389 -+{
6390 -+ entity->weight = entity->new_weight;
6391 -+ entity->orig_weight = entity->new_weight;
6392 -+ entity->ioprio = entity->new_ioprio;
6393 -+ entity->ioprio_class = entity->new_ioprio_class;
6394 -+ entity->parent = bfqg->my_entity;
6395 -+ entity->sched_data = &bfqg->sched_data;
6396 -+}
6397 -+
6398 -+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
6399 -+{
6400 -+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL;
6401 -+}
6402 -+
6403 -+/*
6404 -+ * Search the hash table (for now only a list) of bgrp for the bfq_group
6405 -+ * associated with bfqd. Must be called under rcu_read_lock().
6406 -+ */
6407 -+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
6408 -+ struct bfq_data *bfqd)
6409 -+{
6410 -+ struct bfq_group *bfqg;
6411 -+ void *key;
6412 -+
6413 -+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) {
6414 -+ key = rcu_dereference(bfqg->bfqd);
6415 -+ if (key == bfqd)
6416 -+ return bfqg;
6417 -+ }
6418 -+
6419 -+ return NULL;
6420 -+}
6421 -+
6422 -+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
6423 -+ struct bfq_group *bfqg)
6424 -+{
6425 -+ struct bfq_entity *entity = &bfqg->entity;
6426 -+
6427 -+ /*
6428 -+ * If the weight of the entity has never been set via the sysfs
6429 -+ * interface, then bgrp->weight == 0. In this case we initialize
6430 -+ * the weight from the current ioprio value. Otherwise, the group
6431 -+ * weight, if set, has priority over the ioprio value.
6432 -+ */
6433 -+ if (bgrp->weight == 0) {
6434 -+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
6435 -+ entity->new_ioprio = bgrp->ioprio;
6436 -+ } else {
6437 -+ entity->new_weight = bgrp->weight;
6438 -+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
6439 -+ }
6440 -+ entity->orig_weight = entity->weight = entity->new_weight;
6441 -+ entity->ioprio = entity->new_ioprio;
6442 -+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class;
6443 -+ entity->my_sched_data = &bfqg->sched_data;
6444 -+}
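/*
 * Worked example of the rule above, assuming IOPRIO_BE_NR == 8 and that
 * bfq_ioprio_to_weight()/bfq_weight_to_ioprio() implement the linear
 * mapping "weight = IOPRIO_BE_NR - ioprio" described in the changelog:
 *
 *   bgrp->weight == 0, bgrp->ioprio == 4  ->  new_weight = 8 - 4 = 4
 *   bgrp->weight == 7                     ->  new_weight = 7,
 *                                             new_ioprio = 8 - 7 = 1
 *
 * That is, an explicitly configured group weight always overrides the
 * ioprio-derived one.
 */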
6445 -+
6446 -+static inline void bfq_group_set_parent(struct bfq_group *bfqg,
6447 -+ struct bfq_group *parent)
6448 -+{
6449 -+ struct bfq_entity *entity;
6450 -+
6451 -+ BUG_ON(parent == NULL);
6452 -+ BUG_ON(bfqg == NULL);
6453 -+
6454 -+ entity = &bfqg->entity;
6455 -+ entity->parent = parent->my_entity;
6456 -+ entity->sched_data = &parent->sched_data;
6457 -+}
6458 -+
6459 -+/**
6460 -+ * bfq_group_chain_alloc - allocate a chain of groups.
6461 -+ * @bfqd: queue descriptor.
6462 -+ * @css: the leaf cgroup_subsys_state this chain starts from.
6463 -+ *
6464 -+ * Allocate a chain of groups starting from the one belonging to
6465 -+ * @css up to the root cgroup. Stop if a cgroup on the chain
6466 -+ * to the root already has an allocated group on @bfqd.
6467 -+ */
6468 -+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
6469 -+ struct cgroup_subsys_state *css)
6470 -+{
6471 -+ struct bfqio_cgroup *bgrp;
6472 -+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;
6473 -+
6474 -+ for (; css != NULL; css = css->parent) {
6475 -+ bgrp = css_to_bfqio(css);
6476 -+
6477 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6478 -+ if (bfqg != NULL) {
6479 -+ /*
6480 -+ * All the cgroups in the path from there to the
6481 -+ * root must have a bfq_group for bfqd, so we don't
6482 -+ * need any more allocations.
6483 -+ */
6484 -+ break;
6485 -+ }
6486 -+
6487 -+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
6488 -+ if (bfqg == NULL)
6489 -+ goto cleanup;
6490 -+
6491 -+ bfq_group_init_entity(bgrp, bfqg);
6492 -+ bfqg->my_entity = &bfqg->entity;
6493 -+
6494 -+ if (leaf == NULL) {
6495 -+ leaf = bfqg;
6496 -+ prev = leaf;
6497 -+ } else {
6498 -+ bfq_group_set_parent(prev, bfqg);
6499 -+ /*
6500 -+ * Build a list of allocated nodes using the bfqd
6501 -+ * field, which is still unused and will be initialized
6502 -+ * only after the node is connected.
6503 -+ */
6504 -+ prev->bfqd = bfqg;
6505 -+ prev = bfqg;
6506 -+ }
6507 -+ }
6508 -+
6509 -+ return leaf;
6510 -+
6511 -+cleanup:
6512 -+ while (leaf != NULL) {
6513 -+ prev = leaf;
6514 -+ leaf = leaf->bfqd;
6515 -+ kfree(prev);
6516 -+ }
6517 -+
6518 -+ return NULL;
6519 -+}
6520 -+
6521 -+/**
6522 -+ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy.
6523 -+ * @bfqd: the queue descriptor.
6524 -+ * @css: the leaf cgroup_subsys_state to start from.
6525 -+ * @leaf: the leaf group (to be associated to @cgroup).
6526 -+ *
6527 -+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
6528 -+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
6529 -+ * hierarchy that already has a group associated with @bfqd, all the nodes
6530 -+ * in the path to the root cgroup have one too.
6531 -+ *
6532 -+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
6533 -+ * per device) while the bfqio_cgroup lock protects the list of groups
6534 -+ * belonging to the same cgroup.
6535 -+ */
6536 -+static void bfq_group_chain_link(struct bfq_data *bfqd,
6537 -+ struct cgroup_subsys_state *css,
6538 -+ struct bfq_group *leaf)
6539 -+{
6540 -+ struct bfqio_cgroup *bgrp;
6541 -+ struct bfq_group *bfqg, *next, *prev = NULL;
6542 -+ unsigned long flags;
6543 -+
6544 -+ assert_spin_locked(bfqd->queue->queue_lock);
6545 -+
6546 -+ for (; css != NULL && leaf != NULL; css = css->parent) {
6547 -+ bgrp = css_to_bfqio(css);
6548 -+ next = leaf->bfqd;
6549 -+
6550 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6551 -+ BUG_ON(bfqg != NULL);
6552 -+
6553 -+ spin_lock_irqsave(&bgrp->lock, flags);
6554 -+
6555 -+ rcu_assign_pointer(leaf->bfqd, bfqd);
6556 -+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
6557 -+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);
6558 -+
6559 -+ spin_unlock_irqrestore(&bgrp->lock, flags);
6560 -+
6561 -+ prev = leaf;
6562 -+ leaf = next;
6563 -+ }
6564 -+
6565 -+ BUG_ON(css == NULL && leaf != NULL);
6566 -+ if (css != NULL && prev != NULL) {
6567 -+ bgrp = css_to_bfqio(css);
6568 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6569 -+ bfq_group_set_parent(prev, bfqg);
6570 -+ }
6571 -+}
6572 -+
6573 -+/**
6574 -+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup.
6575 -+ * @bfqd: queue descriptor.
6576 -+ * @cgroup: cgroup being searched for.
6577 -+ *
6578 -+ * Return a group associated to @bfqd in @cgroup, allocating one if
6579 -+ * necessary. When a group is returned all the cgroups in the path
6580 -+ * to the root have a group associated to @bfqd.
6581 -+ *
6582 -+ * If the allocation fails, return the root group: this breaks guarantees
6583 -+ * but is a safe fallback. If this loss becomes a problem it can be
6584 -+ * mitigated using the equivalent weight (given by the product of the
6585 -+ * weights of the groups in the path from @group to the root) in the
6586 -+ * root scheduler.
6587 -+ *
6588 -+ * We allocate all the missing nodes in the path from the leaf cgroup
6589 -+ * to the root and we connect the nodes only after all the allocations
6590 -+ * have been successful.
6591 -+ */
6592 -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
6593 -+ struct cgroup_subsys_state *css)
6594 -+{
6595 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
6596 -+ struct bfq_group *bfqg;
6597 -+
6598 -+ bfqg = bfqio_lookup_group(bgrp, bfqd);
6599 -+ if (bfqg != NULL)
6600 -+ return bfqg;
6601 -+
6602 -+ bfqg = bfq_group_chain_alloc(bfqd, css);
6603 -+ if (bfqg != NULL)
6604 -+ bfq_group_chain_link(bfqd, css, bfqg);
6605 -+ else
6606 -+ bfqg = bfqd->root_group;
6607 -+
6608 -+ return bfqg;
6609 -+}
6610 -+
6611 -+/**
6612 -+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
6613 -+ * @bfqd: queue descriptor.
6614 -+ * @bfqq: the queue to move.
6615 -+ * @entity: @bfqq's entity.
6616 -+ * @bfqg: the group to move to.
6617 -+ *
6618 -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
6619 -+ * it on the new one. Avoid putting the entity on the old group idle tree.
6620 -+ *
6621 -+ * Must be called under the queue lock; the cgroup owning @bfqg must
6622 -+ * not disappear (by now this just means that we are called under
6623 -+ * rcu_read_lock()).
6624 -+ */
6625 -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
6626 -+ struct bfq_entity *entity, struct bfq_group *bfqg)
6627 -+{
6628 -+ int busy, resume;
6629 -+
6630 -+ busy = bfq_bfqq_busy(bfqq);
6631 -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
6632 -+
6633 -+ BUG_ON(resume && !entity->on_st);
6634 -+ BUG_ON(busy && !resume && entity->on_st &&
6635 -+ bfqq != bfqd->in_service_queue);
6636 -+
6637 -+ if (busy) {
6638 -+ BUG_ON(atomic_read(&bfqq->ref) < 2);
6639 -+
6640 -+ if (!resume)
6641 -+ bfq_del_bfqq_busy(bfqd, bfqq, 0);
6642 -+ else
6643 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
6644 -+ } else if (entity->on_st)
6645 -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
6646 -+
6647 -+ /*
6648 -+ * Here we use a reference to bfqg. We don't need a refcounter
6649 -+ * as the cgroup reference will not be dropped, so that its
6650 -+ * destroy() callback will not be invoked.
6651 -+ */
6652 -+ entity->parent = bfqg->my_entity;
6653 -+ entity->sched_data = &bfqg->sched_data;
6654 -+
6655 -+ if (busy && resume)
6656 -+ bfq_activate_bfqq(bfqd, bfqq);
6657 -+
6658 -+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
6659 -+ bfq_schedule_dispatch(bfqd);
6660 -+}
6661 -+
6662 -+/**
6663 -+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
6664 -+ * @bfqd: the queue descriptor.
6665 -+ * @bic: the bic to move.
6666 -+ * @cgroup: the cgroup to move to.
6667 -+ *
6668 -+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller
6669 -+ * has to make sure that the reference to cgroup is valid across the call.
6670 -+ *
6671 -+ * NOTE: an alternative approach might have been to store the current
6672 -+ * cgroup in bfqq and get a reference to it, reducing the lookup
6673 -+ * time here, at the price of slightly more complex code.
6674 -+ */
6675 -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
6676 -+ struct bfq_io_cq *bic,
6677 -+ struct cgroup_subsys_state *css)
6678 -+{
6679 -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
6680 -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
6681 -+ struct bfq_entity *entity;
6682 -+ struct bfq_group *bfqg;
6683 -+ struct bfqio_cgroup *bgrp;
6684 -+
6685 -+ bgrp = css_to_bfqio(css);
6686 -+
6687 -+ bfqg = bfq_find_alloc_group(bfqd, css);
6688 -+ if (async_bfqq != NULL) {
6689 -+ entity = &async_bfqq->entity;
6690 -+
6691 -+ if (entity->sched_data != &bfqg->sched_data) {
6692 -+ bic_set_bfqq(bic, NULL, 0);
6693 -+ bfq_log_bfqq(bfqd, async_bfqq,
6694 -+ "bic_change_group: %p %d",
6695 -+ async_bfqq, atomic_read(&async_bfqq->ref));
6696 -+ bfq_put_queue(async_bfqq);
6697 -+ }
6698 -+ }
6699 -+
6700 -+ if (sync_bfqq != NULL) {
6701 -+ entity = &sync_bfqq->entity;
6702 -+ if (entity->sched_data != &bfqg->sched_data)
6703 -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
6704 -+ }
6705 -+
6706 -+ return bfqg;
6707 -+}
6708 -+
6709 -+/**
6710 -+ * bfq_bic_change_cgroup - move @bic to @cgroup.
6711 -+ * @bic: the bic being migrated.
6712 -+ * @cgroup: the destination cgroup.
6713 -+ *
6714 -+ * When the task owning @bic is moved to @cgroup, @bic is immediately
6715 -+ * moved into its new parent group.
6716 -+ */
6717 -+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
6718 -+ struct cgroup_subsys_state *css)
6719 -+{
6720 -+ struct bfq_data *bfqd;
6721 -+ unsigned long uninitialized_var(flags);
6722 -+
6723 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
6724 -+ &flags);
6725 -+ if (bfqd != NULL) {
6726 -+ __bfq_bic_change_cgroup(bfqd, bic, css);
6727 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6728 -+ }
6729 -+}
6730 -+
6731 -+/**
6732 -+ * bfq_bic_update_cgroup - update the cgroup of @bic.
6733 -+ * @bic: the @bic to update.
6734 -+ *
6735 -+ * Make sure that @bic is enqueued in the cgroup of the current task.
6736 -+ * We need this in addition to moving bics during the cgroup attach
6737 -+ * phase because the task owning @bic could be at its first disk
6738 -+ * access or we may end up in the root cgroup as the result of a
6739 -+ * memory allocation failure and here we try to move to the right
6740 -+ * group.
6741 -+ *
6742 -+ * Must be called under the queue lock. It is safe to use the returned
6743 -+ * value even after the rcu_read_unlock() as the migration/destruction
6744 -+ * paths act under the queue lock too. IOW it is impossible to race with
6745 -+ * group migration/destruction and end up with an invalid group as:
6746 -+ * a) here cgroup has not yet been destroyed, nor its destroy callback
6747 -+ * has started execution, as current holds a reference to it,
6748 -+ * b) if it is destroyed after rcu_read_unlock() [after current is
6749 -+ * migrated to a different cgroup] its attach() callback will have
6750 -+ * taken care of removing all the references to the old cgroup data.
6751 -+ */
6752 -+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
6753 -+{
6754 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
6755 -+ struct bfq_group *bfqg;
6756 -+ struct cgroup_subsys_state *css;
6757 -+
6758 -+ BUG_ON(bfqd == NULL);
6759 -+
6760 -+ rcu_read_lock();
6761 -+ css = task_css(current, bfqio_subsys_id);
6762 -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css);
6763 -+ rcu_read_unlock();
6764 -+
6765 -+ return bfqg;
6766 -+}
6767 -+
6768 -+/**
6769 -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
6770 -+ * @st: the service tree being flushed.
6771 -+ */
6772 -+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
6773 -+{
6774 -+ struct bfq_entity *entity = st->first_idle;
6775 -+
6776 -+ for (; entity != NULL; entity = st->first_idle)
6777 -+ __bfq_deactivate_entity(entity, 0);
6778 -+}
6779 -+
6780 -+/**
6781 -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
6782 -+ * @bfqd: the device data structure with the root group.
6783 -+ * @entity: the entity to move.
6784 -+ */
6785 -+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
6786 -+ struct bfq_entity *entity)
6787 -+{
6788 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
6789 -+
6790 -+ BUG_ON(bfqq == NULL);
6791 -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
6792 -+ return;
6793 -+}
6794 -+
6795 -+/**
6796 -+ * bfq_reparent_active_entities - move to the root group all active entities.
6797 -+ * @bfqd: the device data structure with the root group.
6798 -+ * @bfqg: the group to move from.
6799 -+ * @st: the service tree with the entities.
6800 -+ *
6801 -+ * Needs queue_lock to be taken and reference to be valid over the call.
6802 -+ */
6803 -+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
6804 -+ struct bfq_group *bfqg,
6805 -+ struct bfq_service_tree *st)
6806 -+{
6807 -+ struct rb_root *active = &st->active;
6808 -+ struct bfq_entity *entity = NULL;
6809 -+
6810 -+ if (!RB_EMPTY_ROOT(&st->active))
6811 -+ entity = bfq_entity_of(rb_first(active));
6812 -+
6813 -+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active)))
6814 -+ bfq_reparent_leaf_entity(bfqd, entity);
6815 -+
6816 -+ if (bfqg->sched_data.in_service_entity != NULL)
6817 -+ bfq_reparent_leaf_entity(bfqd,
6818 -+ bfqg->sched_data.in_service_entity);
6819 -+
6820 -+ return;
6821 -+}
6822 -+
6823 -+/**
6824 -+ * bfq_destroy_group - destroy @bfqg.
6825 -+ * @bgrp: the bfqio_cgroup containing @bfqg.
6826 -+ * @bfqg: the group being destroyed.
6827 -+ *
6828 -+ * Destroy @bfqg, making sure that it is not referenced from its parent.
6829 -+ */
6830 -+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
6831 -+{
6832 -+ struct bfq_data *bfqd;
6833 -+ struct bfq_service_tree *st;
6834 -+ struct bfq_entity *entity = bfqg->my_entity;
6835 -+ unsigned long uninitialized_var(flags);
6836 -+ int i;
6837 -+
6838 -+ hlist_del(&bfqg->group_node);
6839 -+
6840 -+ /*
6841 -+ * Empty all service_trees belonging to this group before deactivating
6842 -+ * the group itself.
6843 -+ */
6844 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
6845 -+ st = bfqg->sched_data.service_tree + i;
6846 -+
6847 -+ /*
6848 -+ * The idle tree may still contain bfq_queues belonging
6849 -+ * to exited tasks because they never migrated to a different
6850 -+ * cgroup from the one being destroyed now. No one else
6851 -+ * can access them so it's safe to act without any lock.
6852 -+ */
6853 -+ bfq_flush_idle_tree(st);
6854 -+
6855 -+ /*
6856 -+ * It may happen that some queues are still active
6857 -+ * (busy) upon group destruction (if the corresponding
6858 -+ * processes have been forced to terminate). We move
6859 -+ * all the leaf entities corresponding to these queues
6860 -+ * to the root_group.
6861 -+ * Also, it may happen that the group has an entity
6862 -+ * under service, which is disconnected from the active
6863 -+ * tree: it must be moved, too.
6864 -+ * There is no need to put the sync queues, as the
6865 -+ * scheduler has taken no reference.
6866 -+ */
6867 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6868 -+ if (bfqd != NULL) {
6869 -+ bfq_reparent_active_entities(bfqd, bfqg, st);
6870 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6871 -+ }
6872 -+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
6873 -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
6874 -+ }
6875 -+ BUG_ON(bfqg->sched_data.next_in_service != NULL);
6876 -+ BUG_ON(bfqg->sched_data.in_service_entity != NULL);
6877 -+
6878 -+ /*
6879 -+ * We may race with device destruction, take extra care when
6880 -+ * dereferencing bfqg->bfqd.
6881 -+ */
6882 -+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
6883 -+ if (bfqd != NULL) {
6884 -+ hlist_del(&bfqg->bfqd_node);
6885 -+ __bfq_deactivate_entity(entity, 0);
6886 -+ bfq_put_async_queues(bfqd, bfqg);
6887 -+ bfq_put_bfqd_unlock(bfqd, &flags);
6888 -+ }
6889 -+ BUG_ON(entity->tree != NULL);
6890 -+
6891 -+ /*
6892 -+ * No need to defer the kfree() to the end of the RCU grace
6893 -+ * period: we are called from the destroy() callback of our
6894 -+ * cgroup, so we can be sure that no one is a) still using
6895 -+ * this cgroup or b) doing lookups in it.
6896 -+ */
6897 -+ kfree(bfqg);
6898 -+}
6899 -+
6900 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
6901 -+{
6902 -+ struct hlist_node *tmp;
6903 -+ struct bfq_group *bfqg;
6904 -+
6905 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
6906 -+ bfq_end_raising_async_queues(bfqd, bfqg);
6907 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
6908 -+}
6909 -+
6910 -+/**
6911 -+ * bfq_disconnect_groups - disconnect @bfqd from all its groups.
6912 -+ * @bfqd: the device descriptor being exited.
6913 -+ *
6914 -+ * When the device exits we just make sure that no lookup can return
6915 -+ * the now unused group structures. They will be deallocated on cgroup
6916 -+ * destruction.
6917 -+ */
6918 -+static void bfq_disconnect_groups(struct bfq_data *bfqd)
6919 -+{
6920 -+ struct hlist_node *tmp;
6921 -+ struct bfq_group *bfqg;
6922 -+
6923 -+ bfq_log(bfqd, "disconnect_groups beginning");
6924 -+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) {
6925 -+ hlist_del(&bfqg->bfqd_node);
6926 -+
6927 -+ __bfq_deactivate_entity(bfqg->my_entity, 0);
6928 -+
6929 -+ /*
6930 -+ * Don't remove from the group hash, just set an
6931 -+ * invalid key. No lookups can race with the
6932 -+ * assignment as bfqd is being destroyed; this
6933 -+ * implies also that new elements cannot be added
6934 -+ * to the list.
6935 -+ */
6936 -+ rcu_assign_pointer(bfqg->bfqd, NULL);
6937 -+
6938 -+ bfq_log(bfqd, "disconnect_groups: put async for group %p",
6939 -+ bfqg);
6940 -+ bfq_put_async_queues(bfqd, bfqg);
6941 -+ }
6942 -+}
6943 -+
6944 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
6945 -+{
6946 -+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
6947 -+ struct bfq_group *bfqg = bfqd->root_group;
6948 -+
6949 -+ bfq_put_async_queues(bfqd, bfqg);
6950 -+
6951 -+ spin_lock_irq(&bgrp->lock);
6952 -+ hlist_del_rcu(&bfqg->group_node);
6953 -+ spin_unlock_irq(&bgrp->lock);
6954 -+
6955 -+ /*
6956 -+ * No need to synchronize_rcu() here: since the device is gone
6957 -+ * there cannot be any read-side access to its root_group.
6958 -+ */
6959 -+ kfree(bfqg);
6960 -+}
6961 -+
6962 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
6963 -+{
6964 -+ struct bfq_group *bfqg;
6965 -+ struct bfqio_cgroup *bgrp;
6966 -+ int i;
6967 -+
6968 -+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
6969 -+ if (bfqg == NULL)
6970 -+ return NULL;
6971 -+
6972 -+ bfqg->entity.parent = NULL;
6973 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
6974 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
6975 -+
6976 -+ bgrp = &bfqio_root_cgroup;
6977 -+ spin_lock_irq(&bgrp->lock);
6978 -+ rcu_assign_pointer(bfqg->bfqd, bfqd);
6979 -+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
6980 -+ spin_unlock_irq(&bgrp->lock);
6981 -+
6982 -+ return bfqg;
6983 -+}
6984 -+
6985 -+#define SHOW_FUNCTION(__VAR) \
6986 -+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
6987 -+ struct cftype *cftype) \
6988 -+{ \
6989 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
6990 -+ u64 ret = -ENODEV; \
6991 -+ \
6992 -+ mutex_lock(&bfqio_mutex); \
6993 -+ if (bfqio_is_removed(bgrp)) \
6994 -+ goto out_unlock; \
6995 -+ \
6996 -+ spin_lock_irq(&bgrp->lock); \
6997 -+ ret = bgrp->__VAR; \
6998 -+ spin_unlock_irq(&bgrp->lock); \
6999 -+ \
7000 -+out_unlock: \
7001 -+ mutex_unlock(&bfqio_mutex); \
7002 -+ return ret; \
7003 -+}
7004 -+
7005 -+SHOW_FUNCTION(weight);
7006 -+SHOW_FUNCTION(ioprio);
7007 -+SHOW_FUNCTION(ioprio_class);
7008 -+#undef SHOW_FUNCTION
7009 -+
7010 -+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \
7011 -+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
7012 -+ struct cftype *cftype, \
7013 -+ u64 val) \
7014 -+{ \
7015 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \
7016 -+ struct bfq_group *bfqg; \
7017 -+ int ret = -EINVAL; \
7018 -+ \
7019 -+ if (val < (__MIN) || val > (__MAX)) \
7020 -+ return ret; \
7021 -+ \
7022 -+ ret = -ENODEV; \
7023 -+ mutex_lock(&bfqio_mutex); \
7024 -+ if (bfqio_is_removed(bgrp)) \
7025 -+ goto out_unlock; \
7026 -+ ret = 0; \
7027 -+ \
7028 -+ spin_lock_irq(&bgrp->lock); \
7029 -+ bgrp->__VAR = (unsigned short)val; \
7030 -+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \
7031 -+ /* \
7032 -+ * Setting the ioprio_changed flag of the entity \
7033 -+ * to 1 with new_##__VAR == ##__VAR would re-set \
7034 -+ * the value of the weight to its ioprio mapping. \
7035 -+ * Set the flag only if necessary. \
7036 -+ */ \
7037 -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \
7038 -+ bfqg->entity.new_##__VAR = (unsigned short)val; \
7039 -+ smp_wmb(); \
7040 -+ bfqg->entity.ioprio_changed = 1; \
7041 -+ } \
7042 -+ } \
7043 -+ spin_unlock_irq(&bgrp->lock); \
7044 -+ \
7045 -+out_unlock: \
7046 -+ mutex_unlock(&bfqio_mutex); \
7047 -+ return ret; \
7048 -+}
7049 -+
7050 -+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
7051 -+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
7052 -+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
7053 -+#undef STORE_FUNCTION
7054 -+
7055 -+static struct cftype bfqio_files[] = {
7056 -+ {
7057 -+ .name = "weight",
7058 -+ .read_u64 = bfqio_cgroup_weight_read,
7059 -+ .write_u64 = bfqio_cgroup_weight_write,
7060 -+ },
7061 -+ {
7062 -+ .name = "ioprio",
7063 -+ .read_u64 = bfqio_cgroup_ioprio_read,
7064 -+ .write_u64 = bfqio_cgroup_ioprio_write,
7065 -+ },
7066 -+ {
7067 -+ .name = "ioprio_class",
7068 -+ .read_u64 = bfqio_cgroup_ioprio_class_read,
7069 -+ .write_u64 = bfqio_cgroup_ioprio_class_write,
7070 -+ },
7071 -+ { }, /* terminate */
7072 -+};
7073 -+
7074 -+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
7075 -+ *parent_css)
7076 -+{
7077 -+ struct bfqio_cgroup *bgrp;
7078 -+
7079 -+ if (parent_css != NULL) {
7080 -+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
7081 -+ if (bgrp == NULL)
7082 -+ return ERR_PTR(-ENOMEM);
7083 -+ } else
7084 -+ bgrp = &bfqio_root_cgroup;
7085 -+
7086 -+ spin_lock_init(&bgrp->lock);
7087 -+ INIT_HLIST_HEAD(&bgrp->group_data);
7088 -+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;
7089 -+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
7090 -+
7091 -+ return &bgrp->css;
7092 -+}
7093 -+
7094 -+/*
7095 -+ * We cannot support shared io contexts, as we have no means to support
7096 -+ * two tasks with the same ioc in two different groups without major rework
7097 -+ * of the main bic/bfqq data structures. By now we allow a task to change
7098 -+ * its cgroup only if it's the only owner of its ioc; the drawback of this
7099 -+ * behavior is that a group containing a task that forked using CLONE_IO
7100 -+ * will not be destroyed until the tasks sharing the ioc die.
7101 -+ */
7102 -+static int bfqio_can_attach(struct cgroup_subsys_state *css,
7103 -+ struct cgroup_taskset *tset)
7104 -+{
7105 -+ struct task_struct *task;
7106 -+ struct io_context *ioc;
7107 -+ int ret = 0;
7108 -+
7109 -+ cgroup_taskset_for_each(task, css, tset) {
7110 -+ /*
7111 -+ * task_lock() is needed to avoid races with
7112 -+ * exit_io_context()
7113 -+ */
7114 -+ task_lock(task);
7115 -+ ioc = task->io_context;
7116 -+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
7117 -+ /*
7118 -+ * ioc == NULL means that the task is either too young
7119 -+ * or exiting: if it still has no ioc the ioc can't be
7120 -+ * shared, if the task is exiting the attach will fail
7121 -+ * anyway, no matter what we return here.
7122 -+ */
7123 -+ ret = -EINVAL;
7124 -+ task_unlock(task);
7125 -+ if (ret)
7126 -+ break;
7127 -+ }
7128 -+
7129 -+ return ret;
7130 -+}
7131 -+
7132 -+static void bfqio_attach(struct cgroup_subsys_state *css,
7133 -+ struct cgroup_taskset *tset)
7134 -+{
7135 -+ struct task_struct *task;
7136 -+ struct io_context *ioc;
7137 -+ struct io_cq *icq;
7138 -+
7139 -+ /*
7140 -+ * IMPORTANT NOTE: The move of more than one process at a time to a
7141 -+ * new group has not yet been tested.
7142 -+ */
7143 -+ cgroup_taskset_for_each(task, css, tset) {
7144 -+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
7145 -+ if (ioc) {
7146 -+ /*
7147 -+ * Handle cgroup change here.
7148 -+ */
7149 -+ rcu_read_lock();
7150 -+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node)
7151 -+ if (!strncmp(
7152 -+ icq->q->elevator->type->elevator_name,
7153 -+ "bfq", ELV_NAME_MAX))
7154 -+ bfq_bic_change_cgroup(icq_to_bic(icq),
7155 -+ css);
7156 -+ rcu_read_unlock();
7157 -+ put_io_context(ioc);
7158 -+ }
7159 -+ }
7160 -+}
7161 -+
7162 -+static void bfqio_destroy(struct cgroup_subsys_state *css)
7163 -+{
7164 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7165 -+ struct hlist_node *tmp;
7166 -+ struct bfq_group *bfqg;
7167 -+
7168 -+ /*
7169 -+ * Since we are destroying the cgroup, there are no more tasks
7170 -+ * referencing it, and all the RCU grace periods that may have
7171 -+ * referenced it are ended (as the destruction of the parent
7172 -+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by
7173 -+ * anything else and we don't need any synchronization.
7174 -+ */
7175 -+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
7176 -+ bfq_destroy_group(bgrp, bfqg);
7177 -+
7178 -+ BUG_ON(!hlist_empty(&bgrp->group_data));
7179 -+
7180 -+ kfree(bgrp);
7181 -+}
7182 -+
7183 -+static int bfqio_css_online(struct cgroup_subsys_state *css)
7184 -+{
7185 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7186 -+
7187 -+ mutex_lock(&bfqio_mutex);
7188 -+ bgrp->online = true;
7189 -+ mutex_unlock(&bfqio_mutex);
7190 -+
7191 -+ return 0;
7192 -+}
7193 -+
7194 -+static void bfqio_css_offline(struct cgroup_subsys_state *css)
7195 -+{
7196 -+ struct bfqio_cgroup *bgrp = css_to_bfqio(css);
7197 -+
7198 -+ mutex_lock(&bfqio_mutex);
7199 -+ bgrp->online = false;
7200 -+ mutex_unlock(&bfqio_mutex);
7201 -+}
7202 -+
7203 -+struct cgroup_subsys bfqio_subsys = {
7204 -+ .name = "bfqio",
7205 -+ .css_alloc = bfqio_create,
7206 -+ .css_online = bfqio_css_online,
7207 -+ .css_offline = bfqio_css_offline,
7208 -+ .can_attach = bfqio_can_attach,
7209 -+ .attach = bfqio_attach,
7210 -+ .css_free = bfqio_destroy,
7211 -+ .subsys_id = bfqio_subsys_id,
7212 -+ .base_cftypes = bfqio_files,
7213 -+};
7214 -+#else
7215 -+static inline void bfq_init_entity(struct bfq_entity *entity,
7216 -+ struct bfq_group *bfqg)
7217 -+{
7218 -+ entity->weight = entity->new_weight;
7219 -+ entity->orig_weight = entity->new_weight;
7220 -+ entity->ioprio = entity->new_ioprio;
7221 -+ entity->ioprio_class = entity->new_ioprio_class;
7222 -+ entity->sched_data = &bfqg->sched_data;
7223 -+}
7224 -+
7225 -+static inline struct bfq_group *
7226 -+bfq_bic_update_cgroup(struct bfq_io_cq *bic)
7227 -+{
7228 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
7229 -+ return bfqd->root_group;
7230 -+}
7231 -+
7232 -+static inline void bfq_bfqq_move(struct bfq_data *bfqd,
7233 -+ struct bfq_queue *bfqq,
7234 -+ struct bfq_entity *entity,
7235 -+ struct bfq_group *bfqg)
7236 -+{
7237 -+}
7238 -+
7239 -+static void bfq_end_raising_async(struct bfq_data *bfqd)
7240 -+{
7241 -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group);
7242 -+}
7243 -+
7244 -+static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
7245 -+{
7246 -+ bfq_put_async_queues(bfqd, bfqd->root_group);
7247 -+}
7248 -+
7249 -+static inline void bfq_free_root_group(struct bfq_data *bfqd)
7250 -+{
7251 -+ kfree(bfqd->root_group);
7252 -+}
7253 -+
7254 -+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
7255 -+{
7256 -+ struct bfq_group *bfqg;
7257 -+ int i;
7258 -+
7259 -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
7260 -+ if (bfqg == NULL)
7261 -+ return NULL;
7262 -+
7263 -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
7264 -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
7265 -+
7266 -+ return bfqg;
7267 -+}
7268 -+#endif
7269 -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c
7270 -new file mode 100644
7271 -index 0000000..7f6b000
7272 ---- /dev/null
7273 -+++ b/block/bfq-ioc.c
7274 -@@ -0,0 +1,36 @@
7275 -+/*
7276 -+ * BFQ: I/O context handling.
7277 -+ *
7278 -+ * Based on ideas and code from CFQ:
7279 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7280 -+ *
7281 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7282 -+ * Paolo Valente <paolo.valente@×××××××.it>
7283 -+ *
7284 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7285 -+ */
7286 -+
7287 -+/**
7288 -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
7289 -+ * @icq: the iocontext queue.
7290 -+ */
7291 -+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
7292 -+{
7293 -+ /* bic->icq is the first member, %NULL will convert to %NULL */
7294 -+ return container_of(icq, struct bfq_io_cq, icq);
7295 -+}
7296 -+
7297 -+/**
7298 -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
7299 -+ * @bfqd: the lookup key.
7300 -+ * @ioc: the io_context of the process doing I/O.
7301 -+ *
7302 -+ * Queue lock must be held.
7303 -+ */
7304 -+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
7305 -+ struct io_context *ioc)
7306 -+{
7307 -+ if (ioc)
7308 -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
7309 -+ return NULL;
7310 -+}
7311 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
7312 -new file mode 100644
7313 -index 0000000..eb760de
7314 ---- /dev/null
7315 -+++ b/block/bfq-iosched.c
7316 -@@ -0,0 +1,3298 @@
7317 -+/*
7318 -+ * BFQ, or Budget Fair Queueing, disk scheduler.
7319 -+ *
7320 -+ * Based on ideas and code from CFQ:
7321 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
7322 -+ *
7323 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
7324 -+ * Paolo Valente <paolo.valente@×××××××.it>
7325 -+ *
7326 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
7327 -+ *
7328 -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file.
7329 -+ *
7330 -+ * BFQ is a proportional share disk scheduling algorithm based on the
7331 -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in
7332 -+ * number of sectors, to tasks instead of time slices. The disk is not granted
7333 -+ * to the in-service task for a given time slice, but until it has exhausted
7334 -+ * its assigned budget. This change from the time to the service domain allows
7335 -+ * BFQ to distribute the disk bandwidth among tasks as desired, without any
7336 -+ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an
7337 -+ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to
7338 -+ * their budgets (more precisely BFQ schedules queues associated to tasks).
7339 -+ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to
7340 -+ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low
7341 -+ * latencies to interactive and soft real-time applications.
7342 -+ *
7343 -+ * BFQ is described in [1], where a reference to the initial, more
7344 -+ * theoretical paper on BFQ can also be found. The interested reader can find in
7345 -+ * the latter paper full details on the main algorithm as well as formulas of
7346 -+ * the guarantees, plus formal proofs of all the properties. With respect to
7347 -+ * the version of BFQ presented in these papers, this implementation adds a
7348 -+ * few more heuristics, such as the one that guarantees a low latency to soft
7349 -+ * real-time applications, and a hierarchical extension based on H-WF2Q+.
7350 -+ *
7351 -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
7352 -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
7353 -+ * complexity derives from the one introduced with EEVDF in [3].
7354 -+ *
7355 -+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
7356 -+ * with the BFQ Disk I/O Scheduler'',
7357 -+ * Proceedings of the 5th Annual International Systems and Storage
7358 -+ * Conference (SYSTOR '12), June 2012.
7359 -+ *
7360 -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
7361 -+ *
7362 -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
7363 -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
7364 -+ * Oct 1997.
7365 -+ *
7366 -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
7367 -+ *
7368 -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
7369 -+ * First: A Flexible and Accurate Mechanism for Proportional Share
7370 -+ * Resource Allocation,'' technical report.
7371 -+ *
7372 -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
7373 -+ */
7374 -+#include <linux/module.h>
7375 -+#include <linux/slab.h>
7376 -+#include <linux/blkdev.h>
7377 -+#include <linux/cgroup.h>
7378 -+#include <linux/elevator.h>
7379 -+#include <linux/jiffies.h>
7380 -+#include <linux/rbtree.h>
7381 -+#include <linux/ioprio.h>
7382 -+#include "bfq.h"
7383 -+#include "blk.h"
7384 -+
7385 -+/* Max number of dispatches in one round of service. */
7386 -+static const int bfq_quantum = 4;
7387 -+
7388 -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
7389 -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
7390 -+
7391 -+/* Maximum backwards seek, in KiB. */
7392 -+static const int bfq_back_max = 16 * 1024;
7393 -+
7394 -+/* Penalty of a backwards seek, in number of sectors. */
7395 -+static const int bfq_back_penalty = 2;
7396 -+
7397 -+/* Idling period duration, in jiffies. */
7398 -+static int bfq_slice_idle = HZ / 125;
7399 -+
7400 -+/* Default maximum budget values, in sectors and number of requests. */
7401 -+static const int bfq_default_max_budget = 16 * 1024;
7402 -+static const int bfq_max_budget_async_rq = 4;
7403 -+
7404 -+/*
7405 -+ * Async to sync throughput distribution is controlled as follows:
7406 -+ * when an async request is served, the entity is charged the number
7407 -+ * of sectors of the request, multiplied by the factor below
7408 -+ */
7409 -+static const int bfq_async_charge_factor = 10;
7410 -+
7411 -+/* Default timeout values, in jiffies, approximating CFQ defaults. */
7412 -+static const int bfq_timeout_sync = HZ / 8;
7413 -+static int bfq_timeout_async = HZ / 25;
7414 -+
7415 -+struct kmem_cache *bfq_pool;
7416 -+
7417 -+/* Below this threshold (in ms), we consider thinktime immediate. */
7418 -+#define BFQ_MIN_TT 2
7419 -+
7420 -+/* hw_tag detection: parallel requests threshold and min samples needed. */
7421 -+#define BFQ_HW_QUEUE_THRESHOLD 4
7422 -+#define BFQ_HW_QUEUE_SAMPLES 32
7423 -+
7424 -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024)
7425 -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
7426 -+
7427 -+/* Min samples used for peak rate estimation (for autotuning). */
7428 -+#define BFQ_PEAK_RATE_SAMPLES 32
7429 -+
7430 -+/* Shift used for peak rate fixed precision calculations. */
7431 -+#define BFQ_RATE_SHIFT 16
7432 -+
7433 -+/*
7434 -+ * The duration of the weight raising for interactive applications is
7435 -+ * computed automatically (as default behaviour), using the following
7436 -+ * formula: duration = (R / r) * T, where r is the peak rate of the
7437 -+ * disk, and R and T are two reference parameters. In particular, R is
7438 -+ * the peak rate of a reference disk, and T is about the maximum time
7439 -+ * for starting popular large applications on that disk, under BFQ and
7440 -+ * while reading two files in parallel. Finally, BFQ uses two
7441 -+ * different pairs (R, T) depending on whether the disk is rotational
7442 -+ * or non-rotational.
7443 -+ */
7444 -+#define T_rot (msecs_to_jiffies(5500))
7445 -+#define T_nonrot (msecs_to_jiffies(2000))
7446 -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */
7447 -+#define R_rot 17415
7448 -+#define R_nonrot 34791
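/*
 * Worked example of the duration formula above, using the reference
 * parameters defined here: on a rotational disk whose estimated peak rate
 * r equals R_rot, the weight-raising duration is (R_rot / r) * T_rot,
 * i.e. the full 5500 ms; on a rotational disk estimated to be twice as
 * fast (r = 2 * R_rot) it shrinks to about 2750 ms.
 */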
7449 -+
7450 -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
7451 -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
7452 -+
7453 -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
7454 -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
7455 -+
7456 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
7457 -+
7458 -+#include "bfq-ioc.c"
7459 -+#include "bfq-sched.c"
7460 -+#include "bfq-cgroup.c"
7461 -+
7462 -+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\
7463 -+ IOPRIO_CLASS_IDLE)
7464 -+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\
7465 -+ IOPRIO_CLASS_RT)
7466 -+
7467 -+#define bfq_sample_valid(samples) ((samples) > 80)
7468 -+
7469 -+/*
7470 -+ * We regard a request as SYNC if it is either a read or has the SYNC bit
7471 -+ * set (in which case it could also be a direct WRITE).
7472 -+ */
7473 -+static inline int bfq_bio_sync(struct bio *bio)
7474 -+{
7475 -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
7476 -+ return 1;
7477 -+
7478 -+ return 0;
7479 -+}
7480 -+
7481 -+/*
7482 -+ * Schedule a run of the queue if there are pending requests and nothing in
7483 -+ * the driver that will restart queueing.
7484 -+ */
7485 -+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
7486 -+{
7487 -+ if (bfqd->queued != 0) {
7488 -+ bfq_log(bfqd, "schedule dispatch");
7489 -+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
7490 -+ }
7491 -+}
7492 -+
7493 -+/*
7494 -+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
7495 -+ * We choose the request that is closest to the head right now. Distance
7496 -+ * behind the head is penalized and only allowed to a certain extent.
7497 -+ */
7498 -+static struct request *bfq_choose_req(struct bfq_data *bfqd,
7499 -+ struct request *rq1,
7500 -+ struct request *rq2,
7501 -+ sector_t last)
7502 -+{
7503 -+ sector_t s1, s2, d1 = 0, d2 = 0;
7504 -+ unsigned long back_max;
7505 -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
7506 -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
7507 -+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */
7508 -+
7509 -+ if (rq1 == NULL || rq1 == rq2)
7510 -+ return rq2;
7511 -+ if (rq2 == NULL)
7512 -+ return rq1;
7513 -+
7514 -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
7515 -+ return rq1;
7516 -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
7517 -+ return rq2;
7518 -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
7519 -+ return rq1;
7520 -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
7521 -+ return rq2;
7522 -+
7523 -+ s1 = blk_rq_pos(rq1);
7524 -+ s2 = blk_rq_pos(rq2);
7525 -+
7526 -+ /*
7527 -+ * By definition, 1KiB is 2 sectors.
7528 -+ */
7529 -+ back_max = bfqd->bfq_back_max * 2;
7530 -+
7531 -+ /*
7532 -+ * Strict one way elevator _except_ in the case where we allow
7533 -+ * short backward seeks which are biased as twice the cost of a
7534 -+ * similar forward seek.
7535 -+ */
7536 -+ if (s1 >= last)
7537 -+ d1 = s1 - last;
7538 -+ else if (s1 + back_max >= last)
7539 -+ d1 = (last - s1) * bfqd->bfq_back_penalty;
7540 -+ else
7541 -+ wrap |= BFQ_RQ1_WRAP;
7542 -+
7543 -+ if (s2 >= last)
7544 -+ d2 = s2 - last;
7545 -+ else if (s2 + back_max >= last)
7546 -+ d2 = (last - s2) * bfqd->bfq_back_penalty;
7547 -+ else
7548 -+ wrap |= BFQ_RQ2_WRAP;
7549 -+
7550 -+ /* Found required data */
7551 -+
7552 -+ /*
7553 -+ * By doing switch() on the bit mask "wrap" we avoid having to
7554 -+ * check two variables for all permutations: --> faster!
7555 -+ */
7556 -+ switch (wrap) {
7557 -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
7558 -+ if (d1 < d2)
7559 -+ return rq1;
7560 -+ else if (d2 < d1)
7561 -+ return rq2;
7562 -+ else {
7563 -+ if (s1 >= s2)
7564 -+ return rq1;
7565 -+ else
7566 -+ return rq2;
7567 -+ }
7568 -+
7569 -+ case BFQ_RQ2_WRAP:
7570 -+ return rq1;
7571 -+ case BFQ_RQ1_WRAP:
7572 -+ return rq2;
7573 -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
7574 -+ default:
7575 -+ /*
7576 -+ * Since both rqs are wrapped,
7577 -+ * start with the one that's further behind head
7578 -+ * (--> only *one* back seek required),
7579 -+ * since back seek takes more time than forward.
7580 -+ */
7581 -+ if (s1 <= s2)
7582 -+ return rq1;
7583 -+ else
7584 -+ return rq2;
7585 -+ }
7586 -+}
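/*
 * Worked example of the distance computation above, with the default
 * values defined earlier (bfq_back_max = 16384 KiB, so back_max = 32768
 * sectors, and bfq_back_penalty = 2): with the head at sector 1000, a
 * request at sector 1300 gets d = 300, a request at sector 700 (a short
 * backward seek) gets d = (1000 - 700) * 2 = 600, and a request more
 * than back_max sectors behind the head is marked as wrapped and is only
 * chosen if the alternative is wrapped as well.
 */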
7587 -+
7588 -+static struct bfq_queue *
7589 -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
7590 -+ sector_t sector, struct rb_node **ret_parent,
7591 -+ struct rb_node ***rb_link)
7592 -+{
7593 -+ struct rb_node **p, *parent;
7594 -+ struct bfq_queue *bfqq = NULL;
7595 -+
7596 -+ parent = NULL;
7597 -+ p = &root->rb_node;
7598 -+ while (*p) {
7599 -+ struct rb_node **n;
7600 -+
7601 -+ parent = *p;
7602 -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
7603 -+
7604 -+ /*
7605 -+ * Sort strictly based on sector. Smallest to the left,
7606 -+ * largest to the right.
7607 -+ */
7608 -+ if (sector > blk_rq_pos(bfqq->next_rq))
7609 -+ n = &(*p)->rb_right;
7610 -+ else if (sector < blk_rq_pos(bfqq->next_rq))
7611 -+ n = &(*p)->rb_left;
7612 -+ else
7613 -+ break;
7614 -+ p = n;
7615 -+ bfqq = NULL;
7616 -+ }
7617 -+
7618 -+ *ret_parent = parent;
7619 -+ if (rb_link)
7620 -+ *rb_link = p;
7621 -+
7622 -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
7623 -+ (long long unsigned)sector,
7624 -+ bfqq != NULL ? bfqq->pid : 0);
7625 -+
7626 -+ return bfqq;
7627 -+}
7628 -+
7629 -+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
7630 -+{
7631 -+ struct rb_node **p, *parent;
7632 -+ struct bfq_queue *__bfqq;
7633 -+
7634 -+ if (bfqq->pos_root != NULL) {
7635 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7636 -+ bfqq->pos_root = NULL;
7637 -+ }
7638 -+
7639 -+ if (bfq_class_idle(bfqq))
7640 -+ return;
7641 -+ if (!bfqq->next_rq)
7642 -+ return;
7643 -+
7644 -+ bfqq->pos_root = &bfqd->rq_pos_tree;
7645 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
7646 -+ blk_rq_pos(bfqq->next_rq), &parent, &p);
7647 -+ if (__bfqq == NULL) {
7648 -+ rb_link_node(&bfqq->pos_node, parent, p);
7649 -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
7650 -+ } else
7651 -+ bfqq->pos_root = NULL;
7652 -+}
7653 -+
7654 -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
7655 -+ struct bfq_queue *bfqq,
7656 -+ struct request *last)
7657 -+{
7658 -+ struct rb_node *rbnext = rb_next(&last->rb_node);
7659 -+ struct rb_node *rbprev = rb_prev(&last->rb_node);
7660 -+ struct request *next = NULL, *prev = NULL;
7661 -+
7662 -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node));
7663 -+
7664 -+ if (rbprev != NULL)
7665 -+ prev = rb_entry_rq(rbprev);
7666 -+
7667 -+ if (rbnext != NULL)
7668 -+ next = rb_entry_rq(rbnext);
7669 -+ else {
7670 -+ rbnext = rb_first(&bfqq->sort_list);
7671 -+ if (rbnext && rbnext != &last->rb_node)
7672 -+ next = rb_entry_rq(rbnext);
7673 -+ }
7674 -+
7675 -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
7676 -+}
7677 -+
7678 -+static void bfq_del_rq_rb(struct request *rq)
7679 -+{
7680 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7681 -+ struct bfq_data *bfqd = bfqq->bfqd;
7682 -+ const int sync = rq_is_sync(rq);
7683 -+
7684 -+ BUG_ON(bfqq->queued[sync] == 0);
7685 -+ bfqq->queued[sync]--;
7686 -+ bfqd->queued--;
7687 -+
7688 -+ elv_rb_del(&bfqq->sort_list, rq);
7689 -+
7690 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
7691 -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
7692 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
7693 -+ /*
7694 -+ * Remove queue from request-position tree as it is empty.
7695 -+ */
7696 -+ if (bfqq->pos_root != NULL) {
7697 -+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
7698 -+ bfqq->pos_root = NULL;
7699 -+ }
7700 -+ }
7701 -+}
7702 -+
7703 -+/* see the definition of bfq_async_charge_factor for details */
7704 -+static inline unsigned long bfq_serv_to_charge(struct request *rq,
7705 -+ struct bfq_queue *bfqq)
7706 -+{
7707 -+ return blk_rq_sectors(rq) *
7708 -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) *
7709 -+ bfq_async_charge_factor));
7710 -+}
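/*
 * Worked example of the charge above, with the bfq_async_charge_factor
 * of 10 defined earlier: a 64-sector sync request is charged 64 sectors;
 * a 64-sector async request from a non-weight-raised queue
 * (raising_coeff == 1) is charged 64 * (1 + 10) = 704 sectors; an async
 * request from a weight-raised queue is charged only its 64 sectors.
 */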
7711 -+
7712 -+/**
7713 -+ * bfq_updated_next_req - update the queue after a new next_rq selection.
7714 -+ * @bfqd: the device data the queue belongs to.
7715 -+ * @bfqq: the queue to update.
7716 -+ *
7717 -+ * If the first request of a queue changes we make sure that the queue
7718 -+ * has enough budget to serve at least its first request (if the
7719 -+ * request has grown). We do this because if the queue does not have enough
7720 -+ * budget for its first request, it has to go through two dispatch
7721 -+ * rounds to actually get it dispatched.
7722 -+ */
7723 -+static void bfq_updated_next_req(struct bfq_data *bfqd,
7724 -+ struct bfq_queue *bfqq)
7725 -+{
7726 -+ struct bfq_entity *entity = &bfqq->entity;
7727 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
7728 -+ struct request *next_rq = bfqq->next_rq;
7729 -+ unsigned long new_budget;
7730 -+
7731 -+ if (next_rq == NULL)
7732 -+ return;
7733 -+
7734 -+ if (bfqq == bfqd->in_service_queue)
7735 -+ /*
7736 -+ * In order not to break guarantees, budgets cannot be
7737 -+ * changed after an entity has been selected.
7738 -+ */
7739 -+ return;
7740 -+
7741 -+ BUG_ON(entity->tree != &st->active);
7742 -+ BUG_ON(entity == entity->sched_data->in_service_entity);
7743 -+
7744 -+ new_budget = max_t(unsigned long, bfqq->max_budget,
7745 -+ bfq_serv_to_charge(next_rq, bfqq));
7746 -+ entity->budget = new_budget;
7747 -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget);
7748 -+ bfq_activate_bfqq(bfqd, bfqq);
7749 -+}
7750 -+
7751 -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
7752 -+{
7753 -+ u64 dur;
7754 -+
7755 -+ if (bfqd->bfq_raising_max_time > 0)
7756 -+ return bfqd->bfq_raising_max_time;
7757 -+
7758 -+ dur = bfqd->RT_prod;
7759 -+ do_div(dur, bfqd->peak_rate);
7760 -+
7761 -+ return dur;
7762 -+}
7763 -+
7764 -+static void bfq_add_rq_rb(struct request *rq)
7765 -+{
7766 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7767 -+ struct bfq_entity *entity = &bfqq->entity;
7768 -+ struct bfq_data *bfqd = bfqq->bfqd;
7769 -+ struct request *next_rq, *prev;
7770 -+ unsigned long old_raising_coeff = bfqq->raising_coeff;
7771 -+ int idle_for_long_time = 0;
7772 -+
7773 -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq));
7774 -+ bfqq->queued[rq_is_sync(rq)]++;
7775 -+ bfqd->queued++;
7776 -+
7777 -+ elv_rb_add(&bfqq->sort_list, rq);
7778 -+
7779 -+ /*
7780 -+ * Check if this request is a better next-serve candidate.
7781 -+ */
7782 -+ prev = bfqq->next_rq;
7783 -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
7784 -+ BUG_ON(next_rq == NULL);
7785 -+ bfqq->next_rq = next_rq;
7786 -+
7787 -+ /*
7788 -+ * Adjust priority tree position, if next_rq changes.
7789 -+ */
7790 -+ if (prev != bfqq->next_rq)
7791 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
7792 -+
7793 -+ if (!bfq_bfqq_busy(bfqq)) {
7794 -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 &&
7795 -+ time_is_before_jiffies(bfqq->soft_rt_next_start);
7796 -+ idle_for_long_time = time_is_before_jiffies(
7797 -+ bfqq->budget_timeout +
7798 -+ bfqd->bfq_raising_min_idle_time);
7799 -+ entity->budget = max_t(unsigned long, bfqq->max_budget,
7800 -+ bfq_serv_to_charge(next_rq, bfqq));
7801 -+
7802 -+ if (!bfqd->low_latency)
7803 -+ goto add_bfqq_busy;
7804 -+
7805 -+ /*
7806 -+ * If the queue is not being boosted and has been idle
7807 -+ * for enough time, start a weight-raising period
7808 -+ */
7809 -+ if (old_raising_coeff == 1 &&
7810 -+ (idle_for_long_time || soft_rt)) {
7811 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7812 -+ if (idle_for_long_time)
7813 -+ bfqq->raising_cur_max_time =
7814 -+ bfq_wrais_duration(bfqd);
7815 -+ else
7816 -+ bfqq->raising_cur_max_time =
7817 -+ bfqd->bfq_raising_rt_max_time;
7818 -+ bfq_log_bfqq(bfqd, bfqq,
7819 -+ "wrais starting at %lu, "
7820 -+ "rais_max_time %u",
7821 -+ jiffies,
7822 -+ jiffies_to_msecs(bfqq->
7823 -+ raising_cur_max_time));
7824 -+ } else if (old_raising_coeff > 1) {
7825 -+ if (idle_for_long_time)
7826 -+ bfqq->raising_cur_max_time =
7827 -+ bfq_wrais_duration(bfqd);
7828 -+ else if (bfqq->raising_cur_max_time ==
7829 -+ bfqd->bfq_raising_rt_max_time &&
7830 -+ !soft_rt) {
7831 -+ bfqq->raising_coeff = 1;
7832 -+ bfq_log_bfqq(bfqd, bfqq,
7833 -+ "wrais ending at %lu, "
7834 -+ "rais_max_time %u",
7835 -+ jiffies,
7836 -+ jiffies_to_msecs(bfqq->
7837 -+ raising_cur_max_time));
7838 -+ } else if (time_before(
7839 -+ bfqq->last_rais_start_finish +
7840 -+ bfqq->raising_cur_max_time,
7841 -+ jiffies +
7842 -+ bfqd->bfq_raising_rt_max_time) &&
7843 -+ soft_rt) {
7844 -+ /*
7845 -+ *
7846 -+ * The remaining weight-raising time is lower
7847 -+ * than bfqd->bfq_raising_rt_max_time, which
7848 -+ * means that the application is enjoying
7849 -+ * weight raising either because it was deemed soft rt
7850 -+ * in the near past, or because it was deemed
7851 -+ * interactive long ago. In both cases,
7852 -+ * resetting now the current remaining weight-
7853 -+ * raising time for the application to the
7854 -+ * weight-raising duration for soft rt
7855 -+ * applications would not cause any latency
7856 -+ * increase for the application (as the new
7857 -+ * duration would be higher than the remaining
7858 -+ * time).
7859 -+ *
7860 -+ * In addition, the application is now meeting
7861 -+ * the requirements for being deemed soft rt.
7862 -+ * In the end we can correctly and safely
7863 -+ * (re)charge the weight-raising duration for
7864 -+ * the application with the weight-raising
7865 -+ * duration for soft rt applications.
7866 -+ *
7867 -+ * In particular, doing this recharge now, i.e.,
7868 -+ * before the weight-raising period for the
7869 -+ * application finishes, reduces the probability
7870 -+ * of the following negative scenario:
7871 -+ * 1) the weight of a soft rt application is
7872 -+ * raised at startup (as for any newly
7873 -+ * created application),
7874 -+ * 2) since the application is not interactive,
7875 -+ * at a certain time weight-raising is
7876 -+ * stopped for the application,
7877 -+ * 3) at that time the application happens to
7878 -+ * still have pending requests, and hence
7879 -+ * is destined to not have a chance to be
7880 -+ * deemed soft rt before these requests are
7881 -+ * completed (see the comments to the
7882 -+ * function bfq_bfqq_softrt_next_start()
7883 -+ * for details on soft rt detection),
7884 -+ * 4) these pending requests experience a high
7885 -+ * latency because the application is not
7886 -+ * weight-raised while they are pending.
7887 -+ */
7888 -+ bfqq->last_rais_start_finish = jiffies;
7889 -+ bfqq->raising_cur_max_time =
7890 -+ bfqd->bfq_raising_rt_max_time;
7891 -+ }
7892 -+ }
7893 -+ if (old_raising_coeff != bfqq->raising_coeff)
7894 -+ entity->ioprio_changed = 1;
7895 -+add_bfqq_busy:
7896 -+ bfqq->last_idle_bklogged = jiffies;
7897 -+ bfqq->service_from_backlogged = 0;
7898 -+ bfq_clear_bfqq_softrt_update(bfqq);
7899 -+ bfq_add_bfqq_busy(bfqd, bfqq);
7900 -+ } else {
7901 -+ if (bfqd->low_latency && old_raising_coeff == 1 &&
7902 -+ !rq_is_sync(rq) &&
7903 -+ time_is_before_jiffies(
7904 -+ bfqq->last_rais_start_finish +
7905 -+ bfqd->bfq_raising_min_inter_arr_async)) {
7906 -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff;
7907 -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd);
7908 -+
7909 -+ bfqd->raised_busy_queues++;
7910 -+ entity->ioprio_changed = 1;
7911 -+ bfq_log_bfqq(bfqd, bfqq,
7912 -+ "non-idle wrais starting at %lu, "
7913 -+ "rais_max_time %u",
7914 -+ jiffies,
7915 -+ jiffies_to_msecs(bfqq->
7916 -+ raising_cur_max_time));
7917 -+ }
7918 -+ bfq_updated_next_req(bfqd, bfqq);
7919 -+ }
7920 -+
7921 -+ if (bfqd->low_latency &&
7922 -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 ||
7923 -+ idle_for_long_time))
7924 -+ bfqq->last_rais_start_finish = jiffies;
7925 -+}
7926 -+
7927 -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq)
7928 -+{
7929 -+ elv_rb_del(&bfqq->sort_list, rq);
7930 -+ bfqq->queued[rq_is_sync(rq)]--;
7931 -+ bfqq->bfqd->queued--;
7932 -+ bfq_add_rq_rb(rq);
7933 -+}
7934 -+
7935 -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
7936 -+ struct bio *bio)
7937 -+{
7938 -+ struct task_struct *tsk = current;
7939 -+ struct bfq_io_cq *bic;
7940 -+ struct bfq_queue *bfqq;
7941 -+
7942 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
7943 -+ if (bic == NULL)
7944 -+ return NULL;
7945 -+
7946 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
7947 -+ if (bfqq != NULL)
7948 -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
7949 -+
7950 -+ return NULL;
7951 -+}
7952 -+
7953 -+static void bfq_activate_request(struct request_queue *q, struct request *rq)
7954 -+{
7955 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7956 -+
7957 -+ bfqd->rq_in_driver++;
7958 -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
7959 -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
7960 -+ (long long unsigned)bfqd->last_position);
7961 -+}
7962 -+
7963 -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
7964 -+{
7965 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7966 -+
7967 -+ WARN_ON(bfqd->rq_in_driver == 0);
7968 -+ bfqd->rq_in_driver--;
7969 -+}
7970 -+
7971 -+static void bfq_remove_request(struct request *rq)
7972 -+{
7973 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
7974 -+ struct bfq_data *bfqd = bfqq->bfqd;
7975 -+
7976 -+ if (bfqq->next_rq == rq) {
7977 -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
7978 -+ bfq_updated_next_req(bfqd, bfqq);
7979 -+ }
7980 -+
7981 -+ list_del_init(&rq->queuelist);
7982 -+ bfq_del_rq_rb(rq);
7983 -+
7984 -+ if (rq->cmd_flags & REQ_META) {
7985 -+ WARN_ON(bfqq->meta_pending == 0);
7986 -+ bfqq->meta_pending--;
7987 -+ }
7988 -+}
7989 -+
7990 -+static int bfq_merge(struct request_queue *q, struct request **req,
7991 -+ struct bio *bio)
7992 -+{
7993 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
7994 -+ struct request *__rq;
7995 -+
7996 -+ __rq = bfq_find_rq_fmerge(bfqd, bio);
7997 -+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
7998 -+ *req = __rq;
7999 -+ return ELEVATOR_FRONT_MERGE;
8000 -+ }
8001 -+
8002 -+ return ELEVATOR_NO_MERGE;
8003 -+}
8004 -+
8005 -+static void bfq_merged_request(struct request_queue *q, struct request *req,
8006 -+ int type)
8007 -+{
8008 -+ if (type == ELEVATOR_FRONT_MERGE) {
8009 -+ struct bfq_queue *bfqq = RQ_BFQQ(req);
8010 -+
8011 -+ bfq_reposition_rq_rb(bfqq, req);
8012 -+ }
8013 -+}
8014 -+
8015 -+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
8016 -+ struct request *next)
8017 -+{
8018 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8019 -+
8020 -+ /*
8021 -+ * Reposition in fifo if next is older than rq.
8022 -+ */
8023 -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
8024 -+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
8025 -+ list_move(&rq->queuelist, &next->queuelist);
8026 -+ rq_set_fifo_time(rq, rq_fifo_time(next));
8027 -+ }
8028 -+
8029 -+ if (bfqq->next_rq == next)
8030 -+ bfqq->next_rq = rq;
8031 -+
8032 -+ bfq_remove_request(next);
8033 -+}
8034 -+
8035 -+/* Must be called with bfqq != NULL */
8036 -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq)
8037 -+{
8038 -+ BUG_ON(bfqq == NULL);
8039 -+ if (bfq_bfqq_busy(bfqq))
8040 -+ bfqq->bfqd->raised_busy_queues--;
8041 -+ bfqq->raising_coeff = 1;
8042 -+ bfqq->raising_cur_max_time = 0;
8043 -+ /* Trigger a weight change on the next activation of the queue */
8044 -+ bfqq->entity.ioprio_changed = 1;
8045 -+}
8046 -+
8047 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
8048 -+ struct bfq_group *bfqg)
8049 -+{
8050 -+ int i, j;
8051 -+
8052 -+ for (i = 0; i < 2; i++)
8053 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
8054 -+ if (bfqg->async_bfqq[i][j] != NULL)
8055 -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]);
8056 -+ if (bfqg->async_idle_bfqq != NULL)
8057 -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq);
8058 -+}
8059 -+
8060 -+static void bfq_end_raising(struct bfq_data *bfqd)
8061 -+{
8062 -+ struct bfq_queue *bfqq;
8063 -+
8064 -+ spin_lock_irq(bfqd->queue->queue_lock);
8065 -+
8066 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
8067 -+ bfq_bfqq_end_raising(bfqq);
8068 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
8069 -+ bfq_bfqq_end_raising(bfqq);
8070 -+ bfq_end_raising_async(bfqd);
8071 -+
8072 -+ spin_unlock_irq(bfqd->queue->queue_lock);
8073 -+}
8074 -+
8075 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
8076 -+ struct bio *bio)
8077 -+{
8078 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
8079 -+ struct bfq_io_cq *bic;
8080 -+ struct bfq_queue *bfqq;
8081 -+
8082 -+ /*
8083 -+ * Disallow merge of a sync bio into an async request.
8084 -+ */
8085 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
8086 -+ return 0;
8087 -+
8088 -+ /*
8089 -+ * Lookup the bfqq that this bio will be queued with. Allow
8090 -+ * merge only if rq is queued there.
8091 -+ * Queue lock is held here.
8092 -+ */
8093 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
8094 -+ if (bic == NULL)
8095 -+ return 0;
8096 -+
8097 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
8098 -+ return bfqq == RQ_BFQQ(rq);
8099 -+}
8100 -+
8101 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
8102 -+ struct bfq_queue *bfqq)
8103 -+{
8104 -+ if (bfqq != NULL) {
8105 -+ bfq_mark_bfqq_must_alloc(bfqq);
8106 -+ bfq_mark_bfqq_budget_new(bfqq);
8107 -+ bfq_clear_bfqq_fifo_expire(bfqq);
8108 -+
8109 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
8110 -+
8111 -+ bfq_log_bfqq(bfqd, bfqq,
8112 -+ "set_in_service_queue, cur-budget = %lu",
8113 -+ bfqq->entity.budget);
8114 -+ }
8115 -+
8116 -+ bfqd->in_service_queue = bfqq;
8117 -+}
8118 -+
8119 -+/*
8120 -+ * Get and set a new queue for service.
8121 -+ */
8122 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
8123 -+ struct bfq_queue *bfqq)
8124 -+{
8125 -+ if (!bfqq)
8126 -+ bfqq = bfq_get_next_queue(bfqd);
8127 -+ else
8128 -+ bfq_get_next_queue_forced(bfqd, bfqq);
8129 -+
8130 -+ __bfq_set_in_service_queue(bfqd, bfqq);
8131 -+ return bfqq;
8132 -+}
8133 -+
8134 -+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
8135 -+ struct request *rq)
8136 -+{
8137 -+ if (blk_rq_pos(rq) >= bfqd->last_position)
8138 -+ return blk_rq_pos(rq) - bfqd->last_position;
8139 -+ else
8140 -+ return bfqd->last_position - blk_rq_pos(rq);
8141 -+}
8142 -+
8143 -+/*
8144 -+ * Return true if bfqq has no request pending and rq is close enough to
8145 -+ * bfqd->last_position, or if rq is closer to bfqd->last_position than
8146 -+ * bfqq->next_rq
8147 -+ */
8148 -+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
8149 -+{
8150 -+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
8151 -+}
8152 -+
8153 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
8154 -+{
8155 -+ struct rb_root *root = &bfqd->rq_pos_tree;
8156 -+ struct rb_node *parent, *node;
8157 -+ struct bfq_queue *__bfqq;
8158 -+ sector_t sector = bfqd->last_position;
8159 -+
8160 -+ if (RB_EMPTY_ROOT(root))
8161 -+ return NULL;
8162 -+
8163 -+ /*
8164 -+ * First, if we find a request starting at the end of the last
8165 -+ * request, choose it.
8166 -+ */
8167 -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
8168 -+ if (__bfqq != NULL)
8169 -+ return __bfqq;
8170 -+
8171 -+ /*
8172 -+ * If the exact sector wasn't found, the parent of the NULL leaf
8173 -+ * will contain the closest sector (rq_pos_tree sorted by next_request
8174 -+ * position).
8175 -+ */
8176 -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
8177 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8178 -+ return __bfqq;
8179 -+
8180 -+ if (blk_rq_pos(__bfqq->next_rq) < sector)
8181 -+ node = rb_next(&__bfqq->pos_node);
8182 -+ else
8183 -+ node = rb_prev(&__bfqq->pos_node);
8184 -+ if (node == NULL)
8185 -+ return NULL;
8186 -+
8187 -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node);
8188 -+ if (bfq_rq_close(bfqd, __bfqq->next_rq))
8189 -+ return __bfqq;
8190 -+
8191 -+ return NULL;
8192 -+}
8193 -+
8194 -+/*
8195 -+ * bfqd - obvious
8196 -+ * cur_bfqq - passed in so that we don't decide that the current queue
8197 -+ * is closely cooperating with itself.
8198 -+ *
8199 -+ * We are assuming that cur_bfqq has dispatched at least one request,
8200 -+ * and that bfqd->last_position reflects a position on the disk associated
8201 -+ * with the I/O issued by cur_bfqq.
8202 -+ */
8203 -+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
8204 -+ struct bfq_queue *cur_bfqq)
8205 -+{
8206 -+ struct bfq_queue *bfqq;
8207 -+
8208 -+ if (bfq_class_idle(cur_bfqq))
8209 -+ return NULL;
8210 -+ if (!bfq_bfqq_sync(cur_bfqq))
8211 -+ return NULL;
8212 -+ if (BFQQ_SEEKY(cur_bfqq))
8213 -+ return NULL;
8214 -+
8215 -+ /* If device has only one backlogged bfq_queue, don't search. */
8216 -+ if (bfqd->busy_queues == 1)
8217 -+ return NULL;
8218 -+
8219 -+ /*
8220 -+ * We should notice if some of the queues are cooperating, e.g.
8221 -+ * working closely on the same area of the disk. In that case,
8222 -+ * we can group them together and not waste time idling.
8223 -+ */
8224 -+ bfqq = bfqq_close(bfqd);
8225 -+ if (bfqq == NULL || bfqq == cur_bfqq)
8226 -+ return NULL;
8227 -+
8228 -+ /*
8229 -+ * Do not merge queues from different bfq_groups.
8230 -+ */
8231 -+ if (bfqq->entity.parent != cur_bfqq->entity.parent)
8232 -+ return NULL;
8233 -+
8234 -+ /*
8235 -+ * It only makes sense to merge sync queues.
8236 -+ */
8237 -+ if (!bfq_bfqq_sync(bfqq))
8238 -+ return NULL;
8239 -+ if (BFQQ_SEEKY(bfqq))
8240 -+ return NULL;
8241 -+
8242 -+ /*
8243 -+ * Do not merge queues of different priority classes.
8244 -+ */
8245 -+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
8246 -+ return NULL;
8247 -+
8248 -+ return bfqq;
8249 -+}
8250 -+
8251 -+/*
8252 -+ * If enough samples have been computed, return the current max budget
8253 -+ * stored in bfqd, which is dynamically updated according to the
8254 -+ * estimated disk peak rate; otherwise return the default max budget
8255 -+ */
8256 -+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
8257 -+{
8258 -+ if (bfqd->budgets_assigned < 194)
8259 -+ return bfq_default_max_budget;
8260 -+ else
8261 -+ return bfqd->bfq_max_budget;
8262 -+}
8263 -+
8264 -+/*
8265 -+ * Return min budget, which is a fraction of the current or default
8266 -+ * max budget (trying with 1/32)
8267 -+ */
8268 -+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
8269 -+{
8270 -+ if (bfqd->budgets_assigned < 194)
8271 -+ return bfq_default_max_budget / 32;
8272 -+ else
8273 -+ return bfqd->bfq_max_budget / 32;
8274 -+}
8275 -+
8276 -+/*
8277 -+ * Decides whether idling should be done for the given device and
8278 -+ * the given in-service queue.
8279 -+ */
8280 -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd,
8281 -+ struct bfq_queue *in_service_bfqq)
8282 -+{
8283 -+ if (in_service_bfqq == NULL)
8284 -+ return false;
8285 -+ /*
8286 -+ * If the device is non-rotational, and hence has no seek penalty,
8287 -+ * disable idling; but do so only if:
8288 -+ * - the device supports queuing, otherwise we still have
8289 -+ * a problem with sync vs async workloads;
8290 -+ * - the queue is not weight-raised, to preserve guarantees.
8291 -+ */
8292 -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag &&
8293 -+ in_service_bfqq->raising_coeff == 1);
8294 -+}
8295 -+
8296 -+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
8297 -+{
8298 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
8299 -+ struct bfq_io_cq *bic;
8300 -+ unsigned long sl;
8301 -+
8302 -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
8303 -+
8304 -+ /* Tasks have exited, don't wait. */
8305 -+ bic = bfqd->in_service_bic;
8306 -+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0)
8307 -+ return;
8308 -+
8309 -+ bfq_mark_bfqq_wait_request(bfqq);
8310 -+
8311 -+ /*
8312 -+ * We don't want to idle for seeks, but we do want to allow
8313 -+ * fair distribution of slice time for a process doing back-to-back
8314 -+ * seeks. So allow a little bit of time for it to submit a new rq.
8315 -+ *
8316 -+ * To prevent processes with (partly) seeky workloads from
8317 -+ * being too ill-treated, grant them a small fraction of the
8318 -+ * assigned budget before reducing the waiting time to
8319 -+ * BFQ_MIN_TT. This happened to help reduce latency.
8320 -+ */
8321 -+ sl = bfqd->bfq_slice_idle;
8322 -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) &&
8323 -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 &&
8324 -+ bfqq->raising_coeff == 1)
8325 -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
8326 -+ else if (bfqq->raising_coeff > 1)
8327 -+ sl = sl * 3;
8328 -+ bfqd->last_idling_start = ktime_get();
8329 -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
8330 -+ bfq_log(bfqd, "arm idle: %u/%u ms",
8331 -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
8332 -+}
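The slice-idle choice above boils down to a small piece of arithmetic: cut the wait for seeky, non-raised queues that have already used a fair share of their budget, and stretch it for weight-raised queues. A standalone, purely illustrative C sketch of that rule (plain milliseconds instead of jiffies, made-up names, and a 2 ms floor assumed in place of the BFQ_MIN_TT-style minimum):

#include <stdio.h>

/*
 * Illustrative restatement of the slice choice in bfq_arm_slice_timer(),
 * working in milliseconds instead of jiffies; the constants and names are
 * assumptions of this sketch, not definitions from the patch.
 */
static unsigned int example_idle_slice_ms(unsigned int base_ms, int seeky,
					  int used_enough_budget, int raised)
{
	unsigned int sl = base_ms;

	if (seeky && used_enough_budget && !raised)
		sl = sl < 2 ? sl : 2;	/* shrink the wait for seeky queues */
	else if (raised)
		sl *= 3;		/* wait longer for weight-raised queues */
	return sl;
}

int main(void)
{
	printf("seeky: %u ms, raised: %u ms\n",
	       example_idle_slice_ms(8, 1, 1, 0),
	       example_idle_slice_ms(8, 0, 0, 1));
	return 0;
}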
8333 -+
8334 -+/*
8335 -+ * Set the maximum time for the in-service queue to consume its
8336 -+ * budget. This prevents seeky processes from lowering the disk
8337 -+ * throughput (always guaranteed with a time slice scheme as in CFQ).
8338 -+ */
8339 -+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
8340 -+{
8341 -+ struct bfq_queue *bfqq = bfqd->in_service_queue;
8342 -+ unsigned int timeout_coeff;
8343 -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time)
8344 -+ timeout_coeff = 1;
8345 -+ else
8346 -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
8347 -+
8348 -+ bfqd->last_budget_start = ktime_get();
8349 -+
8350 -+ bfq_clear_bfqq_budget_new(bfqq);
8351 -+ bfqq->budget_timeout = jiffies +
8352 -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
8353 -+
8354 -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
8355 -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
8356 -+ timeout_coeff));
8357 -+}
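The timeout scaling used above is easy to reproduce in isolation: soft real-time queues keep the base timeout, while other weight-raised queues have it multiplied by their weight ratio. The sketch below mirrors that rule with illustrative types and units; it is not part of the patch.

#include <stdio.h>

/* Illustrative stand-in for the per-queue fields read above. */
struct example_queue {
	unsigned int weight;		/* current (possibly raised) weight */
	unsigned int orig_weight;	/* weight before raising */
	int soft_rt_raised;		/* raised as soft real-time? */
};

/*
 * Same rule as bfq_set_budget_timeout(): soft real-time queues keep the
 * base timeout, other raised queues get it multiplied by the weight ratio,
 * so an interactive raised queue may keep the disk longer.
 */
static unsigned int example_budget_timeout(const struct example_queue *q,
					   unsigned int base_timeout)
{
	unsigned int coeff = q->soft_rt_raised ?
		1 : q->weight / q->orig_weight;

	return base_timeout * coeff;
}

int main(void)
{
	struct example_queue interactive = { .weight = 1000, .orig_weight = 100 };

	printf("timeout = %u\n", example_budget_timeout(&interactive, 125));
	return 0;
}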
8358 -+
8359 -+/*
8360 -+ * Move request from internal lists to the request queue dispatch list.
8361 -+ */
8362 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
8363 -+{
8364 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
8365 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
8366 -+
8367 -+ bfq_remove_request(rq);
8368 -+ bfqq->dispatched++;
8369 -+ elv_dispatch_sort(q, rq);
8370 -+
8371 -+ if (bfq_bfqq_sync(bfqq))
8372 -+ bfqd->sync_flight++;
8373 -+}
8374 -+
8375 -+/*
8376 -+ * Return expired entry, or NULL to just start from scratch in rbtree.
8377 -+ */
8378 -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
8379 -+{
8380 -+ struct request *rq = NULL;
8381 -+
8382 -+ if (bfq_bfqq_fifo_expire(bfqq))
8383 -+ return NULL;
8384 -+
8385 -+ bfq_mark_bfqq_fifo_expire(bfqq);
8386 -+
8387 -+ if (list_empty(&bfqq->fifo))
8388 -+ return NULL;
8389 -+
8390 -+ rq = rq_entry_fifo(bfqq->fifo.next);
8391 -+
8392 -+ if (time_before(jiffies, rq_fifo_time(rq)))
8393 -+ return NULL;
8394 -+
8395 -+ return rq;
8396 -+}
8397 -+
8398 -+/*
8399 -+ * Must be called with the queue_lock held.
8400 -+ */
8401 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
8402 -+{
8403 -+ int process_refs, io_refs;
8404 -+
8405 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
8406 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
8407 -+ BUG_ON(process_refs < 0);
8408 -+ return process_refs;
8409 -+}
8410 -+
8411 -+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
8412 -+{
8413 -+ int process_refs, new_process_refs;
8414 -+ struct bfq_queue *__bfqq;
8415 -+
8416 -+ /*
8417 -+ * If there are no process references on the new_bfqq, then it is
8418 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
8419 -+ * may have dropped their last reference (not just their last process
8420 -+ * reference).
8421 -+ */
8422 -+ if (!bfqq_process_refs(new_bfqq))
8423 -+ return;
8424 -+
8425 -+ /* Avoid a circular list and skip interim queue merges. */
8426 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
8427 -+ if (__bfqq == bfqq)
8428 -+ return;
8429 -+ new_bfqq = __bfqq;
8430 -+ }
8431 -+
8432 -+ process_refs = bfqq_process_refs(bfqq);
8433 -+ new_process_refs = bfqq_process_refs(new_bfqq);
8434 -+ /*
8435 -+ * If the process for the bfqq has gone away, there is no
8436 -+ * sense in merging the queues.
8437 -+ */
8438 -+ if (process_refs == 0 || new_process_refs == 0)
8439 -+ return;
8440 -+
8441 -+ /*
8442 -+ * Merge in the direction of the lesser amount of work.
8443 -+ */
8444 -+ if (new_process_refs >= process_refs) {
8445 -+ bfqq->new_bfqq = new_bfqq;
8446 -+ atomic_add(process_refs, &new_bfqq->ref);
8447 -+ } else {
8448 -+ new_bfqq->new_bfqq = bfqq;
8449 -+ atomic_add(new_process_refs, &bfqq->ref);
8450 -+ }
8451 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
8452 -+ new_bfqq->pid);
8453 -+}
8454 -+
8455 -+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
8456 -+{
8457 -+ struct bfq_entity *entity = &bfqq->entity;
8458 -+ return entity->budget - entity->service;
8459 -+}
8460 -+
8461 -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
8462 -+{
8463 -+ BUG_ON(bfqq != bfqd->in_service_queue);
8464 -+
8465 -+ __bfq_bfqd_reset_in_service(bfqd);
8466 -+
8467 -+ /*
8468 -+ * If this bfqq is shared between multiple processes, check
8469 -+ * to make sure that those processes are still issuing I/Os
8470 -+ * within the mean seek distance. If not, it may be time to
8471 -+ * break the queues apart again.
8472 -+ */
8473 -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
8474 -+ bfq_mark_bfqq_split_coop(bfqq);
8475 -+
8476 -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
8477 -+ /*
8478 -+ * overloading budget_timeout field to store when
8479 -+ * the queue remains with no backlog, used by
8480 -+ * the weight-raising mechanism
8481 -+ */
8482 -+ bfqq->budget_timeout = jiffies;
8483 -+ bfq_del_bfqq_busy(bfqd, bfqq, 1);
8484 -+ } else {
8485 -+ bfq_activate_bfqq(bfqd, bfqq);
8486 -+ /*
8487 -+ * Resort priority tree of potential close cooperators.
8488 -+ */
8489 -+ bfq_rq_pos_tree_add(bfqd, bfqq);
8490 -+ }
8491 -+}
8492 -+
8493 -+/**
8494 -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
8495 -+ * @bfqd: device data.
8496 -+ * @bfqq: queue to update.
8497 -+ * @reason: reason for expiration.
8498 -+ *
8499 -+ * Handle the feedback on @bfqq budget. See the body for detailed
8500 -+ * comments.
8501 -+ */
8502 -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
8503 -+ struct bfq_queue *bfqq,
8504 -+ enum bfqq_expiration reason)
8505 -+{
8506 -+ struct request *next_rq;
8507 -+ unsigned long budget, min_budget;
8508 -+
8509 -+ budget = bfqq->max_budget;
8510 -+ min_budget = bfq_min_budget(bfqd);
8511 -+
8512 -+ BUG_ON(bfqq != bfqd->in_service_queue);
8513 -+
8514 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
8515 -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
8516 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
8517 -+ budget, bfq_min_budget(bfqd));
8518 -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
8519 -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
8520 -+
8521 -+ if (bfq_bfqq_sync(bfqq)) {
8522 -+ switch (reason) {
8523 -+ /*
8524 -+ * Caveat: in all the following cases we trade latency
8525 -+ * for throughput.
8526 -+ */
8527 -+ case BFQ_BFQQ_TOO_IDLE:
8528 -+ /*
8529 -+ * This is the only case where we may reduce
8530 -+ * the budget: if there is no request of the
8531 -+ * process still waiting for completion, then
8532 -+ * we assume (tentatively) that the timer has
8533 -+ * expired because the batch of requests of
8534 -+ * the process could have been served with a
8535 -+ * smaller budget. Hence, betting that
8536 -+ * the process will behave in the same way when it
8537 -+ * becomes backlogged again, we reduce its
8538 -+ * next budget. As long as we guess right,
8539 -+ * this budget cut reduces the latency
8540 -+ * experienced by the process.
8541 -+ *
8542 -+ * However, if there are still outstanding
8543 -+ * requests, then the process may not yet have
8544 -+ * issued its next request just because it is
8545 -+ * still waiting for the completion of some of
8546 -+ * the still outstanding ones. So in this
8547 -+ * subcase we do not reduce its budget, on the
8548 -+ * contrary we increase it to possibly boost
8549 -+ * the throughput, as discussed in the
8550 -+ * comments to the BUDGET_TIMEOUT case.
8551 -+ */
8552 -+ if (bfqq->dispatched > 0) /* still outstanding reqs */
8553 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
8554 -+ else {
8555 -+ if (budget > 5 * min_budget)
8556 -+ budget -= 4 * min_budget;
8557 -+ else
8558 -+ budget = min_budget;
8559 -+ }
8560 -+ break;
8561 -+ case BFQ_BFQQ_BUDGET_TIMEOUT:
8562 -+ /*
8563 -+ * We double the budget here because: 1) it
8564 -+ * gives the chance to boost the throughput if
8565 -+ * this is not a seeky process (which may have
8566 -+ * bumped into this timeout because of, e.g.,
8567 -+ * ZBR), 2) together with charge_full_budget
8568 -+ * it helps give seeky processes higher
8569 -+ * timestamps, and hence be served less
8570 -+ * frequently.
8571 -+ */
8572 -+ budget = min(budget * 2, bfqd->bfq_max_budget);
8573 -+ break;
8574 -+ case BFQ_BFQQ_BUDGET_EXHAUSTED:
8575 -+ /*
8576 -+ * The process still has backlog, and did not
8577 -+ * let either the budget timeout or the disk
8578 -+ * idling timeout expire. Hence it is not
8579 -+ * seeky, has a short thinktime and may be
8580 -+ * happy with a higher budget too. So
8581 -+ * definitely increase the budget of this good
8582 -+ * candidate to boost the disk throughput.
8583 -+ */
8584 -+ budget = min(budget * 4, bfqd->bfq_max_budget);
8585 -+ break;
8586 -+ case BFQ_BFQQ_NO_MORE_REQUESTS:
8587 -+ /*
8588 -+ * Leave the budget unchanged.
8589 -+ */
8590 -+ default:
8591 -+ return;
8592 -+ }
8593 -+ } else /* async queue */
8594 -+ /* async queues always get the maximum possible budget
8595 -+ * (their ability to dispatch is limited by
8596 -+ * @bfqd->bfq_max_budget_async_rq).
8597 -+ */
8598 -+ budget = bfqd->bfq_max_budget;
8599 -+
8600 -+ bfqq->max_budget = budget;
8601 -+
8602 -+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
8603 -+ bfqq->max_budget > bfqd->bfq_max_budget)
8604 -+ bfqq->max_budget = bfqd->bfq_max_budget;
8605 -+
8606 -+ /*
8607 -+ * Make sure that we have enough budget for the next request.
8608 -+ * Since the finish time of the bfqq must be kept in sync with
8609 -+ * the budget, be sure to call __bfq_bfqq_expire() after the
8610 -+ * update.
8611 -+ */
8612 -+ next_rq = bfqq->next_rq;
8613 -+ if (next_rq != NULL)
8614 -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
8615 -+ bfq_serv_to_charge(next_rq, bfqq));
8616 -+ else
8617 -+ bfqq->entity.budget = bfqq->max_budget;
8618 -+
8619 -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
8620 -+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
8621 -+ bfqq->entity.budget);
8622 -+}
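The budget feedback described in the comments above reduces to a small case analysis. The following userspace sketch mirrors it with illustrative names and constants; it is only meant to make the rules easier to follow, not to restate the patch:

#include <stdio.h>

/* Illustrative mirror of the feedback rules commented above; the enum and
 * helpers are stand-ins for this sketch, not the patch's own definitions. */
enum example_reason {
	EXAMPLE_TOO_IDLE,
	EXAMPLE_BUDGET_TIMEOUT,
	EXAMPLE_BUDGET_EXHAUSTED,
	EXAMPLE_NO_MORE_REQUESTS,
};

static unsigned long example_min(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long example_next_budget(unsigned long budget,
					 unsigned long min_budget,
					 unsigned long max_budget,
					 int reqs_outstanding,
					 enum example_reason reason)
{
	switch (reason) {
	case EXAMPLE_TOO_IDLE:
		if (reqs_outstanding)		/* maybe just waiting: grow */
			return example_min(budget * 2, max_budget);
		if (budget > 5 * min_budget)	/* bet on a smaller batch */
			return budget - 4 * min_budget;
		return min_budget;
	case EXAMPLE_BUDGET_TIMEOUT:		/* double, as above */
		return example_min(budget * 2, max_budget);
	case EXAMPLE_BUDGET_EXHAUSTED:		/* good candidate: quadruple */
		return example_min(budget * 4, max_budget);
	case EXAMPLE_NO_MORE_REQUESTS:
	default:
		return budget;			/* leave it unchanged */
	}
}

int main(void)
{
	printf("too idle, no outstanding reqs: %lu\n",
	       example_next_budget(8192, 512, 16384, 0, EXAMPLE_TOO_IDLE));
	printf("budget exhausted:              %lu\n",
	       example_next_budget(8192, 512, 16384, 0,
				   EXAMPLE_BUDGET_EXHAUSTED));
	return 0;
}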
8623 -+
8624 -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
8625 -+{
8626 -+ unsigned long max_budget;
8627 -+
8628 -+ /*
8629 -+ * The max_budget calculated when autotuning is equal to the
8630 -+ * number of sectors transferred in timeout_sync at the
8631 -+ * estimated peak rate.
8632 -+ */
8633 -+ max_budget = (unsigned long)(peak_rate * 1000 *
8634 -+ timeout >> BFQ_RATE_SHIFT);
8635 -+
8636 -+ return max_budget;
8637 -+}
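In other words, the autotuned budget is the number of sectors the device can transfer, at the estimated peak rate, within the sync budget timeout. A tiny illustrative computation follows, with an assumed fixed-point shift since the real constant is defined elsewhere in the patch:

#include <stdio.h>
#include <stdint.h>

/* Assumed fixed-point shift for this sketch; the patch defines its own. */
#define EXAMPLE_RATE_SHIFT 16

/*
 * peak_rate: sectors per microsecond, fixed point with EXAMPLE_RATE_SHIFT
 * fractional bits; timeout_ms: budget timeout in milliseconds. The result
 * is the number of sectors transferable at peak_rate within the timeout,
 * the same product-and-shift as in bfq_calc_max_budget() above.
 */
static unsigned long example_max_budget(uint64_t peak_rate, uint64_t timeout_ms)
{
	return (unsigned long)((peak_rate * 1000 * timeout_ms)
			       >> EXAMPLE_RATE_SHIFT);
}

int main(void)
{
	/* ~100 MB/s is about 0.2 sectors per microsecond, in fixed point: */
	uint64_t rate = (uint64_t)(0.2 * (1 << EXAMPLE_RATE_SHIFT));

	printf("max_budget = %lu sectors\n", example_max_budget(rate, 125));
	return 0;
}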
8638 -+
8639 -+/*
8640 -+ * In addition to updating the peak rate, checks whether the process
8641 -+ * is "slow", and returns 1 if so. This slow flag is used, in addition
8642 -+ * to the budget timeout, to reduce the amount of service provided to
8643 -+ * seeky processes, and hence reduce their chances to lower the
8644 -+ * throughput. See the code for more details.
8645 -+ */
8646 -+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
8647 -+ int compensate, enum bfqq_expiration reason)
8648 -+{
8649 -+ u64 bw, usecs, expected, timeout;
8650 -+ ktime_t delta;
8651 -+ int update = 0;
8652 -+
8653 -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
8654 -+ return 0;
8655 -+
8656 -+ if (compensate)
8657 -+ delta = bfqd->last_idling_start;
8658 -+ else
8659 -+ delta = ktime_get();
8660 -+ delta = ktime_sub(delta, bfqd->last_budget_start);
8661 -+ usecs = ktime_to_us(delta);
8662 -+
8663 -+ /* Don't trust short/unrealistic values. */
8664 -+ if (usecs < 100 || usecs >= LONG_MAX)
8665 -+ return 0;
8666 -+
8667 -+ /*
8668 -+ * Calculate the bandwidth for the last slice. We use a 64 bit
8669 -+ * value to store the peak rate, in sectors per usec in fixed
8670 -+ * point math. We do so to have enough precision in the estimate
8671 -+ * and to avoid overflows.
8672 -+ */
8673 -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
8674 -+ do_div(bw, (unsigned long)usecs);
8675 -+
8676 -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
8677 -+
8678 -+ /*
8679 -+ * Use only long (> 20ms) intervals to filter out spikes for
8680 -+ * the peak rate estimation.
8681 -+ */
8682 -+ if (usecs > 20000) {
8683 -+ if (bw > bfqd->peak_rate ||
8684 -+ (!BFQQ_SEEKY(bfqq) &&
8685 -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
8686 -+ bfq_log(bfqd, "measured bw =%llu", bw);
8687 -+ /*
8688 -+ * To smooth oscillations use a low-pass filter with
8689 -+ * alpha=7/8, i.e.,
8690 -+ * new_rate = (7/8) * old_rate + (1/8) * bw
8691 -+ */
8692 -+ do_div(bw, 8);
8693 -+ if (bw == 0)
8694 -+ return 0;
8695 -+ bfqd->peak_rate *= 7;
8696 -+ do_div(bfqd->peak_rate, 8);
8697 -+ bfqd->peak_rate += bw;
8698 -+ update = 1;
8699 -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
8700 -+ }
8701 -+
8702 -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
8703 -+
8704 -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
8705 -+ bfqd->peak_rate_samples++;
8706 -+
8707 -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
8708 -+ update && bfqd->bfq_user_max_budget == 0) {
8709 -+ bfqd->bfq_max_budget =
8710 -+ bfq_calc_max_budget(bfqd->peak_rate, timeout);
8711 -+ bfq_log(bfqd, "new max_budget=%lu",
8712 -+ bfqd->bfq_max_budget);
8713 -+ }
8714 -+ }
8715 -+
8716 -+ /*
8717 -+ * If the process has been served for too short a time
8718 -+ * interval to let its possible sequential accesses prevail over
8719 -+ * the initial seek time needed to move the disk head to the
8720 -+ * first sector it requested, then give the process a chance
8721 -+ * and for the moment return false.
8722 -+ */
8723 -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
8724 -+ return 0;
8725 -+
8726 -+ /*
8727 -+ * A process is considered ``slow'' (i.e., seeky, so that we
8728 -+ * cannot treat it fairly in the service domain, as it would
8729 -+ * slow down the other processes too much) if, when a slice
8730 -+ * ends for whatever reason, it has received service at a
8731 -+ * rate that would not be high enough to complete the budget
8732 -+ * before the budget timeout expiration.
8733 -+ */
8734 -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
8735 -+
8736 -+ /*
8737 -+ * Caveat: processes doing IO in the slower disk zones will
8738 -+ * tend to be slow(er) even if not seeky. And the estimated
8739 -+ * peak rate will actually be an average over the disk
8740 -+ * surface. Hence, to not be too harsh with unlucky processes,
8741 -+ * we keep a budget/3 margin of safety before declaring a
8742 -+ * process slow.
8743 -+ */
8744 -+ return expected > (4 * bfqq->entity.budget) / 3;
8745 -+}
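The smoothing step used above is a plain exponential low-pass filter with alpha = 7/8. A minimal, self-contained illustration of how such a filter damps an outlier sample (names and units are made up for this sketch):

#include <stdio.h>
#include <stdint.h>

/*
 * The same smoothing as above, alpha = 7/8:
 *   new_rate = (7/8) * old_rate + (1/8) * sample
 * Integer-only, as in the patch; names and units are illustrative.
 */
static uint64_t example_lowpass(uint64_t old_rate, uint64_t sample)
{
	return old_rate * 7 / 8 + sample / 8;
}

int main(void)
{
	uint64_t rate = 1000;
	const uint64_t samples[] = { 1000, 250, 1000, 1000 }; /* one outlier */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		rate = example_lowpass(rate, samples[i]);
		printf("sample %llu -> rate %llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)rate);
	}
	return 0;
}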
8746 -+
8747 -+/*
8748 -+ * To be deemed as soft real-time, an application must meet two requirements.
8749 -+ * The first is that the application must not require an average bandwidth
8750 -+ * higher than the approximate bandwidth required to playback or record a
8751 -+ * compressed high-definition video.
8752 -+ * The next function is invoked on the completion of the last request of a
8753 -+ * batch, to compute the next-start time instant, soft_rt_next_start, such
8754 -+ * that, if the next request of the application does not arrive before
8755 -+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
8756 -+ *
8757 -+ * The second requirement is that the request pattern of the application is
8758 -+ * isochronous, i.e., that, after issuing a request or a batch of requests, the
8759 -+ * application stops for a while, then issues a new batch, and so on. For this
8760 -+ * reason the next function is invoked to compute soft_rt_next_start only for
8761 -+ * applications that meet this requirement, whereas soft_rt_next_start is set
8762 -+ * to infinity for applications that do not.
8763 -+ *
8764 -+ * Unfortunately, even a greedy application may happen to behave in an
8765 -+ * isochronous way if several processes are competing for the CPUs. In fact,
8766 -+ * in this scenario the application stops issuing requests while the CPUs are
8767 -+ * busy serving other processes, then restarts, then stops again for a while,
8768 -+ * and so on. In addition, if the disk achieves a low enough throughput with
8769 -+ * the request pattern issued by the application (e.g., because the request
8770 -+ * pattern is random and/or the device is slow), then the above bandwidth
8771 -+ * requirement may happen to be met too. To prevent such a greedy application
8772 -+ * from being deemed as soft real-time, a further rule is used in the computation
8773 -+ * of soft_rt_next_start: soft_rt_next_start must be higher than the current
8774 -+ * time plus the maximum time for which the arrival of a request is waited
8775 -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This
8776 -+ * filters out greedy applications, as the latter issue instead their next
8777 -+ * request as soon as possible after the last one has been completed (in
8778 -+ * contrast, when a batch of requests is completed, a soft real-time
8779 -+ * application spends some time processing data).
8780 -+ *
8781 -+ * Actually, the last filter may easily generate false positives if: only
8782 -+ * bfqd->bfq_slice_idle is used as a reference time interval, and one or
8783 -+ * both the following two cases occur:
8784 -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
8785 -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
8786 -+ * HZ=100.
8787 -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
8788 -+ * for a while, then suddenly 'jump' by several units to recover the lost
8789 -+ * increments. This seems to happen, e.g., inside virtual machines.
8790 -+ * To address this issue, we do not use as a reference time interval just
8791 -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
8792 -+ * particular we add the minimum number of jiffies for which the filter seems
8793 -+ * to be quite precise also in embedded systems and KVM/QEMU virtual machines.
8794 -+ */
8795 -+static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
8796 -+ struct bfq_queue *bfqq)
8797 -+{
8798 -+ return max(bfqq->last_idle_bklogged +
8799 -+ HZ * bfqq->service_from_backlogged /
8800 -+ bfqd->bfq_raising_max_softrt_rate,
8801 -+ jiffies + bfqq->bfqd->bfq_slice_idle + 4);
8802 -+}
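Numerically, the next-start computation charges the service received since the queue last became idle against the allowed soft real-time bandwidth, and never returns a value earlier than now plus the idling window (plus a few jiffies of slack). An illustrative userspace version of the same formula, with an assumed HZ value:

#include <stdio.h>

#define EXAMPLE_HZ 250	/* assumed for this sketch; the kernel uses CONFIG_HZ */

static unsigned long example_max(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

/*
 * Earliest "jiffies" value at which a new request may arrive without the
 * application exceeding max_rate sectors/sec on average since it last
 * became idle; the second term of the max() is the greedy-application
 * filter described above. Names here are illustrative.
 */
static unsigned long example_softrt_next_start(unsigned long now,
					       unsigned long last_idle_bklogged,
					       unsigned long service_sectors,
					       unsigned long max_rate,
					       unsigned long slice_idle)
{
	return example_max(last_idle_bklogged +
			   EXAMPLE_HZ * service_sectors / max_rate,
			   now + slice_idle + 4);
}

int main(void)
{
	/* 1400 sectors served against a 7000 sectors/sec allowance. */
	printf("next start = %lu\n",
	       example_softrt_next_start(1000, 990, 1400, 7000, 2));
	return 0;
}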
8803 -+
8804 -+/*
8805 -+ * Largest-possible time instant such that, for as long as possible, the
8806 -+ * current time will be lower than this time instant according to the macro
8807 -+ * time_is_before_jiffies().
8808 -+ */
8809 -+static inline unsigned long bfq_infinity_from_now(unsigned long now)
8810 -+{
8811 -+ return now + ULONG_MAX / 2;
8812 -+}
8813 -+
8814 -+/**
8815 -+ * bfq_bfqq_expire - expire a queue.
8816 -+ * @bfqd: device owning the queue.
8817 -+ * @bfqq: the queue to expire.
8818 -+ * @compensate: if true, compensate for the time spent idling.
8819 -+ * @reason: the reason causing the expiration.
8820 -+ *
8821 -+ *
8822 -+ * If the process associated to the queue is slow (i.e., seeky), or in
8823 -+ * case of budget timeout, or, finally, if it is async, we
8824 -+ * artificially charge it an entire budget (independently of the
8825 -+ * actual service it received). As a consequence, the queue will get
8826 -+ * higher timestamps than the correct ones upon reactivation, and
8827 -+ * hence it will be rescheduled as if it had received more service
8828 -+ * than what it actually received. In the end, this class of processes
8829 -+ * will receive less service in proportion to how slowly they consume
8830 -+ * their budgets (and hence how seriously they tend to lower the
8831 -+ * throughput).
8832 -+ *
8833 -+ * In contrast, when a queue expires because it has been idling for
8834 -+ * too long or because it exhausted its budget, we do not touch the
8835 -+ * amount of service it has received. Hence when the queue will be
8836 -+ * reactivated and its timestamps updated, the latter will be in sync
8837 -+ * with the actual service received by the queue until expiration.
8838 -+ *
8839 -+ * Charging a full budget to the first type of queues and the exact
8840 -+ * service to the others has the effect of using the WF2Q+ policy to
8841 -+ * schedule the former on a timeslice basis, without violating the
8842 -+ * service domain guarantees of the latter.
8843 -+ */
8844 -+static void bfq_bfqq_expire(struct bfq_data *bfqd,
8845 -+ struct bfq_queue *bfqq,
8846 -+ int compensate,
8847 -+ enum bfqq_expiration reason)
8848 -+{
8849 -+ int slow;
8850 -+ BUG_ON(bfqq != bfqd->in_service_queue);
8851 -+
8852 -+ /* Update disk peak rate for autotuning and check whether the
8853 -+ * process is slow (see bfq_update_peak_rate).
8854 -+ */
8855 -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
8856 -+
8857 -+ /*
8858 -+ * As above explained, 'punish' slow (i.e., seeky), timed-out
8859 -+ * and async queues, to favor sequential sync workloads.
8860 -+ *
8861 -+ * Processes doing IO in the slower disk zones will tend to be
8862 -+ * slow(er) even if not seeky. Hence, since the estimated peak
8863 -+ * rate is actually an average over the disk surface, these
8864 -+ * processes may timeout just for bad luck. To avoid punishing
8865 -+ * them we do not charge a full budget to a process that
8866 -+ * succeeded in consuming at least 2/3 of its budget.
8867 -+ */
8868 -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
8869 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3))
8870 -+ bfq_bfqq_charge_full_budget(bfqq);
8871 -+
8872 -+ bfqq->service_from_backlogged += bfqq->entity.service;
8873 -+
8874 -+ if (bfqd->low_latency && bfqq->raising_coeff == 1)
8875 -+ bfqq->last_rais_start_finish = jiffies;
8876 -+
8877 -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 &&
8878 -+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
8879 -+ /*
8880 -+ * If we get here, then the request pattern is
8881 -+ * isochronous (see the comments to the function
8882 -+ * bfq_bfqq_softrt_next_start()). However, if the
8883 -+ * queue still has in-flight requests, then it is
8884 -+ * better to postpone the computation of next_start
8885 -+ * to the next request completion. In fact, if we
8886 -+ * computed it now, then the application might pass
8887 -+ * the greedy-application filter improperly, because
8888 -+ * the arrival time of its next request may happen to be
8889 -+ * later than (jiffies + bfqq->bfqd->bfq_slice_idle)
8890 -+ * not because the application is truly soft real-
8891 -+ * time, but just because the application is currently
8892 -+ * waiting for the completion of some request before
8893 -+ * issuing, as quickly as possible, its next request.
8894 -+ */
8895 -+ if (bfqq->dispatched > 0) {
8896 -+ /*
8897 -+ * The application is still waiting for the
8898 -+ * completion of one or more requests:
8899 -+ * prevent it from possibly being incorrectly
8900 -+ * deemed as soft real-time by setting its
8901 -+ * soft_rt_next_start to infinity. In fact,
8902 -+ * without this assignment, the application
8903 -+ * would be incorrectly deemed as soft
8904 -+ * real-time if:
8905 -+ * 1) it issued a new request before the
8906 -+ * completion of all its in-flight
8907 -+ * requests, and
8908 -+ * 2) at that time, its soft_rt_next_start
8909 -+ * happened to be in the past.
8910 -+ */
8911 -+ bfqq->soft_rt_next_start =
8912 -+ bfq_infinity_from_now(jiffies);
8913 -+ bfq_mark_bfqq_softrt_update(bfqq);
8914 -+ } else
8915 -+ bfqq->soft_rt_next_start =
8916 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
8917 -+ }
8918 -+
8919 -+ bfq_log_bfqq(bfqd, bfqq,
8920 -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow,
8921 -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
8922 -+
8923 -+ /* Increase, decrease or leave budget unchanged according to reason */
8924 -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
8925 -+ __bfq_bfqq_expire(bfqd, bfqq);
8926 -+}
8927 -+
8928 -+/*
8929 -+ * Budget timeout is not implemented through a dedicated timer, but
8930 -+ * just checked on request arrivals and completions, as well as on
8931 -+ * idle timer expirations.
8932 -+ */
8933 -+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
8934 -+{
8935 -+ if (bfq_bfqq_budget_new(bfqq))
8936 -+ return 0;
8937 -+
8938 -+ if (time_before(jiffies, bfqq->budget_timeout))
8939 -+ return 0;
8940 -+
8941 -+ return 1;
8942 -+}
8943 -+
8944 -+/*
8945 -+ * If we expire a queue that is waiting for the arrival of a new
8946 -+ * request, we may prevent the fictitious timestamp backshifting that
8947 -+ * allows the guarantees of the queue to be preserved (see [1] for
8948 -+ * this tricky aspect). Hence we return true only if this condition
8949 -+ * does not hold, or if the queue is slow enough to deserve only to be
8950 -+ * kicked off for preserving a high throughput.
8951 -+*/
8952 -+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
8953 -+{
8954 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
8955 -+ "may_budget_timeout: wr %d left %d timeout %d",
8956 -+ bfq_bfqq_wait_request(bfqq),
8957 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
8958 -+ bfq_bfqq_budget_timeout(bfqq));
8959 -+
8960 -+ return (!bfq_bfqq_wait_request(bfqq) ||
8961 -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
8962 -+ &&
8963 -+ bfq_bfqq_budget_timeout(bfqq);
8964 -+}
8965 -+
8966 -+/*
8967 -+ * For weight-raised queues issuing sync requests, idling is always performed,
8968 -+ * as this is instrumental in guaranteeing a high fraction of the throughput
8969 -+ * to these queues, and hence in guaranteeing a lower latency for their
8970 -+ * requests. See [1] for details.
8971 -+ *
8972 -+ * For non-weight-raised queues, idling is instead disabled if the device is
8973 -+ * NCQ-enabled and non-rotational, as this boosts the throughput on such
8974 -+ * devices.
8975 -+ */
8976 -+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
8977 -+{
8978 -+ struct bfq_data *bfqd = bfqq->bfqd;
8979 -+
8980 -+ return bfq_bfqq_sync(bfqq) && (
8981 -+ bfqq->raising_coeff > 1 ||
8982 -+ (bfq_bfqq_idle_window(bfqq) &&
8983 -+ !(bfqd->hw_tag &&
8984 -+ (blk_queue_nonrot(bfqd->queue) ||
8985 -+ /*
8986 -+ * If there are weight-raised busy queues, then do not idle
8987 -+ * the disk for a sync non-weight-raised queue, and hence
8988 -+ * expire the queue immediately if empty. Combined with the
8989 -+ * timestamping rules of BFQ (see [1] for details), this
8990 -+ * causes sync non-weight-raised queues to get a lower
8991 -+ * fraction of the disk throughput, and hence reduces the rate
8992 -+ * at which the processes associated to these queues ask for
8993 -+ * requests from the request pool.
8994 -+ *
8995 -+ * This is beneficial for weight-raised processes, when the
8996 -+ * system operates in request-pool saturation conditions
8997 -+ * (e.g., in the presence of write hogs). In fact, if
8998 -+ * non-weight-raised processes ask for requests at a lower
8999 -+ * rate, then weight-raised processes have a higher
9000 -+ * probability to get a request from the pool immediately
9001 -+ * (or at least soon) when they need one. Hence they have a
9002 -+ * higher probability to actually get a fraction of the disk
9003 -+ * throughput proportional to their high weight. This is
9004 -+ * especially true with NCQ-enabled drives, which enqueue
9005 -+ * several requests in advance and further reorder
9006 -+ * internally-queued requests.
9007 -+ *
9008 -+ * Mistreating non-weight-raised queues in the above-described
9009 -+ * way, when there are busy weight-raised queues, seems to
9010 -+ * mitigate starvation problems in the presence of heavy write
9011 -+ * workloads and NCQ, and hence to guarantee a higher
9012 -+ * application and system responsiveness in these hostile
9013 -+ * scenarios.
9014 -+ */
9015 -+ bfqd->raised_busy_queues > 0)
9016 -+ )
9017 -+ )
9018 -+ );
9019 -+}
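Stripped of the per-queue data structures, the decision above is a small boolean expression; the sketch below restates it with illustrative flag parameters only, to make the truth table easier to read:

#include <stdio.h>

/*
 * Restatement of the expression above with plain flags: keep idling (do
 * not expire) for a sync queue that is weight-raised, or that has its
 * idle window set and the device is not an NCQ-enabled drive that is
 * either non-rotational or serving weight-raised busy queues.
 * The flag parameters are illustrative, not the patch's data structures.
 */
static int example_must_not_expire(int sync, int raised, int idle_window,
				   int ncq, int nonrot, int raised_busy)
{
	return sync && (raised ||
			(idle_window && !(ncq && (nonrot || raised_busy))));
}

int main(void)
{
	printf("plain sync queue on NCQ SSD: %d\n",
	       example_must_not_expire(1, 0, 1, 1, 1, 0));
	printf("weight-raised sync queue:    %d\n",
	       example_must_not_expire(1, 1, 1, 1, 1, 0));
	return 0;
}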
9020 -+
9021 -+/*
9022 -+ * If the in-service queue is empty, but it is sync and either of the following
9023 -+ * conditions holds, then: 1) the queue must remain in service and cannot be
9024 -+ * expired, and 2) the disk must be idled to wait for the possible arrival
9025 -+ * of a new request for the queue. The conditions are:
9026 -+ * - the device is rotational and not performing NCQ, and the queue has its
9027 -+ * idle window set (in this case, waiting for a new request for the queue
9028 -+ * is likely to boost the disk throughput);
9029 -+ * - the queue is weight-raised (waiting for the request is necessary to
9030 -+ * provide the queue with fairness and latency guarantees, see [1] for
9031 -+ * details).
9032 -+ */
9033 -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
9034 -+{
9035 -+ struct bfq_data *bfqd = bfqq->bfqd;
9036 -+
9037 -+ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
9038 -+ bfq_bfqq_must_not_expire(bfqq) &&
9039 -+ !bfq_queue_nonrot_noidle(bfqd, bfqq));
9040 -+}
9041 -+
9042 -+/*
9043 -+ * Select a queue for service. If we have a current queue in service,
9044 -+ * check whether to continue servicing it, or retrieve and set a new one.
9045 -+ */
9046 -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
9047 -+{
9048 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
9049 -+ struct request *next_rq;
9050 -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
9051 -+
9052 -+ bfqq = bfqd->in_service_queue;
9053 -+ if (bfqq == NULL)
9054 -+ goto new_queue;
9055 -+
9056 -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
9057 -+
9058 -+ /*
9059 -+ * If another queue has a request waiting within our mean seek
9060 -+ * distance, let it run. The expire code will check for close
9061 -+ * cooperators and put the close queue at the front of the
9062 -+ * service tree. If possible, merge the expiring queue with the
9063 -+ * new bfqq.
9064 -+ */
9065 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq);
9066 -+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
9067 -+ bfq_setup_merge(bfqq, new_bfqq);
9068 -+
9069 -+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
9070 -+ !timer_pending(&bfqd->idle_slice_timer) &&
9071 -+ !bfq_bfqq_must_idle(bfqq))
9072 -+ goto expire;
9073 -+
9074 -+ next_rq = bfqq->next_rq;
9075 -+ /*
9076 -+ * If bfqq has requests queued and it has enough budget left to
9077 -+ * serve them, keep the queue, otherwise expire it.
9078 -+ */
9079 -+ if (next_rq != NULL) {
9080 -+ if (bfq_serv_to_charge(next_rq, bfqq) >
9081 -+ bfq_bfqq_budget_left(bfqq)) {
9082 -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
9083 -+ goto expire;
9084 -+ } else {
9085 -+ /*
9086 -+ * The idle timer may be pending because we may not
9087 -+ * disable disk idling even when a new request arrives
9088 -+ */
9089 -+ if (timer_pending(&bfqd->idle_slice_timer)) {
9090 -+ /*
9091 -+ * If we get here: 1) at least a new request
9092 -+ * has arrived but we have not disabled the
9093 -+ * timer because the request was too small,
9094 -+ * 2) then the block layer has unplugged the
9095 -+ * device, causing the dispatch to be invoked.
9096 -+ *
9097 -+ * Since the device is unplugged, now the
9098 -+ * requests are probably large enough to
9099 -+ * provide a reasonable throughput.
9100 -+ * So we disable idling.
9101 -+ */
9102 -+ bfq_clear_bfqq_wait_request(bfqq);
9103 -+ del_timer(&bfqd->idle_slice_timer);
9104 -+ }
9105 -+ if (new_bfqq == NULL)
9106 -+ goto keep_queue;
9107 -+ else
9108 -+ goto expire;
9109 -+ }
9110 -+ }
9111 -+
9112 -+ /*
9113 -+ * No requests pending. If the in-service queue has no cooperator and
9114 -+ * still has requests in flight (possibly waiting for a completion)
9115 -+ * or is idling for a new request, then keep it.
9116 -+ */
9117 -+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
9118 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
9119 -+ bfqq = NULL;
9120 -+ goto keep_queue;
9121 -+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
9122 -+ /*
9123 -+ * The queue is being expired because there is a close
9124 -+ * cooperator; cancel the idle timer.
9125 -+ */
9126 -+ bfq_clear_bfqq_wait_request(bfqq);
9127 -+ del_timer(&bfqd->idle_slice_timer);
9128 -+ }
9129 -+
9130 -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
9131 -+expire:
9132 -+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
9133 -+new_queue:
9134 -+ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
9135 -+ bfq_log(bfqd, "select_queue: new queue %d returned",
9136 -+ bfqq != NULL ? bfqq->pid : 0);
9137 -+keep_queue:
9138 -+ return bfqq;
9139 -+}
9140 -+
9141 -+static void bfq_update_raising_data(struct bfq_data *bfqd,
9142 -+ struct bfq_queue *bfqq)
9143 -+{
9144 -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
9145 -+ struct bfq_entity *entity = &bfqq->entity;
9146 -+
9147 -+ bfq_log_bfqq(bfqd, bfqq,
9148 -+ "raising period dur %u/%u msec, "
9149 -+ "old raising coeff %u, w %d(%d)",
9150 -+ jiffies_to_msecs(jiffies -
9151 -+ bfqq->last_rais_start_finish),
9152 -+ jiffies_to_msecs(bfqq->raising_cur_max_time),
9153 -+ bfqq->raising_coeff,
9154 -+ bfqq->entity.weight, bfqq->entity.orig_weight);
9155 -+
9156 -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
9157 -+ entity->orig_weight * bfqq->raising_coeff);
9158 -+ if (entity->ioprio_changed)
9159 -+ bfq_log_bfqq(bfqd, bfqq,
9160 -+ "WARN: pending prio change");
9161 -+ /*
9162 -+ * If too much time has elapsed from the beginning
9163 -+ * of this weight-raising, stop it.
9164 -+ */
9165 -+ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
9166 -+ bfqq->raising_cur_max_time)) {
9167 -+ bfqq->last_rais_start_finish = jiffies;
9168 -+ bfq_log_bfqq(bfqd, bfqq,
9169 -+ "wrais ending at %lu, "
9170 -+ "rais_max_time %u",
9171 -+ bfqq->last_rais_start_finish,
9172 -+ jiffies_to_msecs(bfqq->
9173 -+ raising_cur_max_time));
9174 -+ bfq_bfqq_end_raising(bfqq);
9175 -+ __bfq_entity_update_weight_prio(
9176 -+ bfq_entity_service_tree(entity),
9177 -+ entity);
9178 -+ }
9179 -+ }
9180 -+}
9181 -+
9182 -+/*
9183 -+ * Dispatch one request from bfqq, moving it to the request queue
9184 -+ * dispatch list.
9185 -+ */
9186 -+static int bfq_dispatch_request(struct bfq_data *bfqd,
9187 -+ struct bfq_queue *bfqq)
9188 -+{
9189 -+ int dispatched = 0;
9190 -+ struct request *rq;
9191 -+ unsigned long service_to_charge;
9192 -+
9193 -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
9194 -+
9195 -+ /* Follow expired path, else get first next available. */
9196 -+ rq = bfq_check_fifo(bfqq);
9197 -+ if (rq == NULL)
9198 -+ rq = bfqq->next_rq;
9199 -+ service_to_charge = bfq_serv_to_charge(rq, bfqq);
9200 -+
9201 -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
9202 -+ /*
9203 -+ * This may happen if the next rq is chosen
9204 -+ * in fifo order instead of sector order.
9205 -+ * The budget is properly dimensioned
9206 -+ * to be always sufficient to serve the next request
9207 -+ * only if it is chosen in sector order. The reason is
9208 -+ * that it would be quite inefficient and of little use
9209 -+ * to always make sure that the budget is large enough
9210 -+ * to serve even the possible next rq in fifo order.
9211 -+ * In fact, requests are seldom served in fifo order.
9212 -+ *
9213 -+ * Expire the queue for budget exhaustion, and
9214 -+ * make sure that the next act_budget is enough
9215 -+ * to serve the next request, even if it comes
9216 -+ * from the fifo expired path.
9217 -+ */
9218 -+ bfqq->next_rq = rq;
9219 -+ /*
9220 -+ * Since this dispatch failed, make sure that
9221 -+ * a new one will be performed
9222 -+ */
9223 -+ if (!bfqd->rq_in_driver)
9224 -+ bfq_schedule_dispatch(bfqd);
9225 -+ goto expire;
9226 -+ }
9227 -+
9228 -+ /* Finally, insert request into driver dispatch list. */
9229 -+ bfq_bfqq_served(bfqq, service_to_charge);
9230 -+ bfq_dispatch_insert(bfqd->queue, rq);
9231 -+
9232 -+ bfq_update_raising_data(bfqd, bfqq);
9233 -+
9234 -+ bfq_log_bfqq(bfqd, bfqq,
9235 -+ "dispatched %u sec req (%llu), budg left %lu",
9236 -+ blk_rq_sectors(rq),
9237 -+ (long long unsigned)blk_rq_pos(rq),
9238 -+ bfq_bfqq_budget_left(bfqq));
9239 -+
9240 -+ dispatched++;
9241 -+
9242 -+ if (bfqd->in_service_bic == NULL) {
9243 -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
9244 -+ bfqd->in_service_bic = RQ_BIC(rq);
9245 -+ }
9246 -+
9247 -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
9248 -+ dispatched >= bfqd->bfq_max_budget_async_rq) ||
9249 -+ bfq_class_idle(bfqq)))
9250 -+ goto expire;
9251 -+
9252 -+ return dispatched;
9253 -+
9254 -+expire:
9255 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
9256 -+ return dispatched;
9257 -+}
9258 -+
9259 -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
9260 -+{
9261 -+ int dispatched = 0;
9262 -+
9263 -+ while (bfqq->next_rq != NULL) {
9264 -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
9265 -+ dispatched++;
9266 -+ }
9267 -+
9268 -+ BUG_ON(!list_empty(&bfqq->fifo));
9269 -+ return dispatched;
9270 -+}
9271 -+
9272 -+/*
9273 -+ * Drain our current requests. Used for barriers and when switching
9274 -+ * io schedulers on-the-fly.
9275 -+ */
9276 -+static int bfq_forced_dispatch(struct bfq_data *bfqd)
9277 -+{
9278 -+ struct bfq_queue *bfqq, *n;
9279 -+ struct bfq_service_tree *st;
9280 -+ int dispatched = 0;
9281 -+
9282 -+ bfqq = bfqd->in_service_queue;
9283 -+ if (bfqq != NULL)
9284 -+ __bfq_bfqq_expire(bfqd, bfqq);
9285 -+
9286 -+ /*
9287 -+ * Loop through classes, and be careful to leave the scheduler
9288 -+ * in a consistent state, as feedback mechanisms and vtime
9289 -+ * updates cannot be disabled during the process.
9290 -+ */
9291 -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
9292 -+ st = bfq_entity_service_tree(&bfqq->entity);
9293 -+
9294 -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq);
9295 -+ bfqq->max_budget = bfq_max_budget(bfqd);
9296 -+
9297 -+ bfq_forget_idle(st);
9298 -+ }
9299 -+
9300 -+ BUG_ON(bfqd->busy_queues != 0);
9301 -+
9302 -+ return dispatched;
9303 -+}
9304 -+
9305 -+static int bfq_dispatch_requests(struct request_queue *q, int force)
9306 -+{
9307 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9308 -+ struct bfq_queue *bfqq;
9309 -+ int max_dispatch;
9310 -+
9311 -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
9312 -+ if (bfqd->busy_queues == 0)
9313 -+ return 0;
9314 -+
9315 -+ if (unlikely(force))
9316 -+ return bfq_forced_dispatch(bfqd);
9317 -+
9318 -+ bfqq = bfq_select_queue(bfqd);
9319 -+ if (bfqq == NULL)
9320 -+ return 0;
9321 -+
9322 -+ max_dispatch = bfqd->bfq_quantum;
9323 -+ if (bfq_class_idle(bfqq))
9324 -+ max_dispatch = 1;
9325 -+
9326 -+ if (!bfq_bfqq_sync(bfqq))
9327 -+ max_dispatch = bfqd->bfq_max_budget_async_rq;
9328 -+
9329 -+ if (bfqq->dispatched >= max_dispatch) {
9330 -+ if (bfqd->busy_queues > 1)
9331 -+ return 0;
9332 -+ if (bfqq->dispatched >= 4 * max_dispatch)
9333 -+ return 0;
9334 -+ }
9335 -+
9336 -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
9337 -+ return 0;
9338 -+
9339 -+ bfq_clear_bfqq_wait_request(bfqq);
9340 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
9341 -+
9342 -+ if (!bfq_dispatch_request(bfqd, bfqq))
9343 -+ return 0;
9344 -+
9345 -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)",
9346 -+ bfqq->pid, max_dispatch);
9347 -+
9348 -+ return 1;
9349 -+}
9350 -+
9351 -+/*
9352 -+ * Task holds one reference to the queue, dropped when task exits. Each rq
9353 -+ * in-flight on this queue also holds a reference, dropped when rq is freed.
9354 -+ *
9355 -+ * Queue lock must be held here.
9356 -+ */
9357 -+static void bfq_put_queue(struct bfq_queue *bfqq)
9358 -+{
9359 -+ struct bfq_data *bfqd = bfqq->bfqd;
9360 -+
9361 -+ BUG_ON(atomic_read(&bfqq->ref) <= 0);
9362 -+
9363 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
9364 -+ atomic_read(&bfqq->ref));
9365 -+ if (!atomic_dec_and_test(&bfqq->ref))
9366 -+ return;
9367 -+
9368 -+ BUG_ON(rb_first(&bfqq->sort_list) != NULL);
9369 -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
9370 -+ BUG_ON(bfqq->entity.tree != NULL);
9371 -+ BUG_ON(bfq_bfqq_busy(bfqq));
9372 -+ BUG_ON(bfqd->in_service_queue == bfqq);
9373 -+
9374 -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
9375 -+
9376 -+ kmem_cache_free(bfq_pool, bfqq);
9377 -+}
9378 -+
9379 -+static void bfq_put_cooperator(struct bfq_queue *bfqq)
9380 -+{
9381 -+ struct bfq_queue *__bfqq, *next;
9382 -+
9383 -+ /*
9384 -+ * If this queue was scheduled to merge with another queue, be
9385 -+ * sure to drop the reference taken on that queue (and others in
9386 -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
9387 -+ */
9388 -+ __bfqq = bfqq->new_bfqq;
9389 -+ while (__bfqq) {
9390 -+ if (__bfqq == bfqq) {
9391 -+ WARN(1, "bfqq->new_bfqq loop detected.\n");
9392 -+ break;
9393 -+ }
9394 -+ next = __bfqq->new_bfqq;
9395 -+ bfq_put_queue(__bfqq);
9396 -+ __bfqq = next;
9397 -+ }
9398 -+}
9399 -+
9400 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
9401 -+{
9402 -+ if (bfqq == bfqd->in_service_queue) {
9403 -+ __bfq_bfqq_expire(bfqd, bfqq);
9404 -+ bfq_schedule_dispatch(bfqd);
9405 -+ }
9406 -+
9407 -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
9408 -+ atomic_read(&bfqq->ref));
9409 -+
9410 -+ bfq_put_cooperator(bfqq);
9411 -+
9412 -+ bfq_put_queue(bfqq);
9413 -+}
9414 -+
9415 -+static void bfq_init_icq(struct io_cq *icq)
9416 -+{
9417 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
9418 -+
9419 -+ bic->ttime.last_end_request = jiffies;
9420 -+}
9421 -+
9422 -+static void bfq_exit_icq(struct io_cq *icq)
9423 -+{
9424 -+ struct bfq_io_cq *bic = icq_to_bic(icq);
9425 -+ struct bfq_data *bfqd = bic_to_bfqd(bic);
9426 -+
9427 -+ if (bic->bfqq[BLK_RW_ASYNC]) {
9428 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
9429 -+ bic->bfqq[BLK_RW_ASYNC] = NULL;
9430 -+ }
9431 -+
9432 -+ if (bic->bfqq[BLK_RW_SYNC]) {
9433 -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
9434 -+ bic->bfqq[BLK_RW_SYNC] = NULL;
9435 -+ }
9436 -+}
9437 -+
9438 -+/*
9439 -+ * Update the entity prio values; note that the new values will not
9440 -+ * be used until the next (re)activation.
9441 -+ */
9442 -+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
9443 -+{
9444 -+ struct task_struct *tsk = current;
9445 -+ int ioprio_class;
9446 -+
9447 -+ if (!bfq_bfqq_prio_changed(bfqq))
9448 -+ return;
9449 -+
9450 -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9451 -+ switch (ioprio_class) {
9452 -+ default:
9453 -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
9454 -+ "bfq: bad prio %x\n", ioprio_class);
9455 -+ case IOPRIO_CLASS_NONE:
9456 -+ /*
9457 -+ * No prio set, inherit CPU scheduling settings.
9458 -+ */
9459 -+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
9460 -+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
9461 -+ break;
9462 -+ case IOPRIO_CLASS_RT:
9463 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9464 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
9465 -+ break;
9466 -+ case IOPRIO_CLASS_BE:
9467 -+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9468 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
9469 -+ break;
9470 -+ case IOPRIO_CLASS_IDLE:
9471 -+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
9472 -+ bfqq->entity.new_ioprio = 7;
9473 -+ bfq_clear_bfqq_idle_window(bfqq);
9474 -+ break;
9475 -+ }
9476 -+
9477 -+ bfqq->entity.ioprio_changed = 1;
9478 -+
9479 -+ /*
9480 -+ * Keep track of original prio settings in case we have to temporarily
9481 -+ * elevate the priority of this queue.
9482 -+ */
9483 -+ bfqq->org_ioprio = bfqq->entity.new_ioprio;
9484 -+ bfq_clear_bfqq_prio_changed(bfqq);
9485 -+}
9486 -+
9487 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic)
9488 -+{
9489 -+ struct bfq_data *bfqd;
9490 -+ struct bfq_queue *bfqq, *new_bfqq;
9491 -+ struct bfq_group *bfqg;
9492 -+ unsigned long uninitialized_var(flags);
9493 -+ int ioprio = bic->icq.ioc->ioprio;
9494 -+
9495 -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
9496 -+ &flags);
9497 -+ /*
9498 -+ * This condition may trigger on a newly created bic, be sure to drop
9499 -+ * the lock before returning.
9500 -+ */
9501 -+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio))
9502 -+ goto out;
9503 -+
9504 -+ bfqq = bic->bfqq[BLK_RW_ASYNC];
9505 -+ if (bfqq != NULL) {
9506 -+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
9507 -+ sched_data);
9508 -+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic,
9509 -+ GFP_ATOMIC);
9510 -+ if (new_bfqq != NULL) {
9511 -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
9512 -+ bfq_log_bfqq(bfqd, bfqq,
9513 -+ "changed_ioprio: bfqq %p %d",
9514 -+ bfqq, atomic_read(&bfqq->ref));
9515 -+ bfq_put_queue(bfqq);
9516 -+ }
9517 -+ }
9518 -+
9519 -+ bfqq = bic->bfqq[BLK_RW_SYNC];
9520 -+ if (bfqq != NULL)
9521 -+ bfq_mark_bfqq_prio_changed(bfqq);
9522 -+
9523 -+ bic->ioprio = ioprio;
9524 -+
9525 -+out:
9526 -+ bfq_put_bfqd_unlock(bfqd, &flags);
9527 -+}
9528 -+
9529 -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9530 -+ pid_t pid, int is_sync)
9531 -+{
9532 -+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
9533 -+ INIT_LIST_HEAD(&bfqq->fifo);
9534 -+
9535 -+ atomic_set(&bfqq->ref, 0);
9536 -+ bfqq->bfqd = bfqd;
9537 -+
9538 -+ bfq_mark_bfqq_prio_changed(bfqq);
9539 -+
9540 -+ if (is_sync) {
9541 -+ if (!bfq_class_idle(bfqq))
9542 -+ bfq_mark_bfqq_idle_window(bfqq);
9543 -+ bfq_mark_bfqq_sync(bfqq);
9544 -+ }
9545 -+
9546 -+ /* Tentative initial value to trade off between thr and lat */
9547 -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
9548 -+ bfqq->pid = pid;
9549 -+
9550 -+ bfqq->raising_coeff = 1;
9551 -+ bfqq->last_rais_start_finish = 0;
9552 -+ /*
9553 -+ * Set to the value for which bfqq will not be deemed as
9554 -+ * soft rt when it becomes backlogged.
9555 -+ */
9556 -+ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
9557 -+}
9558 -+
9559 -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
9560 -+ struct bfq_group *bfqg,
9561 -+ int is_sync,
9562 -+ struct bfq_io_cq *bic,
9563 -+ gfp_t gfp_mask)
9564 -+{
9565 -+ struct bfq_queue *bfqq, *new_bfqq = NULL;
9566 -+
9567 -+retry:
9568 -+ /* bic always exists here */
9569 -+ bfqq = bic_to_bfqq(bic, is_sync);
9570 -+
9571 -+ /*
9572 -+ * Always try a new alloc if we fall back to the OOM bfqq
9573 -+ * originally, since it should just be a temporary situation.
9574 -+ */
9575 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
9576 -+ bfqq = NULL;
9577 -+ if (new_bfqq != NULL) {
9578 -+ bfqq = new_bfqq;
9579 -+ new_bfqq = NULL;
9580 -+ } else if (gfp_mask & __GFP_WAIT) {
9581 -+ spin_unlock_irq(bfqd->queue->queue_lock);
9582 -+ new_bfqq = kmem_cache_alloc_node(bfq_pool,
9583 -+ gfp_mask | __GFP_ZERO,
9584 -+ bfqd->queue->node);
9585 -+ spin_lock_irq(bfqd->queue->queue_lock);
9586 -+ if (new_bfqq != NULL)
9587 -+ goto retry;
9588 -+ } else {
9589 -+ bfqq = kmem_cache_alloc_node(bfq_pool,
9590 -+ gfp_mask | __GFP_ZERO,
9591 -+ bfqd->queue->node);
9592 -+ }
9593 -+
9594 -+ if (bfqq != NULL) {
9595 -+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync);
9596 -+ bfq_log_bfqq(bfqd, bfqq, "allocated");
9597 -+ } else {
9598 -+ bfqq = &bfqd->oom_bfqq;
9599 -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
9600 -+ }
9601 -+
9602 -+ bfq_init_prio_data(bfqq, bic);
9603 -+ bfq_init_entity(&bfqq->entity, bfqg);
9604 -+ }
9605 -+
9606 -+ if (new_bfqq != NULL)
9607 -+ kmem_cache_free(bfq_pool, new_bfqq);
9608 -+
9609 -+ return bfqq;
9610 -+}
9611 -+
9612 -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
9613 -+ struct bfq_group *bfqg,
9614 -+ int ioprio_class, int ioprio)
9615 -+{
9616 -+ switch (ioprio_class) {
9617 -+ case IOPRIO_CLASS_RT:
9618 -+ return &bfqg->async_bfqq[0][ioprio];
9619 -+ case IOPRIO_CLASS_NONE:
9620 -+ ioprio = IOPRIO_NORM;
9621 -+ /* fall through */
9622 -+ case IOPRIO_CLASS_BE:
9623 -+ return &bfqg->async_bfqq[1][ioprio];
9624 -+ case IOPRIO_CLASS_IDLE:
9625 -+ return &bfqg->async_idle_bfqq;
9626 -+ default:
9627 -+ BUG();
9628 -+ }
9629 -+}
9630 -+
9631 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
9632 -+ struct bfq_group *bfqg, int is_sync,
9633 -+ struct bfq_io_cq *bic, gfp_t gfp_mask)
9634 -+{
9635 -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
9636 -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
9637 -+ struct bfq_queue **async_bfqq = NULL;
9638 -+ struct bfq_queue *bfqq = NULL;
9639 -+
9640 -+ if (!is_sync) {
9641 -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
9642 -+ ioprio);
9643 -+ bfqq = *async_bfqq;
9644 -+ }
9645 -+
9646 -+ if (bfqq == NULL)
9647 -+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
9648 -+
9649 -+ /*
9650 -+ * Pin the queue now that it's allocated, scheduler exit will prune it.
9651 -+ */
9652 -+ if (!is_sync && *async_bfqq == NULL) {
9653 -+ atomic_inc(&bfqq->ref);
9654 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
9655 -+ bfqq, atomic_read(&bfqq->ref));
9656 -+ *async_bfqq = bfqq;
9657 -+ }
9658 -+
9659 -+ atomic_inc(&bfqq->ref);
9660 -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
9661 -+ atomic_read(&bfqq->ref));
9662 -+ return bfqq;
9663 -+}
9664 -+
9665 -+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
9666 -+ struct bfq_io_cq *bic)
9667 -+{
9668 -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request;
9669 -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
9670 -+
9671 -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
9672 -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
9673 -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
9674 -+ bic->ttime.ttime_samples;
9675 -+}
9676 -+
9677 -+static void bfq_update_io_seektime(struct bfq_data *bfqd,
9678 -+ struct bfq_queue *bfqq,
9679 -+ struct request *rq)
9680 -+{
9681 -+ sector_t sdist;
9682 -+ u64 total;
9683 -+
9684 -+ if (bfqq->last_request_pos < blk_rq_pos(rq))
9685 -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
9686 -+ else
9687 -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq);
9688 -+
9689 -+ /*
9690 -+ * Don't allow the seek distance to get too large from the
9691 -+ * odd fragment, pagein, etc.
9692 -+ */
9693 -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */
9694 -+ sdist = 0;
9695 -+ else if (bfqq->seek_samples <= 60) /* second & third seek */
9696 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
9697 -+ else
9698 -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
9699 -+
9700 -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
9701 -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
9702 -+ total = bfqq->seek_total + (bfqq->seek_samples/2);
9703 -+ do_div(total, bfqq->seek_samples);
9704 -+ bfqq->seek_mean = (sector_t)total;
9705 -+
9706 -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
9707 -+ (u64)bfqq->seek_mean);
9708 -+}
9709 -+
9710 -+/*
9711 -+ * Disable idle window if the process thinks too long or seeks so much that
9712 -+ * it doesn't matter.
9713 -+ */
9714 -+static void bfq_update_idle_window(struct bfq_data *bfqd,
9715 -+ struct bfq_queue *bfqq,
9716 -+ struct bfq_io_cq *bic)
9717 -+{
9718 -+ int enable_idle;
9719 -+
9720 -+ /* Don't idle for async or idle io prio class. */
9721 -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
9722 -+ return;
9723 -+
9724 -+ enable_idle = bfq_bfqq_idle_window(bfqq);
9725 -+
9726 -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
9727 -+ bfqd->bfq_slice_idle == 0 ||
9728 -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
9729 -+ bfqq->raising_coeff == 1))
9730 -+ enable_idle = 0;
9731 -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
9732 -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
9733 -+ bfqq->raising_coeff == 1)
9734 -+ enable_idle = 0;
9735 -+ else
9736 -+ enable_idle = 1;
9737 -+ }
9738 -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
9739 -+ enable_idle);
9740 -+
9741 -+ if (enable_idle)
9742 -+ bfq_mark_bfqq_idle_window(bfqq);
9743 -+ else
9744 -+ bfq_clear_bfqq_idle_window(bfqq);
9745 -+}
9746 -+
9747 -+/*
9748 -+ * Called when a new fs request (rq) is added to bfqq. Check if there's
9749 -+ * something we should do about it.
9750 -+ */
9751 -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
9752 -+ struct request *rq)
9753 -+{
9754 -+ struct bfq_io_cq *bic = RQ_BIC(rq);
9755 -+
9756 -+ if (rq->cmd_flags & REQ_META)
9757 -+ bfqq->meta_pending++;
9758 -+
9759 -+ bfq_update_io_thinktime(bfqd, bic);
9760 -+ bfq_update_io_seektime(bfqd, bfqq, rq);
9761 -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
9762 -+ !BFQQ_SEEKY(bfqq))
9763 -+ bfq_update_idle_window(bfqd, bfqq, bic);
9764 -+
9765 -+ bfq_log_bfqq(bfqd, bfqq,
9766 -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
9767 -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
9768 -+ (long long unsigned)bfqq->seek_mean);
9769 -+
9770 -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
9771 -+
9772 -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
9773 -+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
9774 -+ blk_rq_sectors(rq) < 32;
9775 -+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
9776 -+
9777 -+ /*
9778 -+ * There is just this request queued: if the request
9779 -+ * is small and the queue is not to be expired, then
9780 -+ * just exit.
9781 -+ *
9782 -+ * In this way, if the disk is being idled to wait for
9783 -+ * a new request from the in-service queue, we avoid
9784 -+ * unplugging the device and committing the disk to serve
9785 -+ * just a small request. On the contrary, we wait for
9786 -+ * the block layer to decide when to unplug the device:
9787 -+ * hopefully, new requests will be merged to this one
9788 -+ * quickly, then the device will be unplugged and
9789 -+ * larger requests will be dispatched.
9790 -+ */
9791 -+ if (small_req && !budget_timeout)
9792 -+ return;
9793 -+
9794 -+ /*
9795 -+ * A large enough request arrived, or the queue is to
9796 -+ * be expired: in both cases disk idling is to be
9797 -+ * stopped, so clear wait_request flag and reset
9798 -+ * timer.
9799 -+ */
9800 -+ bfq_clear_bfqq_wait_request(bfqq);
9801 -+ del_timer(&bfqd->idle_slice_timer);
9802 -+
9803 -+ /*
9804 -+ * The queue is not empty, because a new request just
9805 -+ * arrived. Hence we can safely expire the queue, in
9806 -+ * case of budget timeout, without risking that the
9807 -+ * timestamps of the queue are not updated correctly.
9808 -+ * See [1] for more details.
9809 -+ */
9810 -+ if (budget_timeout)
9811 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9812 -+
9813 -+ /*
9814 -+ * Let the request rip immediately, or let a new queue be
9815 -+ * selected if bfqq has just been expired.
9816 -+ */
9817 -+ __blk_run_queue(bfqd->queue);
9818 -+ }
9819 -+}
9820 -+
9821 -+static void bfq_insert_request(struct request_queue *q, struct request *rq)
9822 -+{
9823 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9824 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9825 -+
9826 -+ assert_spin_locked(bfqd->queue->queue_lock);
9827 -+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
9828 -+
9829 -+ bfq_add_rq_rb(rq);
9830 -+
9831 -+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
9832 -+ list_add_tail(&rq->queuelist, &bfqq->fifo);
9833 -+
9834 -+ bfq_rq_enqueued(bfqd, bfqq, rq);
9835 -+}
9836 -+
9837 -+static void bfq_update_hw_tag(struct bfq_data *bfqd)
9838 -+{
9839 -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
9840 -+ bfqd->rq_in_driver);
9841 -+
9842 -+ if (bfqd->hw_tag == 1)
9843 -+ return;
9844 -+
9845 -+ /*
9846 -+ * This sample is valid if the number of outstanding requests
9847 -+ * is large enough to allow a queueing behavior. Note that the
9848 -+ * sum is not exact, as it's not taking into account deactivated
9849 -+ * requests.
9850 -+ */
9851 -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
9852 -+ return;
9853 -+
9854 -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
9855 -+ return;
9856 -+
9857 -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
9858 -+ bfqd->max_rq_in_driver = 0;
9859 -+ bfqd->hw_tag_samples = 0;
9860 -+}
9861 -+
9862 -+static void bfq_completed_request(struct request_queue *q, struct request *rq)
9863 -+{
9864 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9865 -+ struct bfq_data *bfqd = bfqq->bfqd;
9866 -+ const int sync = rq_is_sync(rq);
9867 -+
9868 -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)",
9869 -+ blk_rq_sectors(rq), sync);
9870 -+
9871 -+ bfq_update_hw_tag(bfqd);
9872 -+
9873 -+ WARN_ON(!bfqd->rq_in_driver);
9874 -+ WARN_ON(!bfqq->dispatched);
9875 -+ bfqd->rq_in_driver--;
9876 -+ bfqq->dispatched--;
9877 -+
9878 -+ if (bfq_bfqq_sync(bfqq))
9879 -+ bfqd->sync_flight--;
9880 -+
9881 -+ if (sync)
9882 -+ RQ_BIC(rq)->ttime.last_end_request = jiffies;
9883 -+
9884 -+ /*
9885 -+ * The computation of softrt_next_start was scheduled for the next
9886 -+ * request completion: it is now time to compute it.
9887 -+ */
9888 -+ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list))
9889 -+ bfqq->soft_rt_next_start =
9890 -+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
9891 -+
9892 -+ /*
9893 -+ * If this is the in-service queue, check if it needs to be expired,
9894 -+ * or if we want to idle in case it has no pending requests.
9895 -+ */
9896 -+ if (bfqd->in_service_queue == bfqq) {
9897 -+ if (bfq_bfqq_budget_new(bfqq))
9898 -+ bfq_set_budget_timeout(bfqd);
9899 -+
9900 -+ if (bfq_bfqq_must_idle(bfqq)) {
9901 -+ bfq_arm_slice_timer(bfqd);
9902 -+ goto out;
9903 -+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
9904 -+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
9905 -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
9906 -+ (bfqq->dispatched == 0 ||
9907 -+ !bfq_bfqq_must_not_expire(bfqq)))
9908 -+ bfq_bfqq_expire(bfqd, bfqq, 0,
9909 -+ BFQ_BFQQ_NO_MORE_REQUESTS);
9910 -+ }
9911 -+
9912 -+ if (!bfqd->rq_in_driver)
9913 -+ bfq_schedule_dispatch(bfqd);
9914 -+
9915 -+out:
9916 -+ return;
9917 -+}
9918 -+
9919 -+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
9920 -+{
9921 -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
9922 -+ bfq_clear_bfqq_must_alloc(bfqq);
9923 -+ return ELV_MQUEUE_MUST;
9924 -+ }
9925 -+
9926 -+ return ELV_MQUEUE_MAY;
9927 -+}
9928 -+
9929 -+static int bfq_may_queue(struct request_queue *q, int rw)
9930 -+{
9931 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
9932 -+ struct task_struct *tsk = current;
9933 -+ struct bfq_io_cq *bic;
9934 -+ struct bfq_queue *bfqq;
9935 -+
9936 -+ /*
9937 -+ * Don't force setup of a queue from here, as a call to may_queue
9938 -+ * does not necessarily imply that a request actually will be queued.
9939 -+ * So just lookup a possibly existing queue, or return 'may queue'
9940 -+ * if that fails.
9941 -+ */
9942 -+ bic = bfq_bic_lookup(bfqd, tsk->io_context);
9943 -+ if (bic == NULL)
9944 -+ return ELV_MQUEUE_MAY;
9945 -+
9946 -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
9947 -+ if (bfqq != NULL) {
9948 -+ bfq_init_prio_data(bfqq, bic);
9949 -+
9950 -+ return __bfq_may_queue(bfqq);
9951 -+ }
9952 -+
9953 -+ return ELV_MQUEUE_MAY;
9954 -+}
9955 -+
9956 -+/*
9957 -+ * Queue lock held here.
9958 -+ */
9959 -+static void bfq_put_request(struct request *rq)
9960 -+{
9961 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
9962 -+
9963 -+ if (bfqq != NULL) {
9964 -+ const int rw = rq_data_dir(rq);
9965 -+
9966 -+ BUG_ON(!bfqq->allocated[rw]);
9967 -+ bfqq->allocated[rw]--;
9968 -+
9969 -+ rq->elv.priv[0] = NULL;
9970 -+ rq->elv.priv[1] = NULL;
9971 -+
9972 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
9973 -+ bfqq, atomic_read(&bfqq->ref));
9974 -+ bfq_put_queue(bfqq);
9975 -+ }
9976 -+}
9977 -+
9978 -+static struct bfq_queue *
9979 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
9980 -+ struct bfq_queue *bfqq)
9981 -+{
9982 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
9983 -+ (long unsigned)bfqq->new_bfqq->pid);
9984 -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1);
9985 -+ bfq_mark_bfqq_coop(bfqq->new_bfqq);
9986 -+ bfq_put_queue(bfqq);
9987 -+ return bic_to_bfqq(bic, 1);
9988 -+}
9989 -+
9990 -+/*
9991 -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
9992 -+ * was the last process referring to said bfqq.
9993 -+ */
9994 -+static struct bfq_queue *
9995 -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
9996 -+{
9997 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
9998 -+ if (bfqq_process_refs(bfqq) == 1) {
9999 -+ bfqq->pid = current->pid;
10000 -+ bfq_clear_bfqq_coop(bfqq);
10001 -+ bfq_clear_bfqq_split_coop(bfqq);
10002 -+ return bfqq;
10003 -+ }
10004 -+
10005 -+ bic_set_bfqq(bic, NULL, 1);
10006 -+
10007 -+ bfq_put_cooperator(bfqq);
10008 -+
10009 -+ bfq_put_queue(bfqq);
10010 -+ return NULL;
10011 -+}
10012 -+
10013 -+/*
10014 -+ * Allocate bfq data structures associated with this request.
10015 -+ */
10016 -+static int bfq_set_request(struct request_queue *q, struct request *rq,
10017 -+ struct bio *bio, gfp_t gfp_mask)
10018 -+{
10019 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
10020 -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
10021 -+ const int rw = rq_data_dir(rq);
10022 -+ const int is_sync = rq_is_sync(rq);
10023 -+ struct bfq_queue *bfqq;
10024 -+ struct bfq_group *bfqg;
10025 -+ unsigned long flags;
10026 -+
10027 -+ might_sleep_if(gfp_mask & __GFP_WAIT);
10028 -+
10029 -+ bfq_changed_ioprio(bic);
10030 -+
10031 -+ spin_lock_irqsave(q->queue_lock, flags);
10032 -+
10033 -+ if (bic == NULL)
10034 -+ goto queue_fail;
10035 -+
10036 -+ bfqg = bfq_bic_update_cgroup(bic);
10037 -+
10038 -+new_queue:
10039 -+ bfqq = bic_to_bfqq(bic, is_sync);
10040 -+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
10041 -+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
10042 -+ bic_set_bfqq(bic, bfqq, is_sync);
10043 -+ } else {
10044 -+ /*
10045 -+ * If the queue was seeky for too long, break it apart.
10046 -+ */
10047 -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
10048 -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
10049 -+ bfqq = bfq_split_bfqq(bic, bfqq);
10050 -+ if (!bfqq)
10051 -+ goto new_queue;
10052 -+ }
10053 -+
10054 -+ /*
10055 -+ * Check to see if this queue is scheduled to merge with
10056 -+ * another closely cooperating queue. The merging of queues
10057 -+ * happens here as it must be done in process context.
10058 -+ * The reference on new_bfqq was taken in merge_bfqqs.
10059 -+ */
10060 -+ if (bfqq->new_bfqq != NULL)
10061 -+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
10062 -+ }
10063 -+
10064 -+ bfqq->allocated[rw]++;
10065 -+ atomic_inc(&bfqq->ref);
10066 -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
10067 -+ atomic_read(&bfqq->ref));
10068 -+
10069 -+ rq->elv.priv[0] = bic;
10070 -+ rq->elv.priv[1] = bfqq;
10071 -+
10072 -+ spin_unlock_irqrestore(q->queue_lock, flags);
10073 -+
10074 -+ return 0;
10075 -+
10076 -+queue_fail:
10077 -+ bfq_schedule_dispatch(bfqd);
10078 -+ spin_unlock_irqrestore(q->queue_lock, flags);
10079 -+
10080 -+ return 1;
10081 -+}
10082 -+
10083 -+static void bfq_kick_queue(struct work_struct *work)
10084 -+{
10085 -+ struct bfq_data *bfqd =
10086 -+ container_of(work, struct bfq_data, unplug_work);
10087 -+ struct request_queue *q = bfqd->queue;
10088 -+
10089 -+ spin_lock_irq(q->queue_lock);
10090 -+ __blk_run_queue(q);
10091 -+ spin_unlock_irq(q->queue_lock);
10092 -+}
10093 -+
10094 -+/*
10095 -+ * Handler of the expiration of the timer running if the in-service queue
10096 -+ * is idling inside its time slice.
10097 -+ */
10098 -+static void bfq_idle_slice_timer(unsigned long data)
10099 -+{
10100 -+ struct bfq_data *bfqd = (struct bfq_data *)data;
10101 -+ struct bfq_queue *bfqq;
10102 -+ unsigned long flags;
10103 -+ enum bfqq_expiration reason;
10104 -+
10105 -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags);
10106 -+
10107 -+ bfqq = bfqd->in_service_queue;
10108 -+ /*
10109 -+ * Theoretical race here: the in-service queue can be NULL or different
10110 -+ * from the queue that was idling if the timer handler spins on
10111 -+ * the queue_lock and a new request arrives for the current
10112 -+ * queue and there is a full dispatch cycle that changes the
10113 -+ * in-service queue. This can hardly happen, but in the worst case
10114 -+ * we just expire a queue too early.
10115 -+ */
10116 -+ if (bfqq != NULL) {
10117 -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
10118 -+ if (bfq_bfqq_budget_timeout(bfqq))
10119 -+ /*
10120 -+ * Also here the queue can be safely expired
10121 -+ * for budget timeout without wasting
10122 -+ * guarantees
10123 -+ */
10124 -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
10125 -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
10126 -+ /*
10127 -+ * The queue may not be empty upon timer expiration,
10128 -+ * because we may not disable the timer when the first
10129 -+ * request of the in-service queue arrives during
10130 -+ * disk idling
10131 -+ */
10132 -+ reason = BFQ_BFQQ_TOO_IDLE;
10133 -+ else
10134 -+ goto schedule_dispatch;
10135 -+
10136 -+ bfq_bfqq_expire(bfqd, bfqq, 1, reason);
10137 -+ }
10138 -+
10139 -+schedule_dispatch:
10140 -+ bfq_schedule_dispatch(bfqd);
10141 -+
10142 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
10143 -+}
10144 -+
10145 -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
10146 -+{
10147 -+ del_timer_sync(&bfqd->idle_slice_timer);
10148 -+ cancel_work_sync(&bfqd->unplug_work);
10149 -+}
10150 -+
10151 -+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
10152 -+ struct bfq_queue **bfqq_ptr)
10153 -+{
10154 -+ struct bfq_group *root_group = bfqd->root_group;
10155 -+ struct bfq_queue *bfqq = *bfqq_ptr;
10156 -+
10157 -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
10158 -+ if (bfqq != NULL) {
10159 -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
10160 -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
10161 -+ bfqq, atomic_read(&bfqq->ref));
10162 -+ bfq_put_queue(bfqq);
10163 -+ *bfqq_ptr = NULL;
10164 -+ }
10165 -+}
10166 -+
10167 -+/*
10168 -+ * Release all the bfqg references to its async queues. If we are
10169 -+ * deallocating the group these queues may still contain requests, so
10170 -+ * we reparent them to the root cgroup (i.e., the only one that will
10171 -+ * exist for sure until all the requests on a device are gone).
10172 -+ */
10173 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
10174 -+{
10175 -+ int i, j;
10176 -+
10177 -+ for (i = 0; i < 2; i++)
10178 -+ for (j = 0; j < IOPRIO_BE_NR; j++)
10179 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
10180 -+
10181 -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
10182 -+}
10183 -+
10184 -+static void bfq_exit_queue(struct elevator_queue *e)
10185 -+{
10186 -+ struct bfq_data *bfqd = e->elevator_data;
10187 -+ struct request_queue *q = bfqd->queue;
10188 -+ struct bfq_queue *bfqq, *n;
10189 -+
10190 -+ bfq_shutdown_timer_wq(bfqd);
10191 -+
10192 -+ spin_lock_irq(q->queue_lock);
10193 -+
10194 -+ BUG_ON(bfqd->in_service_queue != NULL);
10195 -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
10196 -+ bfq_deactivate_bfqq(bfqd, bfqq, 0);
10197 -+
10198 -+ bfq_disconnect_groups(bfqd);
10199 -+ spin_unlock_irq(q->queue_lock);
10200 -+
10201 -+ bfq_shutdown_timer_wq(bfqd);
10202 -+
10203 -+ synchronize_rcu();
10204 -+
10205 -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer));
10206 -+
10207 -+ bfq_free_root_group(bfqd);
10208 -+ kfree(bfqd);
10209 -+}
10210 -+
10211 -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
10212 -+{
10213 -+ struct bfq_group *bfqg;
10214 -+ struct bfq_data *bfqd;
10215 -+ struct elevator_queue *eq;
10216 -+
10217 -+ eq = elevator_alloc(q, e);
10218 -+ if (eq == NULL)
10219 -+ return -ENOMEM;
10220 -+
10221 -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
10222 -+ if (bfqd == NULL) {
10223 -+ kobject_put(&eq->kobj);
10224 -+ return -ENOMEM;
10225 -+ }
10226 -+ eq->elevator_data = bfqd;
10227 -+
10228 -+ /*
10229 -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
10230 -+ * Grab a permanent reference to it, so that the normal code flow
10231 -+ * will not attempt to free it.
10232 -+ */
10233 -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0);
10234 -+ atomic_inc(&bfqd->oom_bfqq.ref);
10235 -+
10236 -+ bfqd->queue = q;
10237 -+
10238 -+ spin_lock_irq(q->queue_lock);
10239 -+ q->elevator = eq;
10240 -+ spin_unlock_irq(q->queue_lock);
10241 -+
10242 -+ bfqg = bfq_alloc_root_group(bfqd, q->node);
10243 -+ if (bfqg == NULL) {
10244 -+ kfree(bfqd);
10245 -+ kobject_put(&eq->kobj);
10246 -+ return -ENOMEM;
10247 -+ }
10248 -+
10249 -+ bfqd->root_group = bfqg;
10250 -+
10251 -+ init_timer(&bfqd->idle_slice_timer);
10252 -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
10253 -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd;
10254 -+
10255 -+ bfqd->rq_pos_tree = RB_ROOT;
10256 -+
10257 -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
10258 -+
10259 -+ INIT_LIST_HEAD(&bfqd->active_list);
10260 -+ INIT_LIST_HEAD(&bfqd->idle_list);
10261 -+
10262 -+ bfqd->hw_tag = -1;
10263 -+
10264 -+ bfqd->bfq_max_budget = bfq_default_max_budget;
10265 -+
10266 -+ bfqd->bfq_quantum = bfq_quantum;
10267 -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
10268 -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
10269 -+ bfqd->bfq_back_max = bfq_back_max;
10270 -+ bfqd->bfq_back_penalty = bfq_back_penalty;
10271 -+ bfqd->bfq_slice_idle = bfq_slice_idle;
10272 -+ bfqd->bfq_class_idle_last_service = 0;
10273 -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
10274 -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
10275 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
10276 -+
10277 -+ bfqd->low_latency = true;
10278 -+
10279 -+ bfqd->bfq_raising_coeff = 20;
10280 -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300);
10281 -+ bfqd->bfq_raising_max_time = 0;
10282 -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000);
10283 -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500);
10284 -+ bfqd->bfq_raising_max_softrt_rate = 7000; /*
10285 -+ * Approximate rate required
10286 -+ * to playback or record a
10287 -+ * high-definition compressed
10288 -+ * video.
10289 -+ */
10290 -+ bfqd->raised_busy_queues = 0;
10291 -+
10292 -+ /* Initially estimate the device's peak rate as the reference rate */
10293 -+ if (blk_queue_nonrot(bfqd->queue)) {
10294 -+ bfqd->RT_prod = R_nonrot * T_nonrot;
10295 -+ bfqd->peak_rate = R_nonrot;
10296 -+ } else {
10297 -+ bfqd->RT_prod = R_rot * T_rot;
10298 -+ bfqd->peak_rate = R_rot;
10299 -+ }
10300 -+
10301 -+ return 0;
10302 -+}
10303 -+
10304 -+static void bfq_slab_kill(void)
10305 -+{
10306 -+ if (bfq_pool != NULL)
10307 -+ kmem_cache_destroy(bfq_pool);
10308 -+}
10309 -+
10310 -+static int __init bfq_slab_setup(void)
10311 -+{
10312 -+ bfq_pool = KMEM_CACHE(bfq_queue, 0);
10313 -+ if (bfq_pool == NULL)
10314 -+ return -ENOMEM;
10315 -+ return 0;
10316 -+}
10317 -+
10318 -+static ssize_t bfq_var_show(unsigned int var, char *page)
10319 -+{
10320 -+ return sprintf(page, "%d\n", var);
10321 -+}
10322 -+
10323 -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count)
10324 -+{
10325 -+ unsigned long new_val;
10326 -+ int ret = kstrtoul(page, 10, &new_val);
10327 -+
10328 -+ if (ret == 0)
10329 -+ *var = new_val;
10330 -+
10331 -+ return count;
10332 -+}
10333 -+
10334 -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page)
10335 -+{
10336 -+ struct bfq_data *bfqd = e->elevator_data;
10337 -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ?
10338 -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) :
10339 -+ jiffies_to_msecs(bfq_wrais_duration(bfqd)));
10340 -+}
10341 -+
10342 -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
10343 -+{
10344 -+ struct bfq_queue *bfqq;
10345 -+ struct bfq_data *bfqd = e->elevator_data;
10346 -+ ssize_t num_char = 0;
10347 -+
10348 -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
10349 -+ bfqd->queued);
10350 -+
10351 -+ spin_lock_irq(bfqd->queue->queue_lock);
10352 -+
10353 -+ num_char += sprintf(page + num_char, "Active:\n");
10354 -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
10355 -+ num_char += sprintf(page + num_char,
10356 -+ "pid%d: weight %hu, nr_queued %d %d,"
10357 -+ " dur %d/%u\n",
10358 -+ bfqq->pid,
10359 -+ bfqq->entity.weight,
10360 -+ bfqq->queued[0],
10361 -+ bfqq->queued[1],
10362 -+ jiffies_to_msecs(jiffies -
10363 -+ bfqq->last_rais_start_finish),
10364 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
10365 -+ }
10366 -+
10367 -+ num_char += sprintf(page + num_char, "Idle:\n");
10368 -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
10369 -+ num_char += sprintf(page + num_char,
10370 -+ "pid%d: weight %hu, dur %d/%u\n",
10371 -+ bfqq->pid,
10372 -+ bfqq->entity.weight,
10373 -+ jiffies_to_msecs(jiffies -
10374 -+ bfqq->last_rais_start_finish),
10375 -+ jiffies_to_msecs(bfqq->raising_cur_max_time));
10376 -+ }
10377 -+
10378 -+ spin_unlock_irq(bfqd->queue->queue_lock);
10379 -+
10380 -+ return num_char;
10381 -+}
10382 -+
10383 -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
10384 -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
10385 -+{ \
10386 -+ struct bfq_data *bfqd = e->elevator_data; \
10387 -+ unsigned int __data = __VAR; \
10388 -+ if (__CONV) \
10389 -+ __data = jiffies_to_msecs(__data); \
10390 -+ return bfq_var_show(__data, (page)); \
10391 -+}
10392 -+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0);
10393 -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
10394 -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
10395 -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
10396 -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
10397 -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
10398 -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
10399 -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0);
10400 -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
10401 -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
10402 -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
10403 -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0);
10404 -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1);
10405 -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time,
10406 -+ 1);
10407 -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show,
10408 -+ bfqd->bfq_raising_min_inter_arr_async,
10409 -+ 1);
10410 -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show,
10411 -+ bfqd->bfq_raising_max_softrt_rate, 0);
10412 -+#undef SHOW_FUNCTION
10413 -+
10414 -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
10415 -+static ssize_t \
10416 -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \
10417 -+{ \
10418 -+ struct bfq_data *bfqd = e->elevator_data; \
10419 -+ unsigned long uninitialized_var(__data); \
10420 -+ int ret = bfq_var_store(&__data, (page), count); \
10421 -+ if (__data < (MIN)) \
10422 -+ __data = (MIN); \
10423 -+ else if (__data > (MAX)) \
10424 -+ __data = (MAX); \
10425 -+ if (__CONV) \
10426 -+ *(__PTR) = msecs_to_jiffies(__data); \
10427 -+ else \
10428 -+ *(__PTR) = __data; \
10429 -+ return ret; \
10430 -+}
10431 -+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0);
10432 -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
10433 -+ INT_MAX, 1);
10434 -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
10435 -+ INT_MAX, 1);
10436 -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
10437 -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
10438 -+ INT_MAX, 0);
10439 -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
10440 -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
10441 -+ 1, INT_MAX, 0);
10442 -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
10443 -+ INT_MAX, 1);
10444 -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1,
10445 -+ INT_MAX, 0);
10446 -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0,
10447 -+ INT_MAX, 1);
10448 -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0,
10449 -+ INT_MAX, 1);
10450 -+STORE_FUNCTION(bfq_raising_min_idle_time_store,
10451 -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1);
10452 -+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store,
10453 -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1);
10454 -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store,
10455 -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0);
10456 -+#undef STORE_FUNCTION
10457 -+
10458 -+/* do nothing for the moment */
10459 -+static ssize_t bfq_weights_store(struct elevator_queue *e,
10460 -+ const char *page, size_t count)
10461 -+{
10462 -+ return count;
10463 -+}
10464 -+
10465 -+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
10466 -+{
10467 -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
10468 -+
10469 -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
10470 -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout);
10471 -+ else
10472 -+ return bfq_default_max_budget;
10473 -+}
10474 -+
10475 -+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
10476 -+ const char *page, size_t count)
10477 -+{
10478 -+ struct bfq_data *bfqd = e->elevator_data;
10479 -+ unsigned long uninitialized_var(__data);
10480 -+ int ret = bfq_var_store(&__data, (page), count);
10481 -+
10482 -+ if (__data == 0)
10483 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10484 -+ else {
10485 -+ if (__data > INT_MAX)
10486 -+ __data = INT_MAX;
10487 -+ bfqd->bfq_max_budget = __data;
10488 -+ }
10489 -+
10490 -+ bfqd->bfq_user_max_budget = __data;
10491 -+
10492 -+ return ret;
10493 -+}
10494 -+
10495 -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
10496 -+ const char *page, size_t count)
10497 -+{
10498 -+ struct bfq_data *bfqd = e->elevator_data;
10499 -+ unsigned long uninitialized_var(__data);
10500 -+ int ret = bfq_var_store(&__data, (page), count);
10501 -+
10502 -+ if (__data < 1)
10503 -+ __data = 1;
10504 -+ else if (__data > INT_MAX)
10505 -+ __data = INT_MAX;
10506 -+
10507 -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
10508 -+ if (bfqd->bfq_user_max_budget == 0)
10509 -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
10510 -+
10511 -+ return ret;
10512 -+}
10513 -+
10514 -+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
10515 -+ const char *page, size_t count)
10516 -+{
10517 -+ struct bfq_data *bfqd = e->elevator_data;
10518 -+ unsigned long uninitialized_var(__data);
10519 -+ int ret = bfq_var_store(&__data, (page), count);
10520 -+
10521 -+ if (__data > 1)
10522 -+ __data = 1;
10523 -+ if (__data == 0 && bfqd->low_latency != 0)
10524 -+ bfq_end_raising(bfqd);
10525 -+ bfqd->low_latency = __data;
10526 -+
10527 -+ return ret;
10528 -+}
10529 -+
10530 -+#define BFQ_ATTR(name) \
10531 -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
10532 -+
10533 -+static struct elv_fs_entry bfq_attrs[] = {
10534 -+ BFQ_ATTR(quantum),
10535 -+ BFQ_ATTR(fifo_expire_sync),
10536 -+ BFQ_ATTR(fifo_expire_async),
10537 -+ BFQ_ATTR(back_seek_max),
10538 -+ BFQ_ATTR(back_seek_penalty),
10539 -+ BFQ_ATTR(slice_idle),
10540 -+ BFQ_ATTR(max_budget),
10541 -+ BFQ_ATTR(max_budget_async_rq),
10542 -+ BFQ_ATTR(timeout_sync),
10543 -+ BFQ_ATTR(timeout_async),
10544 -+ BFQ_ATTR(low_latency),
10545 -+ BFQ_ATTR(raising_coeff),
10546 -+ BFQ_ATTR(raising_max_time),
10547 -+ BFQ_ATTR(raising_rt_max_time),
10548 -+ BFQ_ATTR(raising_min_idle_time),
10549 -+ BFQ_ATTR(raising_min_inter_arr_async),
10550 -+ BFQ_ATTR(raising_max_softrt_rate),
10551 -+ BFQ_ATTR(weights),
10552 -+ __ATTR_NULL
10553 -+};
10554 -+
10555 -+static struct elevator_type iosched_bfq = {
10556 -+ .ops = {
10557 -+ .elevator_merge_fn = bfq_merge,
10558 -+ .elevator_merged_fn = bfq_merged_request,
10559 -+ .elevator_merge_req_fn = bfq_merged_requests,
10560 -+ .elevator_allow_merge_fn = bfq_allow_merge,
10561 -+ .elevator_dispatch_fn = bfq_dispatch_requests,
10562 -+ .elevator_add_req_fn = bfq_insert_request,
10563 -+ .elevator_activate_req_fn = bfq_activate_request,
10564 -+ .elevator_deactivate_req_fn = bfq_deactivate_request,
10565 -+ .elevator_completed_req_fn = bfq_completed_request,
10566 -+ .elevator_former_req_fn = elv_rb_former_request,
10567 -+ .elevator_latter_req_fn = elv_rb_latter_request,
10568 -+ .elevator_init_icq_fn = bfq_init_icq,
10569 -+ .elevator_exit_icq_fn = bfq_exit_icq,
10570 -+ .elevator_set_req_fn = bfq_set_request,
10571 -+ .elevator_put_req_fn = bfq_put_request,
10572 -+ .elevator_may_queue_fn = bfq_may_queue,
10573 -+ .elevator_init_fn = bfq_init_queue,
10574 -+ .elevator_exit_fn = bfq_exit_queue,
10575 -+ },
10576 -+ .icq_size = sizeof(struct bfq_io_cq),
10577 -+ .icq_align = __alignof__(struct bfq_io_cq),
10578 -+ .elevator_attrs = bfq_attrs,
10579 -+ .elevator_name = "bfq",
10580 -+ .elevator_owner = THIS_MODULE,
10581 -+};
10582 -+
10583 -+static int __init bfq_init(void)
10584 -+{
10585 -+ /*
10586 -+ * Can be 0 on HZ < 1000 setups.
10587 -+ */
10588 -+ if (bfq_slice_idle == 0)
10589 -+ bfq_slice_idle = 1;
10590 -+
10591 -+ if (bfq_timeout_async == 0)
10592 -+ bfq_timeout_async = 1;
10593 -+
10594 -+ if (bfq_slab_setup())
10595 -+ return -ENOMEM;
10596 -+
10597 -+ elv_register(&iosched_bfq);
10598 -+ printk(KERN_INFO "BFQ I/O-scheduler version: v7r1");
10599 -+
10600 -+ return 0;
10601 -+}
10602 -+
10603 -+static void __exit bfq_exit(void)
10604 -+{
10605 -+ elv_unregister(&iosched_bfq);
10606 -+ bfq_slab_kill();
10607 -+}
10608 -+
10609 -+module_init(bfq_init);
10610 -+module_exit(bfq_exit);
10611 -+
10612 -+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
10613 -+MODULE_LICENSE("GPL");
10614 -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler");
10615 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
10616 -new file mode 100644
10617 -index 0000000..999b475
10618 ---- /dev/null
10619 -+++ b/block/bfq-sched.c
10620 -@@ -0,0 +1,1078 @@
10621 -+/*
10622 -+ * BFQ: Hierarchical B-WF2Q+ scheduler.
10623 -+ *
10624 -+ * Based on ideas and code from CFQ:
10625 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
10626 -+ *
10627 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
10628 -+ * Paolo Valente <paolo.valente@×××××××.it>
10629 -+ *
10630 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
10631 -+ */
10632 -+
10633 -+#ifdef CONFIG_CGROUP_BFQIO
10634 -+#define for_each_entity(entity) \
10635 -+ for (; entity != NULL; entity = entity->parent)
10636 -+
10637 -+#define for_each_entity_safe(entity, parent) \
10638 -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
10639 -+
10640 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
10641 -+ int extract,
10642 -+ struct bfq_data *bfqd);
10643 -+
10644 -+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10645 -+{
10646 -+ struct bfq_entity *bfqg_entity;
10647 -+ struct bfq_group *bfqg;
10648 -+ struct bfq_sched_data *group_sd;
10649 -+
10650 -+ BUG_ON(next_in_service == NULL);
10651 -+
10652 -+ group_sd = next_in_service->sched_data;
10653 -+
10654 -+ bfqg = container_of(group_sd, struct bfq_group, sched_data);
10655 -+ /*
10656 -+ * bfq_group's my_entity field is not NULL only if the group
10657 -+ * is not the root group. We must not touch the root entity
10658 -+ * as it must never become an in-service entity.
10659 -+ */
10660 -+ bfqg_entity = bfqg->my_entity;
10661 -+ if (bfqg_entity != NULL)
10662 -+ bfqg_entity->budget = next_in_service->budget;
10663 -+}
10664 -+
10665 -+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
10666 -+{
10667 -+ struct bfq_entity *next_in_service;
10668 -+
10669 -+ if (sd->in_service_entity != NULL)
10670 -+ /* will update/requeue at the end of service */
10671 -+ return 0;
10672 -+
10673 -+ /*
10674 -+ * NOTE: this can be improved in many ways, such as returning
10675 -+ * 1 (and thus propagating upwards the update) only when the
10676 -+ * budget changes, or caching the bfqq that will be scheduled
10677 -+ * next from this subtree. By now we worry more about
10678 -+ * correctness than about performance...
10679 -+ */
10680 -+ next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
10681 -+ sd->next_in_service = next_in_service;
10682 -+
10683 -+ if (next_in_service != NULL)
10684 -+ bfq_update_budget(next_in_service);
10685 -+
10686 -+ return 1;
10687 -+}
10688 -+
10689 -+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10690 -+ struct bfq_entity *entity)
10691 -+{
10692 -+ BUG_ON(sd->next_in_service != entity);
10693 -+}
10694 -+#else
10695 -+#define for_each_entity(entity) \
10696 -+ for (; entity != NULL; entity = NULL)
10697 -+
10698 -+#define for_each_entity_safe(entity, parent) \
10699 -+ for (parent = NULL; entity != NULL; entity = parent)
10700 -+
10701 -+static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
10702 -+{
10703 -+ return 0;
10704 -+}
10705 -+
10706 -+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
10707 -+ struct bfq_entity *entity)
10708 -+{
10709 -+}
10710 -+
10711 -+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
10712 -+{
10713 -+}
10714 -+#endif
10715 -+
10716 -+/*
10717 -+ * Shift for timestamp calculations. This actually limits the maximum
10718 -+ * service allowed in one timestamp delta (small shift values increase it),
10719 -+ * the maximum total weight that can be used for the queues in the system
10720 -+ * (big shift values increase it), and the period of virtual time wraparounds.
10721 -+ */
10722 -+#define WFQ_SERVICE_SHIFT 22
10723 -+
10724 -+/**
10725 -+ * bfq_gt - compare two timestamps.
10726 -+ * @a: first ts.
10727 -+ * @b: second ts.
10728 -+ *
10729 -+ * Return @a > @b, dealing with wrapping correctly.
10730 -+ */
10731 -+static inline int bfq_gt(u64 a, u64 b)
10732 -+{
10733 -+ return (s64)(a - b) > 0;
10734 -+}
10735 -+
10736 -+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
10737 -+{
10738 -+ struct bfq_queue *bfqq = NULL;
10739 -+
10740 -+ BUG_ON(entity == NULL);
10741 -+
10742 -+ if (entity->my_sched_data == NULL)
10743 -+ bfqq = container_of(entity, struct bfq_queue, entity);
10744 -+
10745 -+ return bfqq;
10746 -+}
10747 -+
10748 -+
10749 -+/**
10750 -+ * bfq_delta - map service into the virtual time domain.
10751 -+ * @service: amount of service.
10752 -+ * @weight: scale factor (weight of an entity or weight sum).
10753 -+ */
10754 -+static inline u64 bfq_delta(unsigned long service,
10755 -+ unsigned long weight)
10756 -+{
10757 -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT;
10758 -+
10759 -+ do_div(d, weight);
10760 -+ return d;
10761 -+}
10762 -+
10763 -+/**
10764 -+ * bfq_calc_finish - assign the finish time to an entity.
10765 -+ * @entity: the entity to act upon.
10766 -+ * @service: the service to be charged to the entity.
10767 -+ */
10768 -+static inline void bfq_calc_finish(struct bfq_entity *entity,
10769 -+ unsigned long service)
10770 -+{
10771 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10772 -+
10773 -+ BUG_ON(entity->weight == 0);
10774 -+
10775 -+ entity->finish = entity->start +
10776 -+ bfq_delta(service, entity->weight);
10777 -+
10778 -+ if (bfqq != NULL) {
10779 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
10780 -+ "calc_finish: serv %lu, w %d",
10781 -+ service, entity->weight);
10782 -+ bfq_log_bfqq(bfqq->bfqd, bfqq,
10783 -+ "calc_finish: start %llu, finish %llu, delta %llu",
10784 -+ entity->start, entity->finish,
10785 -+ bfq_delta(service, entity->weight));
10786 -+ }
10787 -+}
10788 -+
10789 -+/**
10790 -+ * bfq_entity_of - get an entity from a node.
10791 -+ * @node: the node field of the entity.
10792 -+ *
10793 -+ * Convert a node pointer to the relative entity. This is used only
10794 -+ * to simplify the logic of some functions and not as the generic
10795 -+ * conversion mechanism because, e.g., in the tree walking functions,
10796 -+ * the check for a %NULL value would be redundant.
10797 -+ */
10798 -+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
10799 -+{
10800 -+ struct bfq_entity *entity = NULL;
10801 -+
10802 -+ if (node != NULL)
10803 -+ entity = rb_entry(node, struct bfq_entity, rb_node);
10804 -+
10805 -+ return entity;
10806 -+}
10807 -+
10808 -+/**
10809 -+ * bfq_extract - remove an entity from a tree.
10810 -+ * @root: the tree root.
10811 -+ * @entity: the entity to remove.
10812 -+ */
10813 -+static inline void bfq_extract(struct rb_root *root,
10814 -+ struct bfq_entity *entity)
10815 -+{
10816 -+ BUG_ON(entity->tree != root);
10817 -+
10818 -+ entity->tree = NULL;
10819 -+ rb_erase(&entity->rb_node, root);
10820 -+}
10821 -+
10822 -+/**
10823 -+ * bfq_idle_extract - extract an entity from the idle tree.
10824 -+ * @st: the service tree of the owning @entity.
10825 -+ * @entity: the entity being removed.
10826 -+ */
10827 -+static void bfq_idle_extract(struct bfq_service_tree *st,
10828 -+ struct bfq_entity *entity)
10829 -+{
10830 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10831 -+ struct rb_node *next;
10832 -+
10833 -+ BUG_ON(entity->tree != &st->idle);
10834 -+
10835 -+ if (entity == st->first_idle) {
10836 -+ next = rb_next(&entity->rb_node);
10837 -+ st->first_idle = bfq_entity_of(next);
10838 -+ }
10839 -+
10840 -+ if (entity == st->last_idle) {
10841 -+ next = rb_prev(&entity->rb_node);
10842 -+ st->last_idle = bfq_entity_of(next);
10843 -+ }
10844 -+
10845 -+ bfq_extract(&st->idle, entity);
10846 -+
10847 -+ if (bfqq != NULL)
10848 -+ list_del(&bfqq->bfqq_list);
10849 -+}
10850 -+
10851 -+/**
10852 -+ * bfq_insert - generic tree insertion.
10853 -+ * @root: tree root.
10854 -+ * @entity: entity to insert.
10855 -+ *
10856 -+ * This is used for the idle and the active tree, since they are both
10857 -+ * ordered by finish time.
10858 -+ */
10859 -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
10860 -+{
10861 -+ struct bfq_entity *entry;
10862 -+ struct rb_node **node = &root->rb_node;
10863 -+ struct rb_node *parent = NULL;
10864 -+
10865 -+ BUG_ON(entity->tree != NULL);
10866 -+
10867 -+ while (*node != NULL) {
10868 -+ parent = *node;
10869 -+ entry = rb_entry(parent, struct bfq_entity, rb_node);
10870 -+
10871 -+ if (bfq_gt(entry->finish, entity->finish))
10872 -+ node = &parent->rb_left;
10873 -+ else
10874 -+ node = &parent->rb_right;
10875 -+ }
10876 -+
10877 -+ rb_link_node(&entity->rb_node, parent, node);
10878 -+ rb_insert_color(&entity->rb_node, root);
10879 -+
10880 -+ entity->tree = root;
10881 -+}
10882 -+
10883 -+/**
10884 -+ * bfq_update_min - update the min_start field of an entity.
10885 -+ * @entity: the entity to update.
10886 -+ * @node: one of its children.
10887 -+ *
10888 -+ * This function is called when @entity may store an invalid value for
10889 -+ * min_start due to updates to the active tree. The function assumes
10890 -+ * that the subtree rooted at @node (which may be its left or its right
10891 -+ * child) has a valid min_start value.
10892 -+ */
10893 -+static inline void bfq_update_min(struct bfq_entity *entity,
10894 -+ struct rb_node *node)
10895 -+{
10896 -+ struct bfq_entity *child;
10897 -+
10898 -+ if (node != NULL) {
10899 -+ child = rb_entry(node, struct bfq_entity, rb_node);
10900 -+ if (bfq_gt(entity->min_start, child->min_start))
10901 -+ entity->min_start = child->min_start;
10902 -+ }
10903 -+}
10904 -+
10905 -+/**
10906 -+ * bfq_update_active_node - recalculate min_start.
10907 -+ * @node: the node to update.
10908 -+ *
10909 -+ * @node may have changed position or one of its children may have moved,
10910 -+ * this function updates its min_start value. The left and right subtrees
10911 -+ * are assumed to hold a correct min_start value.
10912 -+ */
10913 -+static inline void bfq_update_active_node(struct rb_node *node)
10914 -+{
10915 -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
10916 -+
10917 -+ entity->min_start = entity->start;
10918 -+ bfq_update_min(entity, node->rb_right);
10919 -+ bfq_update_min(entity, node->rb_left);
10920 -+}
10921 -+
10922 -+/**
10923 -+ * bfq_update_active_tree - update min_start for the whole active tree.
10924 -+ * @node: the starting node.
10925 -+ *
10926 -+ * @node must be the deepest modified node after an update. This function
10927 -+ * updates its min_start using the values held by its children, assuming
10928 -+ * that they did not change, and then updates all the nodes that may have
10929 -+ * changed in the path to the root. The only nodes that may have changed
10930 -+ * are the ones in the path or their siblings.
10931 -+ */
10932 -+static void bfq_update_active_tree(struct rb_node *node)
10933 -+{
10934 -+ struct rb_node *parent;
10935 -+
10936 -+up:
10937 -+ bfq_update_active_node(node);
10938 -+
10939 -+ parent = rb_parent(node);
10940 -+ if (parent == NULL)
10941 -+ return;
10942 -+
10943 -+ if (node == parent->rb_left && parent->rb_right != NULL)
10944 -+ bfq_update_active_node(parent->rb_right);
10945 -+ else if (parent->rb_left != NULL)
10946 -+ bfq_update_active_node(parent->rb_left);
10947 -+
10948 -+ node = parent;
10949 -+ goto up;
10950 -+}
10951 -+
10952 -+/**
10953 -+ * bfq_active_insert - insert an entity in the active tree of its group/device.
10954 -+ * @st: the service tree of the entity.
10955 -+ * @entity: the entity being inserted.
10956 -+ *
10957 -+ * The active tree is ordered by finish time, but an extra key is kept
10958 -+ * for each node, containing the minimum value for the start times of
10959 -+ * its children (and the node itself), so it's possible to search for
10960 -+ * the eligible node with the lowest finish time in logarithmic time.
10961 -+ */
10962 -+static void bfq_active_insert(struct bfq_service_tree *st,
10963 -+ struct bfq_entity *entity)
10964 -+{
10965 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
10966 -+ struct rb_node *node = &entity->rb_node;
10967 -+
10968 -+ bfq_insert(&st->active, entity);
10969 -+
10970 -+ if (node->rb_left != NULL)
10971 -+ node = node->rb_left;
10972 -+ else if (node->rb_right != NULL)
10973 -+ node = node->rb_right;
10974 -+
10975 -+ bfq_update_active_tree(node);
10976 -+
10977 -+ if (bfqq != NULL)
10978 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
10979 -+}
10980 -+
10981 -+/**
10982 -+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
10983 -+ * @ioprio: the ioprio value to convert.
10984 -+ */
10985 -+static unsigned short bfq_ioprio_to_weight(int ioprio)
10986 -+{
10987 -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
10988 -+ return IOPRIO_BE_NR - ioprio;
10989 -+}
10990 -+
10991 -+/**
10992 -+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
10993 -+ * @weight: the weight value to convert.
10994 -+ *
10995 -+ * To preserve as much as possible the old only-ioprio user interface,
10996 -+ * 0 is used as an escape ioprio value for weights (numerically) equal to or
10997 -+ * larger than IOPRIO_BE_NR
10998 -+ */
10999 -+static unsigned short bfq_weight_to_ioprio(int weight)
11000 -+{
11001 -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
11002 -+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
11003 -+}
11004 -+
11005 -+static inline void bfq_get_entity(struct bfq_entity *entity)
11006 -+{
11007 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11008 -+ struct bfq_sched_data *sd;
11009 -+
11010 -+ if (bfqq != NULL) {
11011 -+ sd = entity->sched_data;
11012 -+ atomic_inc(&bfqq->ref);
11013 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
11014 -+ bfqq, atomic_read(&bfqq->ref));
11015 -+ }
11016 -+}
11017 -+
11018 -+/**
11019 -+ * bfq_find_deepest - find the deepest node that an extraction can modify.
11020 -+ * @node: the node being removed.
11021 -+ *
11022 -+ * Do the first step of an extraction in an rb tree, looking for the
11023 -+ * node that will replace @node, and returning the deepest node that
11024 -+ * the following modifications to the tree can touch. If @node is the
11025 -+ * last node in the tree return %NULL.
11026 -+ */
11027 -+static struct rb_node *bfq_find_deepest(struct rb_node *node)
11028 -+{
11029 -+ struct rb_node *deepest;
11030 -+
11031 -+ if (node->rb_right == NULL && node->rb_left == NULL)
11032 -+ deepest = rb_parent(node);
11033 -+ else if (node->rb_right == NULL)
11034 -+ deepest = node->rb_left;
11035 -+ else if (node->rb_left == NULL)
11036 -+ deepest = node->rb_right;
11037 -+ else {
11038 -+ deepest = rb_next(node);
11039 -+ if (deepest->rb_right != NULL)
11040 -+ deepest = deepest->rb_right;
11041 -+ else if (rb_parent(deepest) != node)
11042 -+ deepest = rb_parent(deepest);
11043 -+ }
11044 -+
11045 -+ return deepest;
11046 -+}
11047 -+
11048 -+/**
11049 -+ * bfq_active_extract - remove an entity from the active tree.
11050 -+ * @st: the service_tree containing the tree.
11051 -+ * @entity: the entity being removed.
11052 -+ */
11053 -+static void bfq_active_extract(struct bfq_service_tree *st,
11054 -+ struct bfq_entity *entity)
11055 -+{
11056 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11057 -+ struct rb_node *node;
11058 -+
11059 -+ node = bfq_find_deepest(&entity->rb_node);
11060 -+ bfq_extract(&st->active, entity);
11061 -+
11062 -+ if (node != NULL)
11063 -+ bfq_update_active_tree(node);
11064 -+
11065 -+ if (bfqq != NULL)
11066 -+ list_del(&bfqq->bfqq_list);
11067 -+}
11068 -+
11069 -+/**
11070 -+ * bfq_idle_insert - insert an entity into the idle tree.
11071 -+ * @st: the service tree containing the tree.
11072 -+ * @entity: the entity to insert.
11073 -+ */
11074 -+static void bfq_idle_insert(struct bfq_service_tree *st,
11075 -+ struct bfq_entity *entity)
11076 -+{
11077 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11078 -+ struct bfq_entity *first_idle = st->first_idle;
11079 -+ struct bfq_entity *last_idle = st->last_idle;
11080 -+
11081 -+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
11082 -+ st->first_idle = entity;
11083 -+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
11084 -+ st->last_idle = entity;
11085 -+
11086 -+ bfq_insert(&st->idle, entity);
11087 -+
11088 -+ if (bfqq != NULL)
11089 -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
11090 -+}
11091 -+
11092 -+/**
11093 -+ * bfq_forget_entity - remove an entity from the wfq trees.
11094 -+ * @st: the service tree.
11095 -+ * @entity: the entity being removed.
11096 -+ *
11097 -+ * Update the device status and forget everything about @entity, putting
11098 -+ * the device reference to it, if it is a queue. Entities belonging to
11099 -+ * groups are not refcounted.
11100 -+ */
11101 -+static void bfq_forget_entity(struct bfq_service_tree *st,
11102 -+ struct bfq_entity *entity)
11103 -+{
11104 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11105 -+ struct bfq_sched_data *sd;
11106 -+
11107 -+ BUG_ON(!entity->on_st);
11108 -+
11109 -+ entity->on_st = 0;
11110 -+ st->wsum -= entity->weight;
11111 -+ if (bfqq != NULL) {
11112 -+ sd = entity->sched_data;
11113 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
11114 -+ bfqq, atomic_read(&bfqq->ref));
11115 -+ bfq_put_queue(bfqq);
11116 -+ }
11117 -+}
11118 -+
11119 -+/**
11120 -+ * bfq_put_idle_entity - release the idle tree ref of an entity.
11121 -+ * @st: service tree for the entity.
11122 -+ * @entity: the entity being released.
11123 -+ */
11124 -+static void bfq_put_idle_entity(struct bfq_service_tree *st,
11125 -+ struct bfq_entity *entity)
11126 -+{
11127 -+ bfq_idle_extract(st, entity);
11128 -+ bfq_forget_entity(st, entity);
11129 -+}
11130 -+
11131 -+/**
11132 -+ * bfq_forget_idle - update the idle tree if necessary.
11133 -+ * @st: the service tree to act upon.
11134 -+ *
11135 -+ * To preserve the global O(log N) complexity we only remove one entry here;
11136 -+ * as the idle tree will not grow indefinitely this can be done safely.
11137 -+ */
11138 -+static void bfq_forget_idle(struct bfq_service_tree *st)
11139 -+{
11140 -+ struct bfq_entity *first_idle = st->first_idle;
11141 -+ struct bfq_entity *last_idle = st->last_idle;
11142 -+
11143 -+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
11144 -+ !bfq_gt(last_idle->finish, st->vtime)) {
11145 -+ /*
11146 -+ * Forget the whole idle tree, increasing the vtime past
11147 -+ * the last finish time of idle entities.
11148 -+ */
11149 -+ st->vtime = last_idle->finish;
11150 -+ }
11151 -+
11152 -+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
11153 -+ bfq_put_idle_entity(st, first_idle);
11154 -+}
11155 -+
11156 -+static struct bfq_service_tree *
11157 -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
11158 -+ struct bfq_entity *entity)
11159 -+{
11160 -+ struct bfq_service_tree *new_st = old_st;
11161 -+
11162 -+ if (entity->ioprio_changed) {
11163 -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
11164 -+
11165 -+ BUG_ON(old_st->wsum < entity->weight);
11166 -+ old_st->wsum -= entity->weight;
11167 -+
11168 -+ if (entity->new_weight != entity->orig_weight) {
11169 -+ entity->orig_weight = entity->new_weight;
11170 -+ entity->ioprio =
11171 -+ bfq_weight_to_ioprio(entity->orig_weight);
11172 -+ } else if (entity->new_ioprio != entity->ioprio) {
11173 -+ entity->ioprio = entity->new_ioprio;
11174 -+ entity->orig_weight =
11175 -+ bfq_ioprio_to_weight(entity->ioprio);
11176 -+ } else
11177 -+ entity->new_weight = entity->orig_weight =
11178 -+ bfq_ioprio_to_weight(entity->ioprio);
11179 -+
11180 -+ entity->ioprio_class = entity->new_ioprio_class;
11181 -+ entity->ioprio_changed = 0;
11182 -+
11183 -+ /*
11184 -+ * NOTE: here we may be changing the weight too early,
11185 -+ * this will cause unfairness. The correct approach
11186 -+ * would have required additional complexity to defer
11187 -+ * weight changes to the proper time instants (i.e.,
11188 -+ * when entity->finish <= old_st->vtime).
11189 -+ */
11190 -+ new_st = bfq_entity_service_tree(entity);
11191 -+ entity->weight = entity->orig_weight *
11192 -+ (bfqq != NULL ? bfqq->raising_coeff : 1);
11193 -+ new_st->wsum += entity->weight;
11194 -+
11195 -+ if (new_st != old_st)
11196 -+ entity->start = new_st->vtime;
11197 -+ }
11198 -+
11199 -+ return new_st;
11200 -+}
11201 -+
11202 -+/**
11203 -+ * bfq_bfqq_served - update the scheduler status after selection for service.
11204 -+ * @bfqq: the queue being served.
11205 -+ * @served: bytes to transfer.
11206 -+ *
11207 -+ * NOTE: this can be optimized, as the timestamps of upper level entities
11208 -+ * are synchronized every time a new bfqq is selected for service. By now,
11209 -+ * we keep it to better check consistency.
11210 -+ */
11211 -+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
11212 -+{
11213 -+ struct bfq_entity *entity = &bfqq->entity;
11214 -+ struct bfq_service_tree *st;
11215 -+
11216 -+ for_each_entity(entity) {
11217 -+ st = bfq_entity_service_tree(entity);
11218 -+
11219 -+ entity->service += served;
11220 -+ BUG_ON(entity->service > entity->budget);
11221 -+ BUG_ON(st->wsum == 0);
11222 -+
11223 -+ st->vtime += bfq_delta(served, st->wsum);
11224 -+ bfq_forget_idle(st);
11225 -+ }
11226 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
11227 -+}
11228 -+
11229 -+/**
11230 -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
11231 -+ * @bfqq: the queue that needs a service update.
11232 -+ *
11233 -+ * When it's not possible to be fair in the service domain, because
11234 -+ * a queue is not consuming its budget fast enough (the meaning of
11235 -+ * fast depends on the timeout parameter), we charge it a full
11236 -+ * budget. In this way we should obtain a sort of time-domain
11237 -+ * fairness among all the seeky/slow queues.
11238 -+ */
11239 -+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
11240 -+{
11241 -+ struct bfq_entity *entity = &bfqq->entity;
11242 -+
11243 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
11244 -+
11245 -+ bfq_bfqq_served(bfqq, entity->budget - entity->service);
11246 -+}
11247 -+
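For illustration (numbers purely hypothetical): if a seeky queue was granted a 16384-sector budget but has received only 1024 sectors of service when its timeout fires, bfq_bfqq_charge_full_budget() feeds the remaining 15360 sectors to bfq_bfqq_served(). The queue is then accounted exactly as if it had consumed its whole budget, so its next timestamps are computed over the full budget and its next selection is delayed accordingly; that delay is the time-domain fairness described in the comment above.
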
11248 -+/**
11249 -+ * __bfq_activate_entity - activate an entity.
11250 -+ * @entity: the entity being activated.
11251 -+ *
11252 -+ * Called whenever an entity is activated, i.e., it is not active and one
11253 -+ * of its children receives a new request, or has to be reactivated due to
11254 -+ * budget exhaustion. It uses the current budget of the entity (and the
11255 -+ * service received if @entity is active) of the queue to calculate its
11256 -+ * timestamps.
11257 -+ */
11258 -+static void __bfq_activate_entity(struct bfq_entity *entity)
11259 -+{
11260 -+ struct bfq_sched_data *sd = entity->sched_data;
11261 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11262 -+
11263 -+ if (entity == sd->in_service_entity) {
11264 -+ BUG_ON(entity->tree != NULL);
11265 -+ /*
11266 -+ * If we are requeueing the current entity we have
11267 -+ * to take care of not charging to it service it has
11268 -+ * not received.
11269 -+ */
11270 -+ bfq_calc_finish(entity, entity->service);
11271 -+ entity->start = entity->finish;
11272 -+ sd->in_service_entity = NULL;
11273 -+ } else if (entity->tree == &st->active) {
11274 -+ /*
11275 -+ * Requeueing an entity due to a change of some
11276 -+ * next_in_service entity below it. We reuse the
11277 -+ * old start time.
11278 -+ */
11279 -+ bfq_active_extract(st, entity);
11280 -+ } else if (entity->tree == &st->idle) {
11281 -+ /*
11282 -+ * Must be on the idle tree, bfq_idle_extract() will
11283 -+ * check for that.
11284 -+ */
11285 -+ bfq_idle_extract(st, entity);
11286 -+ entity->start = bfq_gt(st->vtime, entity->finish) ?
11287 -+ st->vtime : entity->finish;
11288 -+ } else {
11289 -+ /*
11290 -+ * The finish time of the entity may be invalid, and
11291 -+ * it is in the past for sure, otherwise the queue
11292 -+ * would have been on the idle tree.
11293 -+ */
11294 -+ entity->start = st->vtime;
11295 -+ st->wsum += entity->weight;
11296 -+ bfq_get_entity(entity);
11297 -+
11298 -+ BUG_ON(entity->on_st);
11299 -+ entity->on_st = 1;
11300 -+ }
11301 -+
11302 -+ st = __bfq_entity_update_weight_prio(st, entity);
11303 -+ bfq_calc_finish(entity, entity->budget);
11304 -+ bfq_active_insert(st, entity);
11305 -+}
11306 -+
11307 -+/**
11308 -+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
11309 -+ * @entity: the entity to activate.
11310 -+ *
11311 -+ * Activate @entity and all the entities on the path from it to the root.
11312 -+ */
11313 -+static void bfq_activate_entity(struct bfq_entity *entity)
11314 -+{
11315 -+ struct bfq_sched_data *sd;
11316 -+
11317 -+ for_each_entity(entity) {
11318 -+ __bfq_activate_entity(entity);
11319 -+
11320 -+ sd = entity->sched_data;
11321 -+ if (!bfq_update_next_in_service(sd))
11322 -+ /*
11323 -+ * No need to propagate the activation to the
11324 -+ * upper entities, as they will be updated when
11325 -+ * the in-service entity is rescheduled.
11326 -+ */
11327 -+ break;
11328 -+ }
11329 -+}
11330 -+
11331 -+/**
11332 -+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
11333 -+ * @entity: the entity to deactivate.
11334 -+ * @requeue: if false, the entity will not be put into the idle tree.
11335 -+ *
11336 -+ * Deactivate an entity, independently from its previous state. If the
11337 -+ * entity was not on a service tree just return, otherwise if it is on
11338 -+ * any scheduler tree, extract it from that tree, and if necessary
11339 -+ * and if the caller specified @requeue, put it on the idle tree.
11340 -+ *
11341 -+ * Return %1 if the caller should update the entity hierarchy, i.e.,
11342 -+ * if the entity was under service or if it was the next_in_service for
11343 -+ * its sched_data; return %0 otherwise.
11344 -+ */
11345 -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11346 -+{
11347 -+ struct bfq_sched_data *sd = entity->sched_data;
11348 -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity);
11349 -+ int was_in_service = entity == sd->in_service_entity;
11350 -+ int ret = 0;
11351 -+
11352 -+ if (!entity->on_st)
11353 -+ return 0;
11354 -+
11355 -+ BUG_ON(was_in_service && entity->tree != NULL);
11356 -+
11357 -+ if (was_in_service) {
11358 -+ bfq_calc_finish(entity, entity->service);
11359 -+ sd->in_service_entity = NULL;
11360 -+ } else if (entity->tree == &st->active)
11361 -+ bfq_active_extract(st, entity);
11362 -+ else if (entity->tree == &st->idle)
11363 -+ bfq_idle_extract(st, entity);
11364 -+ else if (entity->tree != NULL)
11365 -+ BUG();
11366 -+
11367 -+ if (was_in_service || sd->next_in_service == entity)
11368 -+ ret = bfq_update_next_in_service(sd);
11369 -+
11370 -+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
11371 -+ bfq_forget_entity(st, entity);
11372 -+ else
11373 -+ bfq_idle_insert(st, entity);
11374 -+
11375 -+ BUG_ON(sd->in_service_entity == entity);
11376 -+ BUG_ON(sd->next_in_service == entity);
11377 -+
11378 -+ return ret;
11379 -+}
11380 -+
11381 -+/**
11382 -+ * bfq_deactivate_entity - deactivate an entity.
11383 -+ * @entity: the entity to deactivate.
11384 -+ * @requeue: true if the entity can be put on the idle tree
11385 -+ */
11386 -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
11387 -+{
11388 -+ struct bfq_sched_data *sd;
11389 -+ struct bfq_entity *parent;
11390 -+
11391 -+ for_each_entity_safe(entity, parent) {
11392 -+ sd = entity->sched_data;
11393 -+
11394 -+ if (!__bfq_deactivate_entity(entity, requeue))
11395 -+ /*
11396 -+ * The parent entity is still backlogged, and
11397 -+ * we don't need to update it as it is still
11398 -+ * under service.
11399 -+ */
11400 -+ break;
11401 -+
11402 -+ if (sd->next_in_service != NULL)
11403 -+ /*
11404 -+ * The parent entity is still backlogged and
11405 -+ * the budgets on the path towards the root
11406 -+ * need to be updated.
11407 -+ */
11408 -+ goto update;
11409 -+
11410 -+ /*
11411 -+ * If we reach this point the parent is no longer backlogged and
11412 -+ * we want to propagate the dequeue upwards.
11413 -+ */
11414 -+ requeue = 1;
11415 -+ }
11416 -+
11417 -+ return;
11418 -+
11419 -+update:
11420 -+ entity = parent;
11421 -+ for_each_entity(entity) {
11422 -+ __bfq_activate_entity(entity);
11423 -+
11424 -+ sd = entity->sched_data;
11425 -+ if (!bfq_update_next_in_service(sd))
11426 -+ break;
11427 -+ }
11428 -+}
11429 -+
11430 -+/**
11431 -+ * bfq_update_vtime - update vtime if necessary.
11432 -+ * @st: the service tree to act upon.
11433 -+ *
11434 -+ * If necessary update the service tree vtime to have at least one
11435 -+ * eligible entity, skipping to its start time. Assumes that the
11436 -+ * active tree of the device is not empty.
11437 -+ *
11438 -+ * NOTE: this hierarchical implementation updates vtimes quite often,
11439 -+ * we may end up with reactivated tasks getting timestamps after a
11440 -+ * vtime skip done because we needed a ->first_active entity on some
11441 -+ * intermediate node.
11442 -+ */
11443 -+static void bfq_update_vtime(struct bfq_service_tree *st)
11444 -+{
11445 -+ struct bfq_entity *entry;
11446 -+ struct rb_node *node = st->active.rb_node;
11447 -+
11448 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
11449 -+ if (bfq_gt(entry->min_start, st->vtime)) {
11450 -+ st->vtime = entry->min_start;
11451 -+ bfq_forget_idle(st);
11452 -+ }
11453 -+}
11454 -+
11455 -+/**
11456 -+ * bfq_first_active_entity - find the eligible entity with
11457 -+ * the smallest finish time
11458 -+ * @st: the service tree to select from.
11459 -+ *
11460 -+ * This function searches the first schedulable entity, starting from the
11461 -+ * root of the tree and going on the left every time on this side there is
11462 -+ * a subtree with at least one eligible (start <= vtime) entity. The path
11463 -+ * on the right is followed only if a) the left subtree contains no eligible
11464 -+ * entities and b) no eligible entity has been found yet.
11465 -+ */
11466 -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
11467 -+{
11468 -+ struct bfq_entity *entry, *first = NULL;
11469 -+ struct rb_node *node = st->active.rb_node;
11470 -+
11471 -+ while (node != NULL) {
11472 -+ entry = rb_entry(node, struct bfq_entity, rb_node);
11473 -+left:
11474 -+ if (!bfq_gt(entry->start, st->vtime))
11475 -+ first = entry;
11476 -+
11477 -+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
11478 -+
11479 -+ if (node->rb_left != NULL) {
11480 -+ entry = rb_entry(node->rb_left,
11481 -+ struct bfq_entity, rb_node);
11482 -+ if (!bfq_gt(entry->min_start, st->vtime)) {
11483 -+ node = node->rb_left;
11484 -+ goto left;
11485 -+ }
11486 -+ }
11487 -+ if (first != NULL)
11488 -+ break;
11489 -+ node = node->rb_right;
11490 -+ }
11491 -+
11492 -+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
11493 -+ return first;
11494 -+}
11495 -+
11496 -+/**
11497 -+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
11498 -+ * @st: the service tree.
11499 -+ *
11500 -+ * Update the virtual time in @st and return the first eligible entity
11501 -+ * it contains.
11502 -+ */
11503 -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
11504 -+ bool force)
11505 -+{
11506 -+ struct bfq_entity *entity, *new_next_in_service = NULL;
11507 -+
11508 -+ if (RB_EMPTY_ROOT(&st->active))
11509 -+ return NULL;
11510 -+
11511 -+ bfq_update_vtime(st);
11512 -+ entity = bfq_first_active_entity(st);
11513 -+ BUG_ON(bfq_gt(entity->start, st->vtime));
11514 -+
11515 -+ /*
11516 -+ * If the chosen entity does not match with the sched_data's
11517 -+ * next_in_service and we are forcedly serving the IDLE priority
11518 -+ * class tree, bubble up budget update.
11519 -+ */
11520 -+ if (unlikely(force && entity != entity->sched_data->next_in_service)) {
11521 -+ new_next_in_service = entity;
11522 -+ for_each_entity(new_next_in_service)
11523 -+ bfq_update_budget(new_next_in_service);
11524 -+ }
11525 -+
11526 -+ return entity;
11527 -+}
11528 -+
11529 -+/**
11530 -+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
11531 -+ * @sd: the sched_data.
11532 -+ * @extract: if true the returned entity will be also extracted from @sd.
11533 -+ *
11534 -+ * NOTE: since we cache the next_in_service entity at each level of the
11535 -+ * hierarchy, the complexity of the lookup can be decreased with
11536 -+ * absolutely no effort just returning the cached next_in_service value;
11537 -+ * we prefer to do full lookups to test the consistency of the data
11538 -+ * structures.
11539 -+ */
11540 -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
11541 -+ int extract,
11542 -+ struct bfq_data *bfqd)
11543 -+{
11544 -+ struct bfq_service_tree *st = sd->service_tree;
11545 -+ struct bfq_entity *entity;
11546 -+ int i = 0;
11547 -+
11548 -+ BUG_ON(sd->in_service_entity != NULL);
11549 -+
11550 -+ if (bfqd != NULL &&
11551 -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
11552 -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
11553 -+ true);
11554 -+ if (entity != NULL) {
11555 -+ i = BFQ_IOPRIO_CLASSES - 1;
11556 -+ bfqd->bfq_class_idle_last_service = jiffies;
11557 -+ sd->next_in_service = entity;
11558 -+ }
11559 -+ }
11560 -+ for (; i < BFQ_IOPRIO_CLASSES; i++) {
11561 -+ entity = __bfq_lookup_next_entity(st + i, false);
11562 -+ if (entity != NULL) {
11563 -+ if (extract) {
11564 -+ bfq_check_next_in_service(sd, entity);
11565 -+ bfq_active_extract(st + i, entity);
11566 -+ sd->in_service_entity = entity;
11567 -+ sd->next_in_service = NULL;
11568 -+ }
11569 -+ break;
11570 -+ }
11571 -+ }
11572 -+
11573 -+ return entity;
11574 -+}
11575 -+
11576 -+/*
11577 -+ * Get next queue for service.
11578 -+ */
11579 -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
11580 -+{
11581 -+ struct bfq_entity *entity = NULL;
11582 -+ struct bfq_sched_data *sd;
11583 -+ struct bfq_queue *bfqq;
11584 -+
11585 -+ BUG_ON(bfqd->in_service_queue != NULL);
11586 -+
11587 -+ if (bfqd->busy_queues == 0)
11588 -+ return NULL;
11589 -+
11590 -+ sd = &bfqd->root_group->sched_data;
11591 -+ for (; sd != NULL; sd = entity->my_sched_data) {
11592 -+ entity = bfq_lookup_next_entity(sd, 1, bfqd);
11593 -+ BUG_ON(entity == NULL);
11594 -+ entity->service = 0;
11595 -+ }
11596 -+
11597 -+ bfqq = bfq_entity_to_bfqq(entity);
11598 -+ BUG_ON(bfqq == NULL);
11599 -+
11600 -+ return bfqq;
11601 -+}
11602 -+
11603 -+/*
11604 -+ * Forced extraction of the given queue.
11605 -+ */
11606 -+static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
11607 -+ struct bfq_queue *bfqq)
11608 -+{
11609 -+ struct bfq_entity *entity;
11610 -+ struct bfq_sched_data *sd;
11611 -+
11612 -+ BUG_ON(bfqd->in_service_queue != NULL);
11613 -+
11614 -+ entity = &bfqq->entity;
11615 -+ /*
11616 -+ * Bubble up extraction/update from the leaf to the root.
11617 -+ */
11618 -+ for_each_entity(entity) {
11619 -+ sd = entity->sched_data;
11620 -+ bfq_update_budget(entity);
11621 -+ bfq_update_vtime(bfq_entity_service_tree(entity));
11622 -+ bfq_active_extract(bfq_entity_service_tree(entity), entity);
11623 -+ sd->active_entity = entity;
11624 -+ sd->next_active = NULL;
11625 -+ entity->service = 0;
11626 -+ }
11627 -+
11628 -+ return;
11629 -+}
11630 -+
11631 -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
11632 -+{
11633 -+ if (bfqd->in_service_bic != NULL) {
11634 -+ put_io_context(bfqd->in_service_bic->icq.ioc);
11635 -+ bfqd->in_service_bic = NULL;
11636 -+ }
11637 -+
11638 -+ bfqd->in_service_queue = NULL;
11639 -+ del_timer(&bfqd->idle_slice_timer);
11640 -+}
11641 -+
11642 -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11643 -+ int requeue)
11644 -+{
11645 -+ struct bfq_entity *entity = &bfqq->entity;
11646 -+
11647 -+ if (bfqq == bfqd->in_service_queue)
11648 -+ __bfq_bfqd_reset_in_service(bfqd);
11649 -+
11650 -+ bfq_deactivate_entity(entity, requeue);
11651 -+}
11652 -+
11653 -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11654 -+{
11655 -+ struct bfq_entity *entity = &bfqq->entity;
11656 -+
11657 -+ bfq_activate_entity(entity);
11658 -+}
11659 -+
11660 -+/*
11661 -+ * Called when the bfqq no longer has requests pending, remove it from
11662 -+ * the service tree.
11663 -+ */
11664 -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
11665 -+ int requeue)
11666 -+{
11667 -+ BUG_ON(!bfq_bfqq_busy(bfqq));
11668 -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
11669 -+
11670 -+ bfq_log_bfqq(bfqd, bfqq, "del from busy");
11671 -+
11672 -+ bfq_clear_bfqq_busy(bfqq);
11673 -+
11674 -+ BUG_ON(bfqd->busy_queues == 0);
11675 -+ bfqd->busy_queues--;
11676 -+ if (bfqq->raising_coeff > 1)
11677 -+ bfqd->raised_busy_queues--;
11678 -+
11679 -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue);
11680 -+}
11681 -+
11682 -+/*
11683 -+ * Called when an inactive queue receives a new request.
11684 -+ */
11685 -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
11686 -+{
11687 -+ BUG_ON(bfq_bfqq_busy(bfqq));
11688 -+ BUG_ON(bfqq == bfqd->in_service_queue);
11689 -+
11690 -+ bfq_log_bfqq(bfqd, bfqq, "add to busy");
11691 -+
11692 -+ bfq_activate_bfqq(bfqd, bfqq);
11693 -+
11694 -+ bfq_mark_bfqq_busy(bfqq);
11695 -+ bfqd->busy_queues++;
11696 -+ if (bfqq->raising_coeff > 1)
11697 -+ bfqd->raised_busy_queues++;
11698 -+}
11699 -diff --git a/block/bfq.h b/block/bfq.h
11700 -new file mode 100644
11701 -index 0000000..f9b5881
11702 ---- /dev/null
11703 -+++ b/block/bfq.h
11704 -@@ -0,0 +1,614 @@
11705 -+/*
11706 -+ * BFQ-v7r1 for 3.13.0: data structures and common functions prototypes.
11707 -+ *
11708 -+ * Based on ideas and code from CFQ:
11709 -+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk>
11710 -+ *
11711 -+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it>
11712 -+ * Paolo Valente <paolo.valente@×××××××.it>
11713 -+ *
11714 -+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it>
11715 -+ */
11716 -+
11717 -+#ifndef _BFQ_H
11718 -+#define _BFQ_H
11719 -+
11720 -+#include <linux/blktrace_api.h>
11721 -+#include <linux/hrtimer.h>
11722 -+#include <linux/ioprio.h>
11723 -+#include <linux/rbtree.h>
11724 -+
11725 -+#define BFQ_IOPRIO_CLASSES 3
11726 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
11727 -+
11728 -+#define BFQ_MIN_WEIGHT 1
11729 -+#define BFQ_MAX_WEIGHT 1000
11730 -+
11731 -+#define BFQ_DEFAULT_GRP_WEIGHT 10
11732 -+#define BFQ_DEFAULT_GRP_IOPRIO 0
11733 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
11734 -+
11735 -+struct bfq_entity;
11736 -+
11737 -+/**
11738 -+ * struct bfq_service_tree - per ioprio_class service tree.
11739 -+ * @active: tree for active entities (i.e., those backlogged).
11740 -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
11741 -+ * @first_idle: idle entity with minimum F_i.
11742 -+ * @last_idle: idle entity with maximum F_i.
11743 -+ * @vtime: scheduler virtual time.
11744 -+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
11745 -+ *
11746 -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
11747 -+ * ioprio_class has its own independent scheduler, and so its own
11748 -+ * bfq_service_tree. All the fields are protected by the queue lock
11749 -+ * of the containing bfqd.
11750 -+ */
11751 -+struct bfq_service_tree {
11752 -+ struct rb_root active;
11753 -+ struct rb_root idle;
11754 -+
11755 -+ struct bfq_entity *first_idle;
11756 -+ struct bfq_entity *last_idle;
11757 -+
11758 -+ u64 vtime;
11759 -+ unsigned long wsum;
11760 -+};
11761 -+
11762 -+/**
11763 -+ * struct bfq_sched_data - multi-class scheduler.
11764 -+ * @in_service_entity: entity under service.
11765 -+ * @next_in_service: head-of-the-line entity in the scheduler.
11766 -+ * @service_tree: array of service trees, one per ioprio_class.
11767 -+ *
11768 -+ * bfq_sched_data is the basic scheduler queue. It supports three
11769 -+ * ioprio_classes, and can be used either as a toplevel queue or as
11770 -+ * an intermediate queue on a hierarchical setup.
11771 -+ * @next_in_service points to the active entity of the sched_data
11772 -+ * service trees that will be scheduled next.
11773 -+ *
11774 -+ * The supported ioprio_classes are the same as in CFQ, in descending
11775 -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
11776 -+ * Requests from higher priority queues are served before all the
11777 -+ * requests from lower priority queues; among requests of the same
11778 -+ * queue requests are served according to B-WF2Q+.
11779 -+ * All the fields are protected by the queue lock of the containing bfqd.
11780 -+ */
11781 -+struct bfq_sched_data {
11782 -+ struct bfq_entity *in_service_entity;
11783 -+ struct bfq_entity *next_in_service;
11784 -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
11785 -+};
11786 -+
11787 -+/**
11788 -+ * struct bfq_entity - schedulable entity.
11789 -+ * @rb_node: service_tree member.
11790 -+ * @on_st: flag, true if the entity is on a tree (either the active or
11791 -+ * the idle one of its service_tree).
11792 -+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
11793 -+ * @start: B-WF2Q+ start timestamp (aka S_i).
11794 -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
11795 -+ * @min_start: minimum start time of the (active) subtree rooted at
11796 -+ * this entity; used for O(log N) lookups into active trees.
11797 -+ * @service: service received during the last round of service.
11798 -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
11799 -+ * @weight: weight of the queue
11800 -+ * @parent: parent entity, for hierarchical scheduling.
11801 -+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
11802 -+ * associated scheduler queue, %NULL on leaf nodes.
11803 -+ * @sched_data: the scheduler queue this entity belongs to.
11804 -+ * @ioprio: the ioprio in use.
11805 -+ * @new_weight: when a weight change is requested, the new weight value.
11806 -+ * @orig_weight: original weight, used to implement weight boosting
11807 -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
11808 -+ * @ioprio_class: the ioprio_class in use.
11809 -+ * @new_ioprio_class: when an ioprio_class change is requested, the new
11810 -+ * ioprio_class value.
11811 -+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
11812 -+ * ioprio_class change.
11813 -+ *
11814 -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
11815 -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
11816 -+ * entity belongs to the sched_data of the parent group in the cgroup
11817 -+ * hierarchy. Non-leaf entities have also their own sched_data, stored
11818 -+ * in @my_sched_data.
11819 -+ *
11820 -+ * Each entity stores independently its priority values; this would
11821 -+ * allow different weights on different devices, but this
11822 -+ * functionality is not exported to userspace by now. Priorities and
11823 -+ * weights are updated lazily, first storing the new values into the
11824 -+ * new_* fields, then setting the @ioprio_changed flag. As soon as
11825 -+ * there is a transition in the entity state that allows the priority
11826 -+ * update to take place the effective and the requested priority
11827 -+ * values are synchronized.
11828 -+ *
11829 -+ * Unless cgroups are used, the weight value is calculated from the
11830 -+ * ioprio to export the same interface as CFQ. When dealing with
11831 -+ * ``well-behaved'' queues (i.e., queues that do not spend too much
11832 -+ * time to consume their budget and have true sequential behavior, and
11833 -+ * when there are no external factors breaking anticipation) the
11834 -+ * relative weights at each level of the cgroups hierarchy should be
11835 -+ * guaranteed. All the fields are protected by the queue lock of the
11836 -+ * containing bfqd.
11837 -+ */
11838 -+struct bfq_entity {
11839 -+ struct rb_node rb_node;
11840 -+
11841 -+ int on_st;
11842 -+
11843 -+ u64 finish;
11844 -+ u64 start;
11845 -+
11846 -+ struct rb_root *tree;
11847 -+
11848 -+ u64 min_start;
11849 -+
11850 -+ unsigned long service, budget;
11851 -+ unsigned short weight, new_weight;
11852 -+ unsigned short orig_weight;
11853 -+
11854 -+ struct bfq_entity *parent;
11855 -+
11856 -+ struct bfq_sched_data *my_sched_data;
11857 -+ struct bfq_sched_data *sched_data;
11858 -+
11859 -+ unsigned short ioprio, new_ioprio;
11860 -+ unsigned short ioprio_class, new_ioprio_class;
11861 -+
11862 -+ int ioprio_changed;
11863 -+};
11864 -+
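As documented above, @budget ties into the B-WF2Q+ timestamps through F_i = S_i + @budget / @weight. A deliberately simplified sketch of that relation, using only the fields of struct bfq_entity (the real bfq_calc_finish() helper is not shown in this hunk and may apply additional internal scaling):

	/* Sketch only: computes a finish time from the documented
	 * relation F_i = S_i + budget / weight; illustrative, not the
	 * in-tree implementation. */
	static inline u64 bfq_sketch_finish_time(const struct bfq_entity *entity)
	{
		return entity->start + entity->budget / entity->weight;
	}
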
11865 -+struct bfq_group;
11866 -+
11867 -+/**
11868 -+ * struct bfq_queue - leaf schedulable entity.
11869 -+ * @ref: reference counter.
11870 -+ * @bfqd: parent bfq_data.
11871 -+ * @new_bfqq: shared bfq_queue if queue is cooperating with
11872 -+ * one or more other queues.
11873 -+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
11874 -+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
11875 -+ * @sort_list: sorted list of pending requests.
11876 -+ * @next_rq: if fifo isn't expired, next request to serve.
11877 -+ * @queued: nr of requests queued in @sort_list.
11878 -+ * @allocated: currently allocated requests.
11879 -+ * @meta_pending: pending metadata requests.
11880 -+ * @fifo: fifo list of requests in sort_list.
11881 -+ * @entity: entity representing this queue in the scheduler.
11882 -+ * @max_budget: maximum budget allowed from the feedback mechanism.
11883 -+ * @budget_timeout: budget expiration (in jiffies).
11884 -+ * @dispatched: number of requests on the dispatch list or inside driver.
11885 -+ * @org_ioprio: saved ioprio during boosted periods.
11886 -+ * @flags: status flags.
11887 -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
11888 -+ * @seek_samples: number of seeks sampled
11889 -+ * @seek_total: sum of the distances of the seeks sampled
11890 -+ * @seek_mean: mean seek distance
11891 -+ * @last_request_pos: position of the last request enqueued
11892 -+ * @pid: pid of the process owning the queue, used for logging purposes.
11893 -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt
11894 -+ * @raising_cur_max_time: current max raising time for this queue
11895 -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
11896 -+ * idle to backlogged
11897 -+ * @service_from_backlogged: cumulative service received from the @bfq_queue
11898 -+ * since the last transition from idle to backlogged
11899 -+ *
11900 -+ * A bfq_queue is a leaf request queue; it can be associated to an io_context
11901 -+ * or more (if it is an async one). @cgroup holds a reference to the
11902 -+ * cgroup, to be sure that it does not disappear while a bfqq still
11903 -+ * references it (mostly to avoid races between request issuing and task
11904 -+ * migration followed by cgroup destruction).
11905 -+ * All the fields are protected by the queue lock of the containing bfqd.
11906 -+ */
11907 -+struct bfq_queue {
11908 -+ atomic_t ref;
11909 -+ struct bfq_data *bfqd;
11910 -+
11911 -+ /* fields for cooperating queues handling */
11912 -+ struct bfq_queue *new_bfqq;
11913 -+ struct rb_node pos_node;
11914 -+ struct rb_root *pos_root;
11915 -+
11916 -+ struct rb_root sort_list;
11917 -+ struct request *next_rq;
11918 -+ int queued[2];
11919 -+ int allocated[2];
11920 -+ int meta_pending;
11921 -+ struct list_head fifo;
11922 -+
11923 -+ struct bfq_entity entity;
11924 -+
11925 -+ unsigned long max_budget;
11926 -+ unsigned long budget_timeout;
11927 -+
11928 -+ int dispatched;
11929 -+
11930 -+ unsigned short org_ioprio;
11931 -+
11932 -+ unsigned int flags;
11933 -+
11934 -+ struct list_head bfqq_list;
11935 -+
11936 -+ unsigned int seek_samples;
11937 -+ u64 seek_total;
11938 -+ sector_t seek_mean;
11939 -+ sector_t last_request_pos;
11940 -+
11941 -+ pid_t pid;
11942 -+
11943 -+ /* weight-raising fields */
11944 -+ unsigned long raising_cur_max_time;
11945 -+ unsigned long soft_rt_next_start;
11946 -+ unsigned long last_rais_start_finish;
11947 -+ unsigned int raising_coeff;
11948 -+ unsigned long last_idle_bklogged;
11949 -+ unsigned long service_from_backlogged;
11950 -+};
11951 -+
11952 -+/**
11953 -+ * struct bfq_ttime - per process thinktime stats.
11954 -+ * @ttime_total: total process thinktime
11955 -+ * @ttime_samples: number of thinktime samples
11956 -+ * @ttime_mean: average process thinktime
11957 -+ */
11958 -+struct bfq_ttime {
11959 -+ unsigned long last_end_request;
11960 -+
11961 -+ unsigned long ttime_total;
11962 -+ unsigned long ttime_samples;
11963 -+ unsigned long ttime_mean;
11964 -+};
11965 -+
11966 -+/**
11967 -+ * struct bfq_io_cq - per (request_queue, io_context) structure.
11968 -+ * @icq: associated io_cq structure
11969 -+ * @bfqq: array of two process queues, the sync and the async
11970 -+ * @ttime: associated @bfq_ttime struct
11971 -+ */
11972 -+struct bfq_io_cq {
11973 -+ struct io_cq icq; /* must be the first member */
11974 -+ struct bfq_queue *bfqq[2];
11975 -+ struct bfq_ttime ttime;
11976 -+ int ioprio;
11977 -+};
11978 -+
11979 -+/**
11980 -+ * struct bfq_data - per device data structure.
11981 -+ * @queue: request queue for the managed device.
11982 -+ * @root_group: root bfq_group for the device.
11983 -+ * @rq_pos_tree: rbtree sorted by next_request position,
11984 -+ * used when determining if two or more queues
11985 -+ * have interleaving requests (see bfq_close_cooperator).
11986 -+ * @busy_queues: number of bfq_queues containing requests (including the
11987 -+ * queue under service, even if it is idling).
11988 -+ * @raised_busy_queues: number of weight-raised busy bfq_queues.
11989 -+ * @queued: number of queued requests.
11990 -+ * @rq_in_driver: number of requests dispatched and waiting for completion.
11991 -+ * @sync_flight: number of sync requests in the driver.
11992 -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples
11993 -+ * completed requests.
11994 -+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
11995 -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
11996 -+ * @budgets_assigned: number of budgets assigned.
11997 -+ * @idle_slice_timer: timer set when idling for the next sequential request
11998 -+ * from the queue under service.
11999 -+ * @unplug_work: delayed work to restart dispatching on the request queue.
12000 -+ * @in_service_queue: bfq_queue under service.
12001 -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
12002 -+ * @last_position: on-disk position of the last served request.
12003 -+ * @last_budget_start: beginning of the last budget.
12004 -+ * @last_idling_start: beginning of the last idle slice.
12005 -+ * @peak_rate: peak transfer rate observed for a budget.
12006 -+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
12007 -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling.
12008 -+ * @group_list: list of all the bfq_groups active on the device.
12009 -+ * @active_list: list of all the bfq_queues active on the device.
12010 -+ * @idle_list: list of all the bfq_queues idle on the device.
12011 -+ * @bfq_quantum: max number of requests dispatched per dispatch round.
12012 -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
12013 -+ * requests are served in fifo order.
12014 -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
12015 -+ * @bfq_back_max: maximum allowed backward seek.
12016 -+ * @bfq_slice_idle: maximum idling time.
12017 -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning).
12018 -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
12019 -+ * async queues.
12020 -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
12021 -+ * prevent seeky queues from imposing long latencies on well
12022 -+ * behaved ones (this also implies that seeky queues cannot
12023 -+ * receive guarantees in the service domain; after a timeout
12024 -+ * they are charged for the whole allocated budget, to try
12025 -+ * to preserve a behavior reasonably fair among them, but
12026 -+ * without service-domain guarantees).
12027 -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted
12028 -+ * queue is multiplied
12029 -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies)
12030 -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes
12031 -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising
12032 -+ * may be reactivated for a queue (in jiffies)
12033 -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals
12034 -+ * after which weight-raising may be
12035 -+ * reactivated for an already busy queue
12036 -+ * (in jiffies)
12037 -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue,
12038 -+ * in sectors per second
12039 -+ * @RT_prod: cached value of the product R*T used for computing the maximum
12040 -+ * duration of the weight raising automatically
12041 -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions
12042 -+ *
12043 -+ * All the fields are protected by the @queue lock.
12044 -+ */
12045 -+struct bfq_data {
12046 -+ struct request_queue *queue;
12047 -+
12048 -+ struct bfq_group *root_group;
12049 -+
12050 -+ struct rb_root rq_pos_tree;
12051 -+
12052 -+ int busy_queues;
12053 -+ int raised_busy_queues;
12054 -+ int queued;
12055 -+ int rq_in_driver;
12056 -+ int sync_flight;
12057 -+
12058 -+ int max_rq_in_driver;
12059 -+ int hw_tag_samples;
12060 -+ int hw_tag;
12061 -+
12062 -+ int budgets_assigned;
12063 -+
12064 -+ struct timer_list idle_slice_timer;
12065 -+ struct work_struct unplug_work;
12066 -+
12067 -+ struct bfq_queue *in_service_queue;
12068 -+ struct bfq_io_cq *in_service_bic;
12069 -+
12070 -+ sector_t last_position;
12071 -+
12072 -+ ktime_t last_budget_start;
12073 -+ ktime_t last_idling_start;
12074 -+ int peak_rate_samples;
12075 -+ u64 peak_rate;
12076 -+ unsigned long bfq_max_budget;
12077 -+
12078 -+ struct hlist_head group_list;
12079 -+ struct list_head active_list;
12080 -+ struct list_head idle_list;
12081 -+
12082 -+ unsigned int bfq_quantum;
12083 -+ unsigned int bfq_fifo_expire[2];
12084 -+ unsigned int bfq_back_penalty;
12085 -+ unsigned int bfq_back_max;
12086 -+ unsigned int bfq_slice_idle;
12087 -+ u64 bfq_class_idle_last_service;
12088 -+
12089 -+ unsigned int bfq_user_max_budget;
12090 -+ unsigned int bfq_max_budget_async_rq;
12091 -+ unsigned int bfq_timeout[2];
12092 -+
12093 -+ bool low_latency;
12094 -+
12095 -+ /* parameters of the low_latency heuristics */
12096 -+ unsigned int bfq_raising_coeff;
12097 -+ unsigned int bfq_raising_max_time;
12098 -+ unsigned int bfq_raising_rt_max_time;
12099 -+ unsigned int bfq_raising_min_idle_time;
12100 -+ unsigned long bfq_raising_min_inter_arr_async;
12101 -+ unsigned int bfq_raising_max_softrt_rate;
12102 -+ u64 RT_prod;
12103 -+
12104 -+ struct bfq_queue oom_bfqq;
12105 -+};
12106 -+
12107 -+enum bfqq_state_flags {
12108 -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */
12109 -+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */
12110 -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
12111 -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
12112 -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
12113 -+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */
12114 -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */
12115 -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
12116 -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
12117 -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
12118 -+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
12119 -+};
12120 -+
12121 -+#define BFQ_BFQQ_FNS(name) \
12122 -+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
12123 -+{ \
12124 -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \
12125 -+} \
12126 -+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
12127 -+{ \
12128 -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \
12129 -+} \
12130 -+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
12131 -+{ \
12132 -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \
12133 -+}
12134 -+
12135 -+BFQ_BFQQ_FNS(busy);
12136 -+BFQ_BFQQ_FNS(wait_request);
12137 -+BFQ_BFQQ_FNS(must_alloc);
12138 -+BFQ_BFQQ_FNS(fifo_expire);
12139 -+BFQ_BFQQ_FNS(idle_window);
12140 -+BFQ_BFQQ_FNS(prio_changed);
12141 -+BFQ_BFQQ_FNS(sync);
12142 -+BFQ_BFQQ_FNS(budget_new);
12143 -+BFQ_BFQQ_FNS(coop);
12144 -+BFQ_BFQQ_FNS(split_coop);
12145 -+BFQ_BFQQ_FNS(softrt_update);
12146 -+#undef BFQ_BFQQ_FNS
12147 -+
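For reference, BFQ_BFQQ_FNS(busy) above expands to the three helpers below; every other flag gets the same trio, differing only in the flag name:

	static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
	{
		(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_busy);
	}
	static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
	{
		(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
	}
	static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
	{
		return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
	}
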
12148 -+/* Logging facilities. */
12149 -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
12150 -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
12151 -+
12152 -+#define bfq_log(bfqd, fmt, args...) \
12153 -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
12154 -+
12155 -+/* Expiration reasons. */
12156 -+enum bfqq_expiration {
12157 -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */
12158 -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */
12159 -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */
12160 -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */
12161 -+};
12162 -+
12163 -+#ifdef CONFIG_CGROUP_BFQIO
12164 -+/**
12165 -+ * struct bfq_group - per (device, cgroup) data structure.
12166 -+ * @entity: schedulable entity to insert into the parent group sched_data.
12167 -+ * @sched_data: own sched_data, to contain child entities (they may be
12168 -+ * both bfq_queues and bfq_groups).
12169 -+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
12170 -+ * list of the containing cgroup's bfqio_cgroup.
12171 -+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
12172 -+ * of the groups active on the same device; used for cleanup.
12173 -+ * @bfqd: the bfq_data for the device this group acts upon.
12174 -+ * @async_bfqq: array of async queues for all the tasks belonging to
12175 -+ * the group, one queue per ioprio value per ioprio_class,
12176 -+ * except for the idle class that has only one queue.
12177 -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
12178 -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
12179 -+ * to avoid too many special cases during group creation/migration.
12180 -+ *
12181 -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
12182 -+ * there is a set of bfq_groups, each one collecting the lower-level
12183 -+ * entities belonging to the group that are acting on the same device.
12184 -+ *
12185 -+ * Locking works as follows:
12186 -+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed
12187 -+ * via RCU from its readers.
12188 -+ * o @bfqd is protected by the queue lock, RCU is used to access it
12189 -+ * from the readers.
12190 -+ * o All the other fields are protected by the @bfqd queue lock.
12191 -+ */
12192 -+struct bfq_group {
12193 -+ struct bfq_entity entity;
12194 -+ struct bfq_sched_data sched_data;
12195 -+
12196 -+ struct hlist_node group_node;
12197 -+ struct hlist_node bfqd_node;
12198 -+
12199 -+ void *bfqd;
12200 -+
12201 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12202 -+ struct bfq_queue *async_idle_bfqq;
12203 -+
12204 -+ struct bfq_entity *my_entity;
12205 -+};
12206 -+
12207 -+/**
12208 -+ * struct bfqio_cgroup - bfq cgroup data structure.
12209 -+ * @css: subsystem state for bfq in the containing cgroup.
12210 -+ * @online: flag marked when the subsystem is inserted.
12211 -+ * @weight: cgroup weight.
12212 -+ * @ioprio: cgroup ioprio.
12213 -+ * @ioprio_class: cgroup ioprio_class.
12214 -+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
12215 -+ * @group_data: list containing the bfq_group belonging to this cgroup.
12216 -+ *
12217 -+ * @group_data is accessed using RCU, with @lock protecting the updates,
12218 -+ * @ioprio and @ioprio_class are protected by @lock.
12219 -+ */
12220 -+struct bfqio_cgroup {
12221 -+ struct cgroup_subsys_state css;
12222 -+ bool online;
12223 -+
12224 -+ unsigned short weight, ioprio, ioprio_class;
12225 -+
12226 -+ spinlock_t lock;
12227 -+ struct hlist_head group_data;
12228 -+};
12229 -+#else
12230 -+struct bfq_group {
12231 -+ struct bfq_sched_data sched_data;
12232 -+
12233 -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
12234 -+ struct bfq_queue *async_idle_bfqq;
12235 -+};
12236 -+#endif
12237 -+
12238 -+static inline struct bfq_service_tree *
12239 -+bfq_entity_service_tree(struct bfq_entity *entity)
12240 -+{
12241 -+ struct bfq_sched_data *sched_data = entity->sched_data;
12242 -+ unsigned int idx = entity->ioprio_class - 1;
12243 -+
12244 -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
12245 -+ BUG_ON(sched_data == NULL);
12246 -+
12247 -+ return sched_data->service_tree + idx;
12248 -+}
12249 -+
12250 -+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
12251 -+ int is_sync)
12252 -+{
12253 -+ return bic->bfqq[!!is_sync];
12254 -+}
12255 -+
12256 -+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
12257 -+ struct bfq_queue *bfqq, int is_sync)
12258 -+{
12259 -+ bic->bfqq[!!is_sync] = bfqq;
12260 -+}
12261 -+
12262 -+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
12263 -+{
12264 -+ return bic->icq.q->elevator->elevator_data;
12265 -+}
12266 -+
12267 -+/**
12268 -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
12269 -+ * @ptr: a pointer to a bfqd.
12270 -+ * @flags: storage for the flags to be saved.
12271 -+ *
12272 -+ * This function allows bfqg->bfqd to be protected by the
12273 -+ * queue lock of the bfqd they reference; the pointer is dereferenced
12274 -+ * under RCU, so the storage for bfqd is assured to be safe as long
12275 -+ * as the RCU read side critical section does not end. After the
12276 -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
12277 -+ * sure that no other writer accessed it. If we raced with a writer,
12278 -+ * the function returns NULL, with the queue unlocked, otherwise it
12279 -+ * returns the dereferenced pointer, with the queue locked.
12280 -+ */
12281 -+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
12282 -+ unsigned long *flags)
12283 -+{
12284 -+ struct bfq_data *bfqd;
12285 -+
12286 -+ rcu_read_lock();
12287 -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr);
12288 -+
12289 -+ if (bfqd != NULL) {
12290 -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
12291 -+ if (*ptr == bfqd)
12292 -+ goto out;
12293 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12294 -+ }
12295 -+
12296 -+ bfqd = NULL;
12297 -+out:
12298 -+ rcu_read_unlock();
12299 -+ return bfqd;
12300 -+}
12301 -+
12302 -+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
12303 -+ unsigned long *flags)
12304 -+{
12305 -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
12306 -+}
12307 -+
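A sketch of how a caller would typically use the pair above; the function name and surrounding group code are hypothetical, and the void *bfqd being dereferenced is the member declared in the CONFIG_CGROUP_BFQIO variant of struct bfq_group:

	/* Hypothetical caller: look up the bfqd a group points to and,
	 * if the RCU/lock dance succeeds, work under its queue lock. */
	static void bfq_example_touch_group(struct bfq_group *bfqg)
	{
		unsigned long flags;
		struct bfq_data *bfqd;

		bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
		if (bfqd != NULL) {
			/* bfqd->queue->queue_lock is held here. */
			bfq_put_bfqd_unlock(bfqd, &flags);
		}
	}
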
12308 -+static void bfq_changed_ioprio(struct bfq_io_cq *bic);
12309 -+static void bfq_put_queue(struct bfq_queue *bfqq);
12310 -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
12311 -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
12312 -+ struct bfq_group *bfqg, int is_sync,
12313 -+ struct bfq_io_cq *bic, gfp_t gfp_mask);
12314 -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd,
12315 -+ struct bfq_group *bfqg);
12316 -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
12317 -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
12318 -+#endif
12319 ---
12320 -1.8.5.2
12321 -
12322
12323 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
12324 ===================================================================
12325 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-03-26 23:50:52 UTC (rev 2715)
12326 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-03-31 12:03:14 UTC (rev 2716)
12327 @@ -1,1034 +0,0 @@
12328 -From 3cd9e2ea29c3ba9e420556e8ecf161d166186b63 Mon Sep 17 00:00:00 2001
12329 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
12330 -Date: Thu, 23 Jan 2014 16:54:44 +0100
12331 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7 for
12332 - 3.13.0
12333 -
12334 -A set of processes may happen to perform interleaved reads, i.e., requests
12335 -whose union would give rise to a sequential read pattern. There are two
12336 -typical cases: in the first case, processes read fixed-size chunks of
12337 -data at a fixed distance from each other, while in the second case processes
12338 -may read variable-size chunks at variable distances. The latter case occurs
12339 -for example with KVM, which splits the I/O generated by the guest into
12340 -multiple chunks, and lets these chunks be served by a pool of cooperating
12341 -processes, iteratively assigning the next chunk of I/O to the first
12342 -available process. CFQ uses actual queue merging for the first type of
12343 -processes, whereas it uses preemption to get a sequential read pattern out
12344 -of the read requests performed by the second type of processes. In the end
12345 -it uses two different mechanisms to achieve the same goal: boosting the
12346 -throughput with interleaved I/O.
12347 -
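As an illustration of the first case (offsets purely made up): process A reads 128-sector chunks at sectors 0, 256, 512, ... while process B reads the chunks in between at sectors 128, 384, 640, ...; neither stream is sequential on its own, but their union covers 0, 128, 256, 384, ... without gaps. In the KVM-like second case the chunk sizes and distances vary from request to request, yet the union of the cooperating workers' requests is again a sequential pattern.
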
12348 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
12349 -sequential read pattern with both types of processes. The main idea is
12350 -checking newly arrived requests against the next request of the active queue
12351 -both in case of actual request insert and in case of request merge. By doing
12352 -so, both the types of processes can be handled by just merging their queues.
12353 -EQM is then simpler and more compact than the pair of mechanisms used in
12354 -CFQ.
12355 -
12356 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
12357 -properly restoring the weight-raising state of a queue when it gets back to
12358 -a non-merged state.
12359 -
12360 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
12361 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
12362 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
12363 ----
12364 - block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
12365 - block/bfq-sched.c | 28 ---
12366 - block/bfq.h | 16 ++
12367 - 3 files changed, 474 insertions(+), 227 deletions(-)
12368 -
12369 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
12370 -index 7670400..295236e 100644
12371 ---- a/block/bfq-iosched.c
12372 -+++ b/block/bfq-iosched.c
12373 -@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
12374 - return dur;
12375 - }
12376 -
12377 -+static inline void
12378 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
12379 -+{
12380 -+ if (bic->saved_idle_window)
12381 -+ bfq_mark_bfqq_idle_window(bfqq);
12382 -+ else
12383 -+ bfq_clear_bfqq_idle_window(bfqq);
12384 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
12385 -+ /*
12386 -+ * Start a weight raising period with the duration given by
12387 -+ * the raising_time_left snapshot.
12388 -+ */
12389 -+ if (bfq_bfqq_busy(bfqq))
12390 -+ bfqq->bfqd->raised_busy_queues++;
12391 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
12392 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
12393 -+ bfqq->last_rais_start_finish = jiffies;
12394 -+ bfqq->entity.ioprio_changed = 1;
12395 -+ }
12396 -+ /*
12397 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
12398 -+ * getting confused about the queue's need of a weight-raising
12399 -+ * period.
12400 -+ */
12401 -+ bic->raising_time_left = 0;
12402 -+}
12403 -+
12404 -+/*
12405 -+ * Must be called with the queue_lock held.
12406 -+ */
12407 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
12408 -+{
12409 -+ int process_refs, io_refs;
12410 -+
12411 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12412 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12413 -+ BUG_ON(process_refs < 0);
12414 -+ return process_refs;
12415 -+}
12416 -+
12417 - static void bfq_add_rq_rb(struct request *rq)
12418 - {
12419 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
12420 -@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
12421 - if (!bfqd->low_latency)
12422 - goto add_bfqq_busy;
12423 -
12424 -+ if (bfq_bfqq_just_split(bfqq))
12425 -+ goto set_ioprio_changed;
12426 -+
12427 - /*
12428 -- * If the queue is not being boosted and has been idle
12429 -- * for enough time, start a weight-raising period
12430 -+ * If the queue:
12431 -+ * - is not being boosted,
12432 -+ * - has been idle for enough time,
12433 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
12434 -+ * shared "for its nature" or it is not shared and its
12435 -+ * requests have not been redirected to a shared queue)
12436 -+ * start a weight-raising period.
12437 - */
12438 -- if (old_raising_coeff == 1 &&
12439 -- (idle_for_long_time || soft_rt)) {
12440 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
12441 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
12442 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
12443 - if (idle_for_long_time)
12444 - bfqq->raising_cur_max_time =
12445 -@@ -572,6 +620,7 @@ static void bfq_add_rq_rb(struct request *rq)
12446 - bfqd->bfq_raising_rt_max_time;
12447 - }
12448 - }
12449 -+set_ioprio_changed:
12450 - if (old_raising_coeff != bfqq->raising_coeff)
12451 - entity->ioprio_changed = 1;
12452 - add_bfqq_busy:
12453 -@@ -754,90 +803,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
12454 - spin_unlock_irq(bfqd->queue->queue_lock);
12455 - }
12456 -
12457 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12458 -- struct bio *bio)
12459 --{
12460 -- struct bfq_data *bfqd = q->elevator->elevator_data;
12461 -- struct bfq_io_cq *bic;
12462 -- struct bfq_queue *bfqq;
12463 --
12464 -- /*
12465 -- * Disallow merge of a sync bio into an async request.
12466 -- */
12467 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12468 -- return 0;
12469 --
12470 -- /*
12471 -- * Lookup the bfqq that this bio will be queued with. Allow
12472 -- * merge only if rq is queued there.
12473 -- * Queue lock is held here.
12474 -- */
12475 -- bic = bfq_bic_lookup(bfqd, current->io_context);
12476 -- if (bic == NULL)
12477 -- return 0;
12478 --
12479 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12480 -- return bfqq == RQ_BFQQ(rq);
12481 --}
12482 --
12483 --static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
12484 -- struct bfq_queue *bfqq)
12485 --{
12486 -- if (bfqq != NULL) {
12487 -- bfq_mark_bfqq_must_alloc(bfqq);
12488 -- bfq_mark_bfqq_budget_new(bfqq);
12489 -- bfq_clear_bfqq_fifo_expire(bfqq);
12490 --
12491 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12492 --
12493 -- bfq_log_bfqq(bfqd, bfqq,
12494 -- "set_in_service_queue, cur-budget = %lu",
12495 -- bfqq->entity.budget);
12496 -- }
12497 --
12498 -- bfqd->in_service_queue = bfqq;
12499 --}
12500 --
12501 --/*
12502 -- * Get and set a new queue for service.
12503 -- */
12504 --static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
12505 -- struct bfq_queue *bfqq)
12506 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
12507 - {
12508 -- if (!bfqq)
12509 -- bfqq = bfq_get_next_queue(bfqd);
12510 -+ if (request)
12511 -+ return blk_rq_pos(io_struct);
12512 - else
12513 -- bfq_get_next_queue_forced(bfqd, bfqq);
12514 --
12515 -- __bfq_set_in_service_queue(bfqd, bfqq);
12516 -- return bfqq;
12517 -+ return ((struct bio *)io_struct)->bi_sector;
12518 - }
12519 -
12520 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
12521 -- struct request *rq)
12522 -+static inline sector_t bfq_dist_from(sector_t pos1,
12523 -+ sector_t pos2)
12524 - {
12525 -- if (blk_rq_pos(rq) >= bfqd->last_position)
12526 -- return blk_rq_pos(rq) - bfqd->last_position;
12527 -+ if (pos1 >= pos2)
12528 -+ return pos1 - pos2;
12529 - else
12530 -- return bfqd->last_position - blk_rq_pos(rq);
12531 -+ return pos2 - pos1;
12532 - }
12533 -
12534 --/*
12535 -- * Return true if bfqq has no request pending and rq is close enough to
12536 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
12537 -- * bfqq->next_rq
12538 -- */
12539 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
12540 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
12541 -+ sector_t sector)
12542 - {
12543 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
12544 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
12545 -+ BFQQ_SEEK_THR;
12546 - }
12547 -
12548 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12549 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
12550 - {
12551 - struct rb_root *root = &bfqd->rq_pos_tree;
12552 - struct rb_node *parent, *node;
12553 - struct bfq_queue *__bfqq;
12554 -- sector_t sector = bfqd->last_position;
12555 -
12556 - if (RB_EMPTY_ROOT(root))
12557 - return NULL;
12558 -@@ -856,7 +850,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12559 - * position).
12560 - */
12561 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
12562 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12563 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12564 - return __bfqq;
12565 -
12566 - if (blk_rq_pos(__bfqq->next_rq) < sector)
12567 -@@ -867,7 +861,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12568 - return NULL;
12569 -
12570 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
12571 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
12572 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
12573 - return __bfqq;
12574 -
12575 - return NULL;
12576 -@@ -876,14 +870,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
12577 - /*
12578 - * bfqd - obvious
12579 - * cur_bfqq - passed in so that we don't decide that the current queue
12580 -- * is closely cooperating with itself.
12581 -- *
12582 -- * We are assuming that cur_bfqq has dispatched at least one request,
12583 -- * and that bfqd->last_position reflects a position on the disk associated
12584 -- * with the I/O issued by cur_bfqq.
12585 -+ * is closely cooperating with itself
12586 -+ * sector - used as a reference point to search for a close queue
12587 - */
12588 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12589 -- struct bfq_queue *cur_bfqq)
12590 -+ struct bfq_queue *cur_bfqq,
12591 -+ sector_t sector)
12592 - {
12593 - struct bfq_queue *bfqq;
12594 -
12595 -@@ -903,7 +895,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12596 - * working closely on the same area of the disk. In that case,
12597 - * we can group them together and don't waste time idling.
12598 - */
12599 -- bfqq = bfqq_close(bfqd);
12600 -+ bfqq = bfqq_close(bfqd, sector);
12601 - if (bfqq == NULL || bfqq == cur_bfqq)
12602 - return NULL;
12603 -
12604 -@@ -930,6 +922,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
12605 - return bfqq;
12606 - }
12607 -
12608 -+static struct bfq_queue *
12609 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12610 -+{
12611 -+ int process_refs, new_process_refs;
12612 -+ struct bfq_queue *__bfqq;
12613 -+
12614 -+ /*
12615 -+ * If there are no process references on the new_bfqq, then it is
12616 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12617 -+ * may have dropped their last reference (not just their last process
12618 -+ * reference).
12619 -+ */
12620 -+ if (!bfqq_process_refs(new_bfqq))
12621 -+ return NULL;
12622 -+
12623 -+ /* Avoid a circular list and skip interim queue merges. */
12624 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
12625 -+ if (__bfqq == bfqq)
12626 -+ return NULL;
12627 -+ new_bfqq = __bfqq;
12628 -+ }
12629 -+
12630 -+ process_refs = bfqq_process_refs(bfqq);
12631 -+ new_process_refs = bfqq_process_refs(new_bfqq);
12632 -+ /*
12633 -+ * If the process for the bfqq has gone away, there is no
12634 -+ * sense in merging the queues.
12635 -+ */
12636 -+ if (process_refs == 0 || new_process_refs == 0)
12637 -+ return NULL;
12638 -+
12639 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12640 -+ new_bfqq->pid);
12641 -+
12642 -+ /*
12643 -+ * Merging is just a redirection: the requests of the process owning
12644 -+ * one of the two queues are redirected to the other queue. The latter
12645 -+ * queue, in its turn, is set as shared if this is the first time that
12646 -+ * the requests of some process are redirected to it.
12647 -+ *
12648 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
12649 -+ * are in the context of the process owning bfqq, hence we have the
12650 -+ * io_cq of this process. So we can immediately configure this io_cq
12651 -+ * to redirect the requests of the process to new_bfqq.
12652 -+ *
12653 -+ * NOTE, even if new_bfqq coincides with the in-service queue, the
12654 -+ * io_cq of new_bfqq is not available, because, if the in-service queue
12655 -+ * is shared, bfqd->in_service_bic may not point to the io_cq of the
12656 -+ * in-service queue.
12657 -+ * Redirecting the requests of the process owning bfqq to the currently
12658 -+ * in-service queue is in any case the best option, as we feed the
12659 -+ * in-service queue with new requests close to the last request served
12660 -+ * and, by doing so, hopefully increase the throughput.
12661 -+ */
12662 -+ bfqq->new_bfqq = new_bfqq;
12663 -+ atomic_add(process_refs, &new_bfqq->ref);
12664 -+ return new_bfqq;
12665 -+}
12666 -+
12667 -+/*
12668 -+ * Attempt to schedule a merge of bfqq with the currently in-service queue or
12669 -+ * with a close queue among the scheduled queues.
12670 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
12671 -+ * structure otherwise.
12672 -+ */
12673 -+static struct bfq_queue *
12674 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
12675 -+ void *io_struct, bool request)
12676 -+{
12677 -+ struct bfq_queue *in_service_bfqq, *new_bfqq;
12678 -+
12679 -+ if (bfqq->new_bfqq)
12680 -+ return bfqq->new_bfqq;
12681 -+
12682 -+ if (!io_struct)
12683 -+ return NULL;
12684 -+
12685 -+ in_service_bfqq = bfqd->in_service_queue;
12686 -+
12687 -+ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
12688 -+ !bfqd->in_service_bic)
12689 -+ goto check_scheduled;
12690 -+
12691 -+ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
12692 -+ goto check_scheduled;
12693 -+
12694 -+ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
12695 -+ goto check_scheduled;
12696 -+
12697 -+ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
12698 -+ goto check_scheduled;
12699 -+
12700 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
12701 -+ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
12702 -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
12703 -+ if (new_bfqq != NULL)
12704 -+ return new_bfqq; /* Merge with the in-service queue */
12705 -+ }
12706 -+
12707 -+ /*
12708 -+ * Check whether there is a cooperator among currently scheduled
12709 -+ * queues. The only thing we need is that the bio/request is not
12710 -+ * NULL, as we need it to establish whether a cooperator exists.
12711 -+ */
12712 -+check_scheduled:
12713 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
12714 -+ bfq_io_struct_pos(io_struct, request));
12715 -+ if (new_bfqq)
12716 -+ return bfq_setup_merge(bfqq, new_bfqq);
12717 -+
12718 -+ return NULL;
12719 -+}
12720 -+
12721 -+static inline void
12722 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
12723 -+{
12724 -+ /*
12725 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
12726 -+ * have already been redirected to a shared queue; both idle window
12727 -+ * and weight raising state have already been saved. Do nothing.
12728 -+ */
12729 -+ if (bfqq->bic == NULL)
12730 -+ return;
12731 -+ if (bfqq->bic->raising_time_left)
12732 -+ /*
12733 -+ * This is the queue of a just-started process, and would
12734 -+ * deserve weight raising: we set raising_time_left to the full
12735 -+ * weight-raising duration to trigger weight-raising when and
12736 -+ * if the queue is split and the first request of the queue
12737 -+ * is enqueued.
12738 -+ */
12739 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
12740 -+ else if (bfqq->raising_coeff > 1) {
12741 -+ unsigned long wrais_duration =
12742 -+ jiffies - bfqq->last_rais_start_finish;
12743 -+ /*
12744 -+ * It may happen that a queue's weight raising period lasts
12745 -+ * longer than its raising_cur_max_time, as weight raising is
12746 -+ * handled only when a request is enqueued or dispatched (it
12747 -+ * does not use any timer). If the weight raising period is
12748 -+ * about to end, don't save it.
12749 -+ */
12750 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
12751 -+ bfqq->bic->raising_time_left = 0;
12752 -+ else
12753 -+ bfqq->bic->raising_time_left =
12754 -+ bfqq->raising_cur_max_time - wrais_duration;
12755 -+ /*
12756 -+ * The bfq_queue is becoming shared or the requests of the
12757 -+ * process owning the queue are being redirected to a shared
12758 -+ * queue. Stop the weight raising period of the queue, as in
12759 -+ * both cases it should not be owned by an interactive or soft
12760 -+ * real-time application.
12761 -+ */
12762 -+ bfq_bfqq_end_raising(bfqq);
12763 -+ } else
12764 -+ bfqq->bic->raising_time_left = 0;
12765 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
12766 -+}
12767 -+
12768 -+static inline void
12769 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
12770 -+{
12771 -+ /*
12772 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
12773 -+ * is about to begin using a shared bfq_queue.
12774 -+ */
12775 -+ if (bfqq->bic)
12776 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
12777 -+}
12778 -+
12779 -+static void
12780 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
12781 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12782 -+{
12783 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
12784 -+ (long unsigned)new_bfqq->pid);
12785 -+ /* Save weight raising and idle window of the merged queues */
12786 -+ bfq_bfqq_save_state(bfqq);
12787 -+ bfq_bfqq_save_state(new_bfqq);
12788 -+ /*
12789 -+ * Grab a reference to the bic, to prevent it from being destroyed
12790 -+ * before being possibly touched by a bfq_split_bfqq().
12791 -+ */
12792 -+ bfq_get_bic_reference(bfqq);
12793 -+ bfq_get_bic_reference(new_bfqq);
12794 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
12795 -+ bic_set_bfqq(bic, new_bfqq, 1);
12796 -+ bfq_mark_bfqq_coop(new_bfqq);
12797 -+ /*
12798 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
12799 -+ * new_bfqq->bic to NULL. bfqq either:
12800 -+ * - does not belong to any bic any more, and hence bfqq->bic must
12801 -+ * be set to NULL, or
12802 -+ * - is a queue whose owning bics have already been redirected to a
12803 -+ * different queue, hence the queue is destined to not belong to any
12804 -+ * bic soon and bfqq->bic is already NULL (therefore the next
12805 -+ * assignment causes no harm).
12806 -+ */
12807 -+ new_bfqq->bic = NULL;
12808 -+ bfqq->bic = NULL;
12809 -+ bfq_put_queue(bfqq);
12810 -+}
12811 -+
12812 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
12813 -+ struct bio *bio)
12814 -+{
12815 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
12816 -+ struct bfq_io_cq *bic;
12817 -+ struct bfq_queue *bfqq, *new_bfqq;
12818 -+
12819 -+ /*
12820 -+ * Disallow merge of a sync bio into an async request.
12821 -+ */
12822 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
12823 -+ return 0;
12824 -+
12825 -+ /*
12826 -+ * Lookup the bfqq that this bio will be queued with. Allow
12827 -+ * merge only if rq is queued there.
12828 -+ * Queue lock is held here.
12829 -+ */
12830 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
12831 -+ if (bic == NULL)
12832 -+ return 0;
12833 -+
12834 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
12835 -+ /*
12836 -+ * We take advantage of this function to perform an early merge
12837 -+ * of the queues of possible cooperating processes.
12838 -+ */
12839 -+ if (bfqq != NULL) {
12840 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
12841 -+ if (new_bfqq != NULL) {
12842 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
12843 -+ /*
12844 -+ * If we get here, the bio will be queued in the shared queue,
12845 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
12846 -+ * rq can be merged.
12847 -+ */
12848 -+ bfqq = new_bfqq;
12849 -+ }
12850 -+ }
12851 -+
12852 -+ return bfqq == RQ_BFQQ(rq);
12853 -+}
12854 -+
12855 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
12856 -+ struct bfq_queue *bfqq)
12857 -+{
12858 -+ if (bfqq != NULL) {
12859 -+ bfq_mark_bfqq_must_alloc(bfqq);
12860 -+ bfq_mark_bfqq_budget_new(bfqq);
12861 -+ bfq_clear_bfqq_fifo_expire(bfqq);
12862 -+
12863 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
12864 -+
12865 -+ bfq_log_bfqq(bfqd, bfqq,
12866 -+ "set_in_service_queue, cur-budget = %lu",
12867 -+ bfqq->entity.budget);
12868 -+ }
12869 -+
12870 -+ bfqd->in_service_queue = bfqq;
12871 -+}
12872 -+
12873 -+/*
12874 -+ * Get and set a new queue for service.
12875 -+ */
12876 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
12877 -+{
12878 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
12879 -+
12880 -+ __bfq_set_in_service_queue(bfqd, bfqq);
12881 -+ return bfqq;
12882 -+}
12883 -+
12884 - /*
12885 - * If enough samples have been computed, return the current max budget
12886 - * stored in bfqd, which is dynamically updated according to the
12887 -@@ -1077,63 +1345,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
12888 - return rq;
12889 - }
12890 -
12891 --/*
12892 -- * Must be called with the queue_lock held.
12893 -- */
12894 --static int bfqq_process_refs(struct bfq_queue *bfqq)
12895 --{
12896 -- int process_refs, io_refs;
12897 --
12898 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
12899 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
12900 -- BUG_ON(process_refs < 0);
12901 -- return process_refs;
12902 --}
12903 --
12904 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
12905 --{
12906 -- int process_refs, new_process_refs;
12907 -- struct bfq_queue *__bfqq;
12908 --
12909 -- /*
12910 -- * If there are no process references on the new_bfqq, then it is
12911 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
12912 -- * may have dropped their last reference (not just their last process
12913 -- * reference).
12914 -- */
12915 -- if (!bfqq_process_refs(new_bfqq))
12916 -- return;
12917 --
12918 -- /* Avoid a circular list and skip interim queue merges. */
12919 -- while ((__bfqq = new_bfqq->new_bfqq)) {
12920 -- if (__bfqq == bfqq)
12921 -- return;
12922 -- new_bfqq = __bfqq;
12923 -- }
12924 --
12925 -- process_refs = bfqq_process_refs(bfqq);
12926 -- new_process_refs = bfqq_process_refs(new_bfqq);
12927 -- /*
12928 -- * If the process for the bfqq has gone away, there is no
12929 -- * sense in merging the queues.
12930 -- */
12931 -- if (process_refs == 0 || new_process_refs == 0)
12932 -- return;
12933 --
12934 -- /*
12935 -- * Merge in the direction of the lesser amount of work.
12936 -- */
12937 -- if (new_process_refs >= process_refs) {
12938 -- bfqq->new_bfqq = new_bfqq;
12939 -- atomic_add(process_refs, &new_bfqq->ref);
12940 -- } else {
12941 -- new_bfqq->new_bfqq = bfqq;
12942 -- atomic_add(new_process_refs, &bfqq->ref);
12943 -- }
12944 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
12945 -- new_bfqq->pid);
12946 --}
12947 --
12948 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
12949 - {
12950 - struct bfq_entity *entity = &bfqq->entity;
12951 -@@ -1703,7 +1914,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
12952 - */
12953 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12954 - {
12955 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
12956 -+ struct bfq_queue *bfqq;
12957 - struct request *next_rq;
12958 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
12959 -
12960 -@@ -1713,17 +1924,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12961 -
12962 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
12963 -
12964 -- /*
12965 -- * If another queue has a request waiting within our mean seek
12966 -- * distance, let it run. The expire code will check for close
12967 -- * cooperators and put the close queue at the front of the
12968 -- * service tree. If possible, merge the expiring queue with the
12969 -- * new bfqq.
12970 -- */
12971 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
12972 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
12973 -- bfq_setup_merge(bfqq, new_bfqq);
12974 --
12975 - if (bfq_may_expire_for_budg_timeout(bfqq) &&
12976 - !timer_pending(&bfqd->idle_slice_timer) &&
12977 - !bfq_bfqq_must_idle(bfqq))
12978 -@@ -1760,36 +1960,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
12979 - bfq_clear_bfqq_wait_request(bfqq);
12980 - del_timer(&bfqd->idle_slice_timer);
12981 - }
12982 -- if (new_bfqq == NULL)
12983 -- goto keep_queue;
12984 -- else
12985 -- goto expire;
12986 -+ goto keep_queue;
12987 - }
12988 - }
12989 -
12990 - /*
12991 -- * No requests pending. If the in-service queue has no cooperator and
12992 -- * still has requests in flight (possibly waiting for a completion)
12993 -- * or is idling for a new request, then keep it.
12994 -+ * No requests pending. If the in-service queue still has requests in
12995 -+ * flight (possibly waiting for a completion) or is idling for a new
12996 -+ * request, then keep it.
12997 - */
12998 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
12999 -- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
13000 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
13001 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
13002 - bfqq = NULL;
13003 - goto keep_queue;
13004 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
13005 -- /*
13006 -- * Expiring the queue because there is a close cooperator,
13007 -- * cancel timer.
13008 -- */
13009 -- bfq_clear_bfqq_wait_request(bfqq);
13010 -- del_timer(&bfqd->idle_slice_timer);
13011 - }
13012 -
13013 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
13014 - expire:
13015 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
13016 - new_queue:
13017 -- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
13018 -+ bfqq = bfq_set_in_service_queue(bfqd);
13019 - bfq_log(bfqd, "select_queue: new queue %d returned",
13020 - bfqq != NULL ? bfqq->pid : 0);
13021 - keep_queue:
13022 -@@ -1799,9 +1989,8 @@ keep_queue:
13023 - static void bfq_update_raising_data(struct bfq_data *bfqd,
13024 - struct bfq_queue *bfqq)
13025 - {
13026 -+ struct bfq_entity *entity = &bfqq->entity;
13027 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
13028 -- struct bfq_entity *entity = &bfqq->entity;
13029 --
13030 - bfq_log_bfqq(bfqd, bfqq,
13031 - "raising period dur %u/%u msec, "
13032 - "old raising coeff %u, w %d(%d)",
13033 -@@ -1818,7 +2007,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13034 - "WARN: pending prio change");
13035 - /*
13036 - * If too much time has elapsed from the beginning
13037 -- * of this weight-raising, stop it.
13038 -+ * of this weight-raising period, stop it.
13039 - */
13040 - if (jiffies - bfqq->last_rais_start_finish >
13041 - bfqq->raising_cur_max_time) {
13042 -@@ -1830,11 +2019,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
13043 - jiffies_to_msecs(bfqq->
13044 - raising_cur_max_time));
13045 - bfq_bfqq_end_raising(bfqq);
13046 -- __bfq_entity_update_weight_prio(
13047 -- bfq_entity_service_tree(entity),
13048 -- entity);
13049 - }
13050 - }
13051 -+ /* Update weight both if it must be raised and if it must be lowered */
13052 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
13053 -+ __bfq_entity_update_weight_prio(
13054 -+ bfq_entity_service_tree(entity),
13055 -+ entity);
13056 - }
13057 -
13058 - /*
13059 -@@ -2075,6 +2266,25 @@ static void bfq_init_icq(struct io_cq *icq)
13060 - struct bfq_io_cq *bic = icq_to_bic(icq);
13061 -
13062 - bic->ttime.last_end_request = jiffies;
13063 -+ /*
13064 -+ * A newly created bic indicates that the process has just
13065 -+ * started doing I/O, and is probably mapping into memory its
13066 -+ * executable and libraries: it definitely needs weight raising.
13067 -+ * There is however the possibility that the process performs,
13068 -+ * for a while, I/O close to some other process. EQM intercepts
13069 -+ * this behavior and may merge the queue corresponding to the
13070 -+ * process with some other queue, BEFORE the weight of the queue
13071 -+ * is raised. Merged queues are not weight-raised (they are assumed
13072 -+ * to belong to processes that benefit only from high throughput).
13073 -+ * If the merge is basically the consequence of an accident, then
13074 -+ * the queue will be split soon and will get back its old weight.
13075 -+ * It is then important to write down somewhere that this queue
13076 -+ * does need weight raising, even if it did not make it to get its
13077 -+ * weight raised before being merged. To this purpose, we overload
13078 -+ * the field raising_time_left and assign 1 to it, to mark the queue
13079 -+ * as needing weight raising.
13080 -+ */
13081 -+ bic->raising_time_left = 1;
13082 - }
13083 -
13084 - static void bfq_exit_icq(struct io_cq *icq)
13085 -@@ -2088,6 +2298,13 @@ static void bfq_exit_icq(struct io_cq *icq)
13086 - }
13087 -
13088 - if (bic->bfqq[BLK_RW_SYNC]) {
13089 -+ /*
13090 -+ * If the bic is using a shared queue, put the reference
13091 -+ * taken on the io_context when the bic started using a
13092 -+ * shared bfq_queue.
13093 -+ */
13094 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
13095 -+ put_io_context(icq->ioc);
13096 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
13097 - bic->bfqq[BLK_RW_SYNC] = NULL;
13098 - }
13099 -@@ -2375,6 +2592,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
13100 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
13101 - return;
13102 -
13103 -+ /* Idle window just restored, statistics are meaningless. */
13104 -+ if (bfq_bfqq_just_split(bfqq))
13105 -+ return;
13106 -+
13107 - enable_idle = bfq_bfqq_idle_window(bfqq);
13108 -
13109 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
13110 -@@ -2415,6 +2636,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13111 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
13112 - !BFQQ_SEEKY(bfqq))
13113 - bfq_update_idle_window(bfqd, bfqq, bic);
13114 -+ bfq_clear_bfqq_just_split(bfqq);
13115 -
13116 - bfq_log_bfqq(bfqd, bfqq,
13117 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
13118 -@@ -2475,13 +2697,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13119 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
13120 - {
13121 - struct bfq_data *bfqd = q->elevator->elevator_data;
13122 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
13123 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
13124 -
13125 - assert_spin_locked(bfqd->queue->queue_lock);
13126 -+
13127 -+ /*
13128 -+ * An unplug may trigger a requeue of a request from the device
13129 -+ * driver: make sure we are in process context while trying to
13130 -+ * merge two bfq_queues.
13131 -+ */
13132 -+ if (!in_interrupt()) {
13133 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
13134 -+ if (new_bfqq != NULL) {
13135 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
13136 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
13137 -+ /*
13138 -+ * Release the request's reference to the old bfqq
13139 -+ * and make sure one is taken to the shared queue.
13140 -+ */
13141 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
13142 -+ bfqq->allocated[rq_data_dir(rq)]--;
13143 -+ atomic_inc(&new_bfqq->ref);
13144 -+ bfq_put_queue(bfqq);
13145 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
13146 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
13147 -+ bfqq, new_bfqq);
13148 -+ rq->elv.priv[1] = new_bfqq;
13149 -+ bfqq = new_bfqq;
13150 -+ }
13151 -+ }
13152 -+
13153 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
13154 -
13155 - bfq_add_rq_rb(rq);
13156 -
13157 -+ /*
13158 -+ * Here a newly-created bfq_queue has already started a weight-raising
13159 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
13160 -+ * from assigning it a full weight-raising period. See the detailed
13161 -+ * comments about this field in bfq_init_icq().
13162 -+ */
13163 -+ if (bfqq->bic != NULL)
13164 -+ bfqq->bic->raising_time_left = 0;
13165 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
13166 - list_add_tail(&rq->queuelist, &bfqq->fifo);
13167 -
13168 -@@ -2629,18 +2886,6 @@ static void bfq_put_request(struct request *rq)
13169 - }
13170 - }
13171 -
13172 --static struct bfq_queue *
13173 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13174 -- struct bfq_queue *bfqq)
13175 --{
13176 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13177 -- (long unsigned)bfqq->new_bfqq->pid);
13178 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
13179 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
13180 -- bfq_put_queue(bfqq);
13181 -- return bic_to_bfqq(bic, 1);
13182 --}
13183 --
13184 - /*
13185 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
13186 - * was the last process referring to said bfqq.
13187 -@@ -2649,6 +2894,9 @@ static struct bfq_queue *
13188 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
13189 - {
13190 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
13191 -+
13192 -+ put_io_context(bic->icq.ioc);
13193 -+
13194 - if (bfqq_process_refs(bfqq) == 1) {
13195 - bfqq->pid = current->pid;
13196 - bfq_clear_bfqq_coop(bfqq);
13197 -@@ -2677,6 +2925,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
13198 - struct bfq_queue *bfqq;
13199 - struct bfq_group *bfqg;
13200 - unsigned long flags;
13201 -+ bool split = false;
13202 -
13203 - might_sleep_if(gfp_mask & __GFP_WAIT);
13204 -
13205 -@@ -2695,24 +2944,14 @@ new_queue:
13206 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
13207 - bic_set_bfqq(bic, bfqq, is_sync);
13208 - } else {
13209 -- /*
13210 -- * If the queue was seeky for too long, break it apart.
13211 -- */
13212 -+ /* If the queue was seeky for too long, break it apart. */
13213 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
13214 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
13215 - bfqq = bfq_split_bfqq(bic, bfqq);
13216 -+ split = true;
13217 - if (!bfqq)
13218 - goto new_queue;
13219 - }
13220 --
13221 -- /*
13222 -- * Check to see if this queue is scheduled to merge with
13223 -- * another closely cooperating queue. The merging of queues
13224 -- * happens here as it must be done in process context.
13225 -- * The reference on new_bfqq was taken in merge_bfqqs.
13226 -- */
13227 -- if (bfqq->new_bfqq != NULL)
13228 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
13229 - }
13230 -
13231 - bfqq->allocated[rw]++;
13232 -@@ -2723,6 +2962,26 @@ new_queue:
13233 - rq->elv.priv[0] = bic;
13234 - rq->elv.priv[1] = bfqq;
13235 -
13236 -+ /*
13237 -+ * If a bfq_queue has only one process reference, it is owned
13238 -+ * by only one bfq_io_cq: we can set the bic field of the
13239 -+ * bfq_queue to the address of that structure. Also, if the
13240 -+ * queue has just been split, mark a flag so that the
13241 -+ * information is available to the other scheduler hooks.
13242 -+ */
13243 -+ if (bfqq_process_refs(bfqq) == 1) {
13244 -+ bfqq->bic = bic;
13245 -+ if (split) {
13246 -+ bfq_mark_bfqq_just_split(bfqq);
13247 -+ /*
13248 -+ * If the queue has just been split from a shared queue,
13249 -+ * restore the idle window and the possible weight
13250 -+ * raising period.
13251 -+ */
13252 -+ bfq_bfqq_resume_state(bfqq, bic);
13253 -+ }
13254 -+ }
13255 -+
13256 - spin_unlock_irqrestore(q->queue_lock, flags);
13257 -
13258 - return 0;
13259 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
13260 -index 30df81c..47e66a8 100644
13261 ---- a/block/bfq-sched.c
13262 -+++ b/block/bfq-sched.c
13263 -@@ -979,34 +979,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
13264 - return bfqq;
13265 - }
13266 -
13267 --/*
13268 -- * Forced extraction of the given queue.
13269 -- */
13270 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
13271 -- struct bfq_queue *bfqq)
13272 --{
13273 -- struct bfq_entity *entity;
13274 -- struct bfq_sched_data *sd;
13275 --
13276 -- BUG_ON(bfqd->in_service_queue != NULL);
13277 --
13278 -- entity = &bfqq->entity;
13279 -- /*
13280 -- * Bubble up extraction/update from the leaf to the root.
13281 -- */
13282 -- for_each_entity(entity) {
13283 -- sd = entity->sched_data;
13284 -- bfq_update_budget(entity);
13285 -- bfq_update_vtime(bfq_entity_service_tree(entity));
13286 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
13287 -- sd->active_entity = entity;
13288 -- sd->next_active = NULL;
13289 -- entity->service = 0;
13290 -- }
13291 --
13292 -- return;
13293 --}
13294 --
13295 - static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
13296 - {
13297 - if (bfqd->in_service_bic != NULL) {
13298 -diff --git a/block/bfq.h b/block/bfq.h
13299 -index 68b28e3..438f560 100644
13300 ---- a/block/bfq.h
13301 -+++ b/block/bfq.h
13302 -@@ -192,6 +192,8 @@ struct bfq_group;
13303 - * idle to backlogged
13304 - * @service_from_backlogged: cumulative service received from the @bfq_queue
13305 - * since the last transition from idle to backlogged
13306 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
13307 -+ * queue is shared
13308 - *
13309 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
13310 - * or more (if it is an async one). @cgroup holds a reference to the
13311 -@@ -235,6 +237,7 @@ struct bfq_queue {
13312 - sector_t last_request_pos;
13313 -
13314 - pid_t pid;
13315 -+ struct bfq_io_cq *bic;
13316 -
13317 - /* weight-raising fields */
13318 - unsigned int raising_cur_max_time;
13319 -@@ -264,12 +267,23 @@ struct bfq_ttime {
13320 - * @icq: associated io_cq structure
13321 - * @bfqq: array of two process queues, the sync and the async
13322 - * @ttime: associated @bfq_ttime struct
13323 -+ * @raising_time_left: snapshot of the time left before weight raising ends
13324 -+ * for the sync queue associated to this process; this
13325 -+ * snapshot is taken to remember this value while the weight
13326 -+ * raising is suspended because the queue is merged with a
13327 -+ * shared queue, and is used to set @raising_cur_max_time
13328 -+ * when the queue is split from the shared queue and its
13329 -+ * weight is raised again
13330 -+ * @saved_idle_window: same purpose as the previous field for the idle window
13331 - */
13332 - struct bfq_io_cq {
13333 - struct io_cq icq; /* must be the first member */
13334 - struct bfq_queue *bfqq[2];
13335 - struct bfq_ttime ttime;
13336 - int ioprio;
13337 -+
13338 -+ unsigned int raising_time_left;
13339 -+ unsigned int saved_idle_window;
13340 - };
13341 -
13342 - /**
13343 -@@ -411,6 +425,7 @@ enum bfqq_state_flags {
13344 - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
13345 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
13346 - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
13347 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
13348 - BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
13349 - };
13350 -
13351 -@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
13352 - BFQ_BFQQ_FNS(budget_new);
13353 - BFQ_BFQQ_FNS(coop);
13354 - BFQ_BFQQ_FNS(split_coop);
13355 -+BFQ_BFQQ_FNS(just_split);
13356 - BFQ_BFQQ_FNS(softrt_update);
13357 - #undef BFQ_BFQQ_FNS
13358 -
13359 ---
13360 -1.8.5.2
13361 -
13362
13363 Deleted: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
13364 ===================================================================
13365 --- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-03-26 23:50:52 UTC (rev 2715)
13366 +++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-03-31 12:03:14 UTC (rev 2716)
13367 @@ -1,1034 +0,0 @@
13368 -From 39b1dba58b2562ba0d93a33a4f9af662d3c790c5 Mon Sep 17 00:00:00 2001
13369 -From: Mauro Andreolini <mauro.andreolini@×××××××.it>
13370 -Date: Thu, 23 Jan 2014 16:54:44 +0100
13371 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r1 for
13372 - 3.13.0
13373 -
13374 -A set of processes may happen to perform interleaved reads, i.e., requests
13375 -whose union would give rise to a sequential read pattern. There are two
13376 -typical cases: in the first case, processes read fixed-size chunks of
13377 -data at a fixed distance from each other, while in the second case processes
13378 -may read variable-size chunks at variable distances. The latter case occurs
13379 -for example with KVM, which splits the I/O generated by the guest into
13380 -multiple chunks, and lets these chunks be served by a pool of cooperating
13381 -processes, iteratively assigning the next chunk of I/O to the first
13382 -available process. CFQ uses actual queue merging for the first type of
13383 -processes, whereas it uses preemption to get a sequential read pattern out
13384 -of the read requests performed by the second type of processes. In the end
13385 -it uses two different mechanisms to achieve the same goal: boosting the
13386 -throughput with interleaved I/O.
13387 -
13388 -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a
13389 -sequential read pattern with both types of processes. The main idea is
13390 -checking newly arrived requests against the next request of the active queue
13391 -both in case of actual request insert and in case of request merge. By doing
13392 -so, both the types of processes can be handled by just merging their queues.
13393 -EQM is then simpler and more compact than the pair of mechanisms used in
13394 -CFQ.
13395 -
13396 -Finally, EQM also preserves the typical low-latency properties of BFQ, by
13397 -properly restoring the weight-raising state of a queue when it gets back to
13398 -a non-merged state.
13399 -
13400 -Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it>
13401 -Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com>
13402 -Reviewed-by: Paolo Valente <paolo.valente@×××××××.it>
13403 ----
13404 - block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++----------------
13405 - block/bfq-sched.c | 28 ---
13406 - block/bfq.h | 16 ++
13407 - 3 files changed, 474 insertions(+), 227 deletions(-)
13408 -
13409 -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
13410 -index eb760de..06ee844 100644
13411 ---- a/block/bfq-iosched.c
13412 -+++ b/block/bfq-iosched.c
13413 -@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd)
13414 - return dur;
13415 - }
13416 -
13417 -+static inline void
13418 -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
13419 -+{
13420 -+ if (bic->saved_idle_window)
13421 -+ bfq_mark_bfqq_idle_window(bfqq);
13422 -+ else
13423 -+ bfq_clear_bfqq_idle_window(bfqq);
13424 -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) {
13425 -+ /*
13426 -+ * Start a weight raising period with the duration given by
13427 -+ * the raising_time_left snapshot.
13428 -+ */
13429 -+ if (bfq_bfqq_busy(bfqq))
13430 -+ bfqq->bfqd->raised_busy_queues++;
13431 -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff;
13432 -+ bfqq->raising_cur_max_time = bic->raising_time_left;
13433 -+ bfqq->last_rais_start_finish = jiffies;
13434 -+ bfqq->entity.ioprio_changed = 1;
13435 -+ }
13436 -+ /*
13437 -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from
13438 -+ * getting confused about the queue's need of a weight-raising
13439 -+ * period.
13440 -+ */
13441 -+ bic->raising_time_left = 0;
13442 -+}
13443 -+
13444 -+/*
13445 -+ * Must be called with the queue_lock held.
13446 -+ */
13447 -+static int bfqq_process_refs(struct bfq_queue *bfqq)
13448 -+{
13449 -+ int process_refs, io_refs;
13450 -+
13451 -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13452 -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13453 -+ BUG_ON(process_refs < 0);
13454 -+ return process_refs;
13455 -+}
13456 -+
13457 - static void bfq_add_rq_rb(struct request *rq)
13458 - {
13459 - struct bfq_queue *bfqq = RQ_BFQQ(rq);
13460 -@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq)
13461 - if (!bfqd->low_latency)
13462 - goto add_bfqq_busy;
13463 -
13464 -+ if (bfq_bfqq_just_split(bfqq))
13465 -+ goto set_ioprio_changed;
13466 -+
13467 - /*
13468 -- * If the queue is not being boosted and has been idle
13469 -- * for enough time, start a weight-raising period
13470 -+ * If the queue:
13471 -+ * - is not being boosted,
13472 -+ * - has been idle for enough time,
13473 -+ * - is not a sync queue or is linked to a bfq_io_cq (it is
13474 -+ * shared "for its nature" or it is not shared and its
13475 -+ * requests have not been redirected to a shared queue)
13476 -+ * start a weight-raising period.
13477 - */
13478 -- if (old_raising_coeff == 1 &&
13479 -- (idle_for_long_time || soft_rt)) {
13480 -+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) &&
13481 -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
13482 - bfqq->raising_coeff = bfqd->bfq_raising_coeff;
13483 - if (idle_for_long_time)
13484 - bfqq->raising_cur_max_time =
13485 -@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq)
13486 - bfqd->bfq_raising_rt_max_time;
13487 - }
13488 - }
13489 -+set_ioprio_changed:
13490 - if (old_raising_coeff != bfqq->raising_coeff)
13491 - entity->ioprio_changed = 1;
13492 - add_bfqq_busy:
13493 -@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd)
13494 - spin_unlock_irq(bfqd->queue->queue_lock);
13495 - }
13496 -
13497 --static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13498 -- struct bio *bio)
13499 --{
13500 -- struct bfq_data *bfqd = q->elevator->elevator_data;
13501 -- struct bfq_io_cq *bic;
13502 -- struct bfq_queue *bfqq;
13503 --
13504 -- /*
13505 -- * Disallow merge of a sync bio into an async request.
13506 -- */
13507 -- if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13508 -- return 0;
13509 --
13510 -- /*
13511 -- * Lookup the bfqq that this bio will be queued with. Allow
13512 -- * merge only if rq is queued there.
13513 -- * Queue lock is held here.
13514 -- */
13515 -- bic = bfq_bic_lookup(bfqd, current->io_context);
13516 -- if (bic == NULL)
13517 -- return 0;
13518 --
13519 -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13520 -- return bfqq == RQ_BFQQ(rq);
13521 --}
13522 --
13523 --static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13524 -- struct bfq_queue *bfqq)
13525 --{
13526 -- if (bfqq != NULL) {
13527 -- bfq_mark_bfqq_must_alloc(bfqq);
13528 -- bfq_mark_bfqq_budget_new(bfqq);
13529 -- bfq_clear_bfqq_fifo_expire(bfqq);
13530 --
13531 -- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13532 --
13533 -- bfq_log_bfqq(bfqd, bfqq,
13534 -- "set_in_service_queue, cur-budget = %lu",
13535 -- bfqq->entity.budget);
13536 -- }
13537 --
13538 -- bfqd->in_service_queue = bfqq;
13539 --}
13540 --
13541 --/*
13542 -- * Get and set a new queue for service.
13543 -- */
13544 --static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd,
13545 -- struct bfq_queue *bfqq)
13546 -+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
13547 - {
13548 -- if (!bfqq)
13549 -- bfqq = bfq_get_next_queue(bfqd);
13550 -+ if (request)
13551 -+ return blk_rq_pos(io_struct);
13552 - else
13553 -- bfq_get_next_queue_forced(bfqd, bfqq);
13554 --
13555 -- __bfq_set_in_service_queue(bfqd, bfqq);
13556 -- return bfqq;
13557 -+ return ((struct bio *)io_struct)->bi_sector;
13558 - }
13559 -
13560 --static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd,
13561 -- struct request *rq)
13562 -+static inline sector_t bfq_dist_from(sector_t pos1,
13563 -+ sector_t pos2)
13564 - {
13565 -- if (blk_rq_pos(rq) >= bfqd->last_position)
13566 -- return blk_rq_pos(rq) - bfqd->last_position;
13567 -+ if (pos1 >= pos2)
13568 -+ return pos1 - pos2;
13569 - else
13570 -- return bfqd->last_position - blk_rq_pos(rq);
13571 -+ return pos2 - pos1;
13572 - }
13573 -
13574 --/*
13575 -- * Return true if bfqq has no request pending and rq is close enough to
13576 -- * bfqd->last_position, or if rq is closer to bfqd->last_position than
13577 -- * bfqq->next_rq
13578 -- */
13579 --static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq)
13580 -+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
13581 -+ sector_t sector)
13582 - {
13583 -- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR;
13584 -+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
13585 -+ BFQQ_SEEK_THR;
13586 - }
13587 -
13588 --static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13589 -+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
13590 - {
13591 - struct rb_root *root = &bfqd->rq_pos_tree;
13592 - struct rb_node *parent, *node;
13593 - struct bfq_queue *__bfqq;
13594 -- sector_t sector = bfqd->last_position;
13595 -
13596 - if (RB_EMPTY_ROOT(root))
13597 - return NULL;
13598 -@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13599 - * position).
13600 - */
13601 - __bfqq = rb_entry(parent, struct bfq_queue, pos_node);
13602 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13603 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13604 - return __bfqq;
13605 -
13606 - if (blk_rq_pos(__bfqq->next_rq) < sector)
13607 -@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13608 - return NULL;
13609 -
13610 - __bfqq = rb_entry(node, struct bfq_queue, pos_node);
13611 -- if (bfq_rq_close(bfqd, __bfqq->next_rq))
13612 -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
13613 - return __bfqq;
13614 -
13615 - return NULL;
13616 -@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd)
13617 - /*
13618 - * bfqd - obvious
13619 - * cur_bfqq - passed in so that we don't decide that the current queue
13620 -- * is closely cooperating with itself.
13621 -- *
13622 -- * We are assuming that cur_bfqq has dispatched at least one request,
13623 -- * and that bfqd->last_position reflects a position on the disk associated
13624 -- * with the I/O issued by cur_bfqq.
13625 -+ * is closely cooperating with itself
13626 -+ * sector - used as a reference point to search for a close queue
13627 - */
13628 - static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13629 -- struct bfq_queue *cur_bfqq)
13630 -+ struct bfq_queue *cur_bfqq,
13631 -+ sector_t sector)
13632 - {
13633 - struct bfq_queue *bfqq;
13634 -
13635 -@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13636 - * working closely on the same area of the disk. In that case,
13637 - * we can group them together and don't waste time idling.
13638 - */
13639 -- bfqq = bfqq_close(bfqd);
13640 -+ bfqq = bfqq_close(bfqd, sector);
13641 - if (bfqq == NULL || bfqq == cur_bfqq)
13642 - return NULL;
13643 -
13644 -@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
13645 - return bfqq;
13646 - }
13647 -
13648 -+static struct bfq_queue *
13649 -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13650 -+{
13651 -+ int process_refs, new_process_refs;
13652 -+ struct bfq_queue *__bfqq;
13653 -+
13654 -+ /*
13655 -+ * If there are no process references on the new_bfqq, then it is
13656 -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13657 -+ * may have dropped their last reference (not just their last process
13658 -+ * reference).
13659 -+ */
13660 -+ if (!bfqq_process_refs(new_bfqq))
13661 -+ return NULL;
13662 -+
13663 -+ /* Avoid a circular list and skip interim queue merges. */
13664 -+ while ((__bfqq = new_bfqq->new_bfqq)) {
13665 -+ if (__bfqq == bfqq)
13666 -+ return NULL;
13667 -+ new_bfqq = __bfqq;
13668 -+ }
13669 -+
13670 -+ process_refs = bfqq_process_refs(bfqq);
13671 -+ new_process_refs = bfqq_process_refs(new_bfqq);
13672 -+ /*
13673 -+ * If the process for the bfqq has gone away, there is no
13674 -+ * sense in merging the queues.
13675 -+ */
13676 -+ if (process_refs == 0 || new_process_refs == 0)
13677 -+ return NULL;
13678 -+
13679 -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13680 -+ new_bfqq->pid);
13681 -+
13682 -+ /*
13683 -+ * Merging is just a redirection: the requests of the process owning
13684 -+ * one of the two queues are redirected to the other queue. The latter
13685 -+ * queue, in its turn, is set as shared if this is the first time that
13686 -+ * the requests of some process are redirected to it.
13687 -+ *
13688 -+ * We redirect bfqq to new_bfqq and not the opposite, because we
13689 -+ * are in the context of the process owning bfqq, hence we have the
13690 -+ * io_cq of this process. So we can immediately configure this io_cq
13691 -+ * to redirect the requests of the process to new_bfqq.
13692 -+ *
13693 -+ * NOTE, even if new_bfqq coincides with the in-service queue, the
13694 -+ * io_cq of new_bfqq is not available, because, if the in-service queue
13695 -+ * is shared, bfqd->in_service_bic may not point to the io_cq of the
13696 -+ * in-service queue.
13697 -+ * Redirecting the requests of the process owning bfqq to the currently
13698 -+ * in-service queue is in any case the best option, as we feed the
13699 -+ * in-service queue with new requests close to the last request served
13700 -+ * and, by doing so, hopefully increase the throughput.
13701 -+ */
13702 -+ bfqq->new_bfqq = new_bfqq;
13703 -+ atomic_add(process_refs, &new_bfqq->ref);
13704 -+ return new_bfqq;
13705 -+}
13706 -+
13707 -+/*
13708 -+ * Attempt to schedule a merge of bfqq with the currently in-service queue or
13709 -+ * with a close queue among the scheduled queues.
13710 -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
13711 -+ * structure otherwise.
13712 -+ */
13713 -+static struct bfq_queue *
13714 -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
13715 -+ void *io_struct, bool request)
13716 -+{
13717 -+ struct bfq_queue *in_service_bfqq, *new_bfqq;
13718 -+
13719 -+ if (bfqq->new_bfqq)
13720 -+ return bfqq->new_bfqq;
13721 -+
13722 -+ if (!io_struct)
13723 -+ return NULL;
13724 -+
13725 -+ in_service_bfqq = bfqd->in_service_queue;
13726 -+
13727 -+ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
13728 -+ !bfqd->in_service_bic)
13729 -+ goto check_scheduled;
13730 -+
13731 -+ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
13732 -+ goto check_scheduled;
13733 -+
13734 -+ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
13735 -+ goto check_scheduled;
13736 -+
13737 -+ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
13738 -+ goto check_scheduled;
13739 -+
13740 -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
13741 -+ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
13742 -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
13743 -+ if (new_bfqq != NULL)
13744 -+ return new_bfqq; /* Merge with the in-service queue */
13745 -+ }
13746 -+
13747 -+ /*
13748 -+ * Check whether there is a cooperator among currently scheduled
13749 -+ * queues. The only thing we need is that the bio/request is not
13750 -+ * NULL, as we need it to establish whether a cooperator exists.
13751 -+ */
13752 -+check_scheduled:
13753 -+ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
13754 -+ bfq_io_struct_pos(io_struct, request));
13755 -+ if (new_bfqq)
13756 -+ return bfq_setup_merge(bfqq, new_bfqq);
13757 -+
13758 -+ return NULL;
13759 -+}
13760 -+
13761 -+static inline void
13762 -+bfq_bfqq_save_state(struct bfq_queue *bfqq)
13763 -+{
13764 -+ /*
13765 -+ * If bfqq->bic == NULL, the queue is already shared or its requests
13766 -+ * have already been redirected to a shared queue; both idle window
13767 -+ * and weight raising state have already been saved. Do nothing.
13768 -+ */
13769 -+ if (bfqq->bic == NULL)
13770 -+ return;
13771 -+ if (bfqq->bic->raising_time_left)
13772 -+ /*
13773 -+ * This is the queue of a just-started process, and would
13774 -+ * deserve weight raising: we set raising_time_left to the full
13775 -+ * weight-raising duration to trigger weight-raising when and
13776 -+ * if the queue is split and the first request of the queue
13777 -+ * is enqueued.
13778 -+ */
13779 -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
13780 -+ else if (bfqq->raising_coeff > 1) {
13781 -+ unsigned long wrais_duration =
13782 -+ jiffies - bfqq->last_rais_start_finish;
13783 -+ /*
13784 -+ * It may happen that a queue's weight raising period lasts
13785 -+ * longer than its raising_cur_max_time, as weight raising is
13786 -+ * handled only when a request is enqueued or dispatched (it
13787 -+ * does not use any timer). If the weight raising period is
13788 -+ * about to end, don't save it.
13789 -+ */
13790 -+ if (bfqq->raising_cur_max_time <= wrais_duration)
13791 -+ bfqq->bic->raising_time_left = 0;
13792 -+ else
13793 -+ bfqq->bic->raising_time_left =
13794 -+ bfqq->raising_cur_max_time - wrais_duration;
13795 -+ /*
13796 -+ * The bfq_queue is becoming shared or the requests of the
13797 -+ * process owning the queue are being redirected to a shared
13798 -+ * queue. Stop the weight raising period of the queue, as in
13799 -+ * both cases it should not be owned by an interactive or soft
13800 -+ * real-time application.
13801 -+ */
13802 -+ bfq_bfqq_end_raising(bfqq);
13803 -+ } else
13804 -+ bfqq->bic->raising_time_left = 0;
13805 -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
13806 -+}
13807 -+
13808 -+static inline void
13809 -+bfq_get_bic_reference(struct bfq_queue *bfqq)
13810 -+{
13811 -+ /*
13812 -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs
13813 -+ * is about to begin using a shared bfq_queue.
13814 -+ */
13815 -+ if (bfqq->bic)
13816 -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
13817 -+}
13818 -+
13819 -+static void
13820 -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
13821 -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13822 -+{
13823 -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
13824 -+ (long unsigned)new_bfqq->pid);
13825 -+ /* Save weight raising and idle window of the merged queues */
13826 -+ bfq_bfqq_save_state(bfqq);
13827 -+ bfq_bfqq_save_state(new_bfqq);
13828 -+ /*
13829 -+ * Grab a reference to the bic, to prevent it from being destroyed
13830 -+ * before being possibly touched by a bfq_split_bfqq().
13831 -+ */
13832 -+ bfq_get_bic_reference(bfqq);
13833 -+ bfq_get_bic_reference(new_bfqq);
13834 -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
13835 -+ bic_set_bfqq(bic, new_bfqq, 1);
13836 -+ bfq_mark_bfqq_coop(new_bfqq);
13837 -+ /*
13838 -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set
13839 -+ * new_bfqq->bic to NULL. bfqq either:
13840 -+ * - does not belong to any bic any more, and hence bfqq->bic must
13841 -+ * be set to NULL, or
13842 -+ * - is a queue whose owning bics have already been redirected to a
13843 -+ * different queue, hence the queue is destined to not belong to any
13844 -+ * bic soon and bfqq->bic is already NULL (therefore the next
13845 -+ * assignment causes no harm).
13846 -+ */
13847 -+ new_bfqq->bic = NULL;
13848 -+ bfqq->bic = NULL;
13849 -+ bfq_put_queue(bfqq);
13850 -+}
13851 -+
13852 -+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
13853 -+ struct bio *bio)
13854 -+{
13855 -+ struct bfq_data *bfqd = q->elevator->elevator_data;
13856 -+ struct bfq_io_cq *bic;
13857 -+ struct bfq_queue *bfqq, *new_bfqq;
13858 -+
13859 -+ /*
13860 -+ * Disallow merge of a sync bio into an async request.
13861 -+ */
13862 -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
13863 -+ return 0;
13864 -+
13865 -+ /*
13866 -+ * Lookup the bfqq that this bio will be queued with. Allow
13867 -+ * merge only if rq is queued there.
13868 -+ * Queue lock is held here.
13869 -+ */
13870 -+ bic = bfq_bic_lookup(bfqd, current->io_context);
13871 -+ if (bic == NULL)
13872 -+ return 0;
13873 -+
13874 -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
13875 -+ /*
13876 -+ * We take advantage of this function to perform an early merge
13877 -+ * of the queues of possible cooperating processes.
13878 -+ */
13879 -+ if (bfqq != NULL) {
13880 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
13881 -+ if (new_bfqq != NULL) {
13882 -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
13883 -+ /*
13884 -+ * If we get here, the bio will be queued in the shared queue,
13885 -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
13886 -+ * rq can be merged.
13887 -+ */
13888 -+ bfqq = new_bfqq;
13889 -+ }
13890 -+ }
13891 -+
13892 -+ return bfqq == RQ_BFQQ(rq);
13893 -+}
13894 -+
13895 -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
13896 -+ struct bfq_queue *bfqq)
13897 -+{
13898 -+ if (bfqq != NULL) {
13899 -+ bfq_mark_bfqq_must_alloc(bfqq);
13900 -+ bfq_mark_bfqq_budget_new(bfqq);
13901 -+ bfq_clear_bfqq_fifo_expire(bfqq);
13902 -+
13903 -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
13904 -+
13905 -+ bfq_log_bfqq(bfqd, bfqq,
13906 -+ "set_in_service_queue, cur-budget = %lu",
13907 -+ bfqq->entity.budget);
13908 -+ }
13909 -+
13910 -+ bfqd->in_service_queue = bfqq;
13911 -+}
13912 -+
13913 -+/*
13914 -+ * Get and set a new queue for service.
13915 -+ */
13916 -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
13917 -+{
13918 -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
13919 -+
13920 -+ __bfq_set_in_service_queue(bfqd, bfqq);
13921 -+ return bfqq;
13922 -+}
13923 -+
13924 - /*
13925 - * If enough samples have been computed, return the current max budget
13926 - * stored in bfqd, which is dynamically updated according to the
13927 -@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
13928 - return rq;
13929 - }
13930 -
13931 --/*
13932 -- * Must be called with the queue_lock held.
13933 -- */
13934 --static int bfqq_process_refs(struct bfq_queue *bfqq)
13935 --{
13936 -- int process_refs, io_refs;
13937 --
13938 -- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
13939 -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
13940 -- BUG_ON(process_refs < 0);
13941 -- return process_refs;
13942 --}
13943 --
13944 --static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
13945 --{
13946 -- int process_refs, new_process_refs;
13947 -- struct bfq_queue *__bfqq;
13948 --
13949 -- /*
13950 -- * If there are no process references on the new_bfqq, then it is
13951 -- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
13952 -- * may have dropped their last reference (not just their last process
13953 -- * reference).
13954 -- */
13955 -- if (!bfqq_process_refs(new_bfqq))
13956 -- return;
13957 --
13958 -- /* Avoid a circular list and skip interim queue merges. */
13959 -- while ((__bfqq = new_bfqq->new_bfqq)) {
13960 -- if (__bfqq == bfqq)
13961 -- return;
13962 -- new_bfqq = __bfqq;
13963 -- }
13964 --
13965 -- process_refs = bfqq_process_refs(bfqq);
13966 -- new_process_refs = bfqq_process_refs(new_bfqq);
13967 -- /*
13968 -- * If the process for the bfqq has gone away, there is no
13969 -- * sense in merging the queues.
13970 -- */
13971 -- if (process_refs == 0 || new_process_refs == 0)
13972 -- return;
13973 --
13974 -- /*
13975 -- * Merge in the direction of the lesser amount of work.
13976 -- */
13977 -- if (new_process_refs >= process_refs) {
13978 -- bfqq->new_bfqq = new_bfqq;
13979 -- atomic_add(process_refs, &new_bfqq->ref);
13980 -- } else {
13981 -- new_bfqq->new_bfqq = bfqq;
13982 -- atomic_add(new_process_refs, &bfqq->ref);
13983 -- }
13984 -- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
13985 -- new_bfqq->pid);
13986 --}
13987 --
13988 - static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
13989 - {
13990 - struct bfq_entity *entity = &bfqq->entity;
13991 -@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
13992 - */
13993 - static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
13994 - {
13995 -- struct bfq_queue *bfqq, *new_bfqq = NULL;
13996 -+ struct bfq_queue *bfqq;
13997 - struct request *next_rq;
13998 - enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
13999 -
14000 -@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14001 -
14002 - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
14003 -
14004 -- /*
14005 -- * If another queue has a request waiting within our mean seek
14006 -- * distance, let it run. The expire code will check for close
14007 -- * cooperators and put the close queue at the front of the
14008 -- * service tree. If possible, merge the expiring queue with the
14009 -- * new bfqq.
14010 -- */
14011 -- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
14012 -- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
14013 -- bfq_setup_merge(bfqq, new_bfqq);
14014 --
14015 - if (bfq_may_expire_for_budg_timeout(bfqq) &&
14016 - !timer_pending(&bfqd->idle_slice_timer) &&
14017 - !bfq_bfqq_must_idle(bfqq))
14018 -@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
14019 - bfq_clear_bfqq_wait_request(bfqq);
14020 - del_timer(&bfqd->idle_slice_timer);
14021 - }
14022 -- if (new_bfqq == NULL)
14023 -- goto keep_queue;
14024 -- else
14025 -- goto expire;
14026 -+ goto keep_queue;
14027 - }
14028 - }
14029 -
14030 - /*
14031 -- * No requests pending. If the in-service queue has no cooperator and
14032 -- * still has requests in flight (possibly waiting for a completion)
14033 -- * or is idling for a new request, then keep it.
14034 -+ * No requests pending. If the in-service queue still has requests in
14035 -+ * flight (possibly waiting for a completion) or is idling for a new
14036 -+ * request, then keep it.
14037 - */
14038 -- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
14039 -- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
14040 -+ if (timer_pending(&bfqd->idle_slice_timer) ||
14041 -+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
14042 - bfqq = NULL;
14043 - goto keep_queue;
14044 -- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
14045 -- /*
14046 -- * Expiring the queue because there is a close cooperator,
14047 -- * cancel timer.
14048 -- */
14049 -- bfq_clear_bfqq_wait_request(bfqq);
14050 -- del_timer(&bfqd->idle_slice_timer);
14051 - }
14052 -
14053 - reason = BFQ_BFQQ_NO_MORE_REQUESTS;
14054 - expire:
14055 - bfq_bfqq_expire(bfqd, bfqq, 0, reason);
14056 - new_queue:
14057 -- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
14058 -+ bfqq = bfq_set_in_service_queue(bfqd);
14059 - bfq_log(bfqd, "select_queue: new queue %d returned",
14060 - bfqq != NULL ? bfqq->pid : 0);
14061 - keep_queue:
14062 -@@ -1825,9 +2015,8 @@ keep_queue:
14063 - static void bfq_update_raising_data(struct bfq_data *bfqd,
14064 - struct bfq_queue *bfqq)
14065 - {
14066 -+ struct bfq_entity *entity = &bfqq->entity;
14067 - if (bfqq->raising_coeff > 1) { /* queue is being boosted */
14068 -- struct bfq_entity *entity = &bfqq->entity;
14069 --
14070 - bfq_log_bfqq(bfqd, bfqq,
14071 - "raising period dur %u/%u msec, "
14072 - "old raising coeff %u, w %d(%d)",
14073 -@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
14074 - "WARN: pending prio change");
14075 - /*
14076 - * If too much time has elapsed from the beginning
14077 -- * of this weight-raising, stop it.
14078 -+ * of this weight-raising period, stop it.
14079 - */
14080 - if (time_is_before_jiffies(bfqq->last_rais_start_finish +
14081 - bfqq->raising_cur_max_time)) {
14082 -@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
14083 - jiffies_to_msecs(bfqq->
14084 - raising_cur_max_time));
14085 - bfq_bfqq_end_raising(bfqq);
14086 -- __bfq_entity_update_weight_prio(
14087 -- bfq_entity_service_tree(entity),
14088 -- entity);
14089 - }
14090 - }
14091 -+ /* Update weight both if it must be raised and if it must be lowered */
14092 -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
14093 -+ __bfq_entity_update_weight_prio(
14094 -+ bfq_entity_service_tree(entity),
14095 -+ entity);
14096 - }
14097 -
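The new condition above updates the entity weight whenever the current weight and the raising coefficient disagree, i.e. both when the weight still has to be raised and when it has to be lowered back. A tiny sketch of that comparison, with hypothetical plain-integer parameters, is:

    #include <stdbool.h>

    /* Hypothetical helper mirroring the check above: the two booleans
     * differing means the stored weight is out of sync with the
     * raising state, so an update is needed in either direction. */
    static bool weight_needs_update(int weight, int orig_weight, int raising_coeff)
    {
    	bool currently_raised = weight > orig_weight;
    	bool should_be_raised = raising_coeff > 1;

    	return currently_raised != should_be_raised;
    }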
14098 - /*
14099 -@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq)
14100 - struct bfq_io_cq *bic = icq_to_bic(icq);
14101 -
14102 - bic->ttime.last_end_request = jiffies;
14103 -+ /*
14104 -+ * A newly created bic indicates that the process has just
14105 -+ * started doing I/O, and is probably mapping into memory its
14106 -+ * executable and libraries: it definitely needs weight raising.
14107 -+ * There is however the possibility that the process performs,
14108 -+ * for a while, I/O close to some other process. EQM intercepts
14109 -+ * this behavior and may merge the queue corresponding to the
14110 -+ * process with some other queue, BEFORE the weight of the queue
14111 -+ * is raised. Merged queues are not weight-raised (they are assumed
14112 -+ * to belong to processes that benefit only from high throughput).
14113 -+ * If the merge is basically the consequence of an accident, then
14114 -+ * the queue will be split soon and will get back its old weight.
14115 -+	 * It is then important to record somewhere that this queue
14116 -+	 * does need weight raising, even if it never managed to get its
14117 -+	 * weight raised before being merged. To this end, we overload
14118 -+ * the field raising_time_left and assign 1 to it, to mark the queue
14119 -+ * as needing weight raising.
14120 -+ */
14121 -+ bic->raising_time_left = 1;
14122 - }

14123 -
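The comment above explains that a freshly created bfq_io_cq is tagged as deserving weight raising by storing the sentinel value 1 in raising_time_left. A minimal sketch of that marker idea, with a hypothetical stripped-down structure, might be:

    /* Hypothetical sketch of the sentinel used above: a nonzero
     * raising_time_left on a brand-new io context records that the
     * queue should (re)gain weight raising once it is split again. */
    struct toy_bic {
    	unsigned int raising_time_left;	/* 0 = none, >0 = pending or saved */
    };

    static void toy_init_bic(struct toy_bic *bic)
    {
    	bic->raising_time_left = 1;	/* mark as needing weight raising */
    }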
14124 - static void bfq_exit_icq(struct io_cq *icq)
14125 -@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq)
14126 - }
14127 -
14128 - if (bic->bfqq[BLK_RW_SYNC]) {
14129 -+ /*
14130 -+ * If the bic is using a shared queue, put the reference
14131 -+ * taken on the io_context when the bic started using a
14132 -+ * shared bfq_queue.
14133 -+ */
14134 -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
14135 -+ put_io_context(icq->ioc);
14136 - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
14137 - bic->bfqq[BLK_RW_SYNC] = NULL;
14138 - }
14139 -@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
14140 - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
14141 - return;
14142 -
14143 -+ /* Idle window just restored, statistics are meaningless. */
14144 -+ if (bfq_bfqq_just_split(bfqq))
14145 -+ return;
14146 -+
14147 - enable_idle = bfq_bfqq_idle_window(bfqq);
14148 -
14149 - if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
14150 -@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14151 - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
14152 - !BFQQ_SEEKY(bfqq))
14153 - bfq_update_idle_window(bfqd, bfqq, bic);
14154 -+ bfq_clear_bfqq_just_split(bfqq);
14155 -
14156 - bfq_log_bfqq(bfqd, bfqq,
14157 - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
14158 -@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
14159 - static void bfq_insert_request(struct request_queue *q, struct request *rq)
14160 - {
14161 - struct bfq_data *bfqd = q->elevator->elevator_data;
14162 -- struct bfq_queue *bfqq = RQ_BFQQ(rq);
14163 -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
14164 -
14165 - assert_spin_locked(bfqd->queue->queue_lock);
14166 -+
14167 -+ /*
14168 -+ * An unplug may trigger a requeue of a request from the device
14169 -+ * driver: make sure we are in process context while trying to
14170 -+ * merge two bfq_queues.
14171 -+ */
14172 -+ if (!in_interrupt()) {
14173 -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
14174 -+ if (new_bfqq != NULL) {
14175 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
14176 -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
14177 -+ /*
14178 -+ * Release the request's reference to the old bfqq
14179 -+ * and make sure one is taken to the shared queue.
14180 -+ */
14181 -+ new_bfqq->allocated[rq_data_dir(rq)]++;
14182 -+ bfqq->allocated[rq_data_dir(rq)]--;
14183 -+ atomic_inc(&new_bfqq->ref);
14184 -+ bfq_put_queue(bfqq);
14185 -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
14186 -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
14187 -+ bfqq, new_bfqq);
14188 -+ rq->elv.priv[1] = new_bfqq;
14189 -+ bfqq = new_bfqq;
14190 -+ }
14191 -+ }
14192 -+
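The inserted block above re-homes the request from its old queue to the shared queue, keeping the per-queue allocation counters and reference counts balanced before the old queue's reference is dropped. A sketch of that hand-over pattern, with hypothetical simplified types, could be:

    /* Hypothetical sketch of the hand-over performed above: charge the
     * request to the new queue and pin it before releasing the old
     * queue, so neither queue is freed while the request still uses it. */
    struct toy_queue {
    	int allocated;		/* requests currently charged to this queue */
    	int refcount;
    };

    static struct toy_queue *toy_move_request(struct toy_queue *old_q,
    					      struct toy_queue *new_q)
    {
    	new_q->allocated++;
    	new_q->refcount++;	/* the request now pins the shared queue */
    	old_q->allocated--;
    	old_q->refcount--;	/* ...and no longer pins the old one */
    	return new_q;		/* caller records the new owner */
    }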
14193 - bfq_init_prio_data(bfqq, RQ_BIC(rq));
14194 -
14195 - bfq_add_rq_rb(rq);
14196 -
14197 -+ /*
14198 -+ * Here a newly-created bfq_queue has already started a weight-raising
14199 -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
14200 -+ * from assigning it a full weight-raising period. See the detailed
14201 -+ * comments about this field in bfq_init_icq().
14202 -+ */
14203 -+ if (bfqq->bic != NULL)
14204 -+ bfqq->bic->raising_time_left = 0;
14205 - rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
14206 - list_add_tail(&rq->queuelist, &bfqq->fifo);
14207 -
14208 -@@ -2659,18 +2916,6 @@ static void bfq_put_request(struct request *rq)
14209 - }
14210 - }
14211 -
14212 --static struct bfq_queue *
14213 --bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
14214 -- struct bfq_queue *bfqq)
14215 --{
14216 -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
14217 -- (long unsigned)bfqq->new_bfqq->pid);
14218 -- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
14219 -- bfq_mark_bfqq_coop(bfqq->new_bfqq);
14220 -- bfq_put_queue(bfqq);
14221 -- return bic_to_bfqq(bic, 1);
14222 --}
14223 --
14224 - /*
14225 - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
14226 - * was the last process referring to said bfqq.
14227 -@@ -2679,6 +2924,9 @@ static struct bfq_queue *
14228 - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
14229 - {
14230 - bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
14231 -+
14232 -+ put_io_context(bic->icq.ioc);
14233 -+
14234 - if (bfqq_process_refs(bfqq) == 1) {
14235 - bfqq->pid = current->pid;
14236 - bfq_clear_bfqq_coop(bfqq);
14237 -@@ -2707,6 +2955,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
14238 - struct bfq_queue *bfqq;
14239 - struct bfq_group *bfqg;
14240 - unsigned long flags;
14241 -+ bool split = false;
14242 -
14243 - might_sleep_if(gfp_mask & __GFP_WAIT);
14244 -
14245 -@@ -2725,24 +2974,14 @@ new_queue:
14246 - bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
14247 - bic_set_bfqq(bic, bfqq, is_sync);
14248 - } else {
14249 -- /*
14250 -- * If the queue was seeky for too long, break it apart.
14251 -- */
14252 -+ /* If the queue was seeky for too long, break it apart. */
14253 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
14254 - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
14255 - bfqq = bfq_split_bfqq(bic, bfqq);
14256 -+ split = true;
14257 - if (!bfqq)
14258 - goto new_queue;
14259 - }
14260 --
14261 -- /*
14262 -- * Check to see if this queue is scheduled to merge with
14263 -- * another closely cooperating queue. The merging of queues
14264 -- * happens here as it must be done in process context.
14265 -- * The reference on new_bfqq was taken in merge_bfqqs.
14266 -- */
14267 -- if (bfqq->new_bfqq != NULL)
14268 -- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
14269 - }
14270 -
14271 - bfqq->allocated[rw]++;
14272 -@@ -2753,6 +2992,26 @@ new_queue:
14273 - rq->elv.priv[0] = bic;
14274 - rq->elv.priv[1] = bfqq;
14275 -
14276 -+ /*
14277 -+ * If a bfq_queue has only one process reference, it is owned
14278 -+ * by only one bfq_io_cq: we can set the bic field of the
14279 -+ * bfq_queue to the address of that structure. Also, if the
14280 -+ * queue has just been split, mark a flag so that the
14281 -+ * information is available to the other scheduler hooks.
14282 -+ */
14283 -+ if (bfqq_process_refs(bfqq) == 1) {
14284 -+ bfqq->bic = bic;
14285 -+ if (split) {
14286 -+ bfq_mark_bfqq_just_split(bfqq);
14287 -+ /*
14288 -+ * If the queue has just been split from a shared queue,
14289 -+ * restore the idle window and the possible weight
14290 -+ * raising period.
14291 -+ */
14292 -+ bfq_bfqq_resume_state(bfqq, bic);
14293 -+ }
14294 -+ }
14295 -+
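The block above attaches the owning bfq_io_cq to queues that have a single process reference and, when the queue has just been split from a shared one, restores the idle window and weight-raising state saved in that bfq_io_cq. A minimal sketch of the restore step, assuming only the simplified fields documented later in this patch (raising_time_left, saved_idle_window), might be:

    /* Hypothetical sketch: state parked in the io-context wrapper while
     * the queue was shared is copied back once the process owns a
     * private queue again after a split. */
    struct toy_bic {
    	unsigned int raising_time_left;
    	unsigned int saved_idle_window;
    };

    struct toy_queue {
    	struct toy_bic *bic;		/* owner, valid only for private queues */
    	unsigned int raising_left;
    	unsigned int idle_window;
    };

    static void toy_resume_state(struct toy_queue *q, struct toy_bic *bic)
    {
    	q->bic = bic;
    	q->raising_left = bic->raising_time_left;
    	q->idle_window = bic->saved_idle_window;
    }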
14296 - spin_unlock_irqrestore(q->queue_lock, flags);
14297 -
14298 - return 0;
14299 -diff --git a/block/bfq-sched.c b/block/bfq-sched.c
14300 -index 999b475..e54ea33 100644
14301 ---- a/block/bfq-sched.c
14302 -+++ b/block/bfq-sched.c
14303 -@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
14304 - return bfqq;
14305 - }
14306 -
14307 --/*
14308 -- * Forced extraction of the given queue.
14309 -- */
14310 --static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
14311 -- struct bfq_queue *bfqq)
14312 --{
14313 -- struct bfq_entity *entity;
14314 -- struct bfq_sched_data *sd;
14315 --
14316 -- BUG_ON(bfqd->in_service_queue != NULL);
14317 --
14318 -- entity = &bfqq->entity;
14319 -- /*
14320 -- * Bubble up extraction/update from the leaf to the root.
14321 -- */
14322 -- for_each_entity(entity) {
14323 -- sd = entity->sched_data;
14324 -- bfq_update_budget(entity);
14325 -- bfq_update_vtime(bfq_entity_service_tree(entity));
14326 -- bfq_active_extract(bfq_entity_service_tree(entity), entity);
14327 -- sd->active_entity = entity;
14328 -- sd->next_active = NULL;
14329 -- entity->service = 0;
14330 -- }
14331 --
14332 -- return;
14333 --}
14334 --
14335 - static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
14336 - {
14337 - if (bfqd->in_service_bic != NULL) {
14338 -diff --git a/block/bfq.h b/block/bfq.h
14339 -index f9b5881..0bfad40 100644
14340 ---- a/block/bfq.h
14341 -+++ b/block/bfq.h
14342 -@@ -192,6 +192,8 @@ struct bfq_group;
14343 - * idle to backlogged
14344 - * @service_from_backlogged: cumulative service received from the @bfq_queue
14345 - * since the last transition from idle to backlogged
14346 -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
14347 -+ * queue is shared
14348 - *
14349 - * A bfq_queue is a leaf request queue; it can be associated to an io_context
14350 - * or more (if it is an async one). @cgroup holds a reference to the
14351 -@@ -235,6 +237,7 @@ struct bfq_queue {
14352 - sector_t last_request_pos;
14353 -
14354 - pid_t pid;
14355 -+ struct bfq_io_cq *bic;
14356 -
14357 - /* weight-raising fields */
14358 - unsigned long raising_cur_max_time;
14359 -@@ -264,12 +267,23 @@ struct bfq_ttime {
14360 - * @icq: associated io_cq structure
14361 - * @bfqq: array of two process queues, the sync and the async
14362 - * @ttime: associated @bfq_ttime struct
14363 -+ * @raising_time_left: snapshot of the time left before weight raising ends
14364 -+ * for the sync queue associated to this process; this
14365 -+ * snapshot is taken to remember this value while the weight
14366 -+ * raising is suspended because the queue is merged with a
14367 -+ * shared queue, and is used to set @raising_cur_max_time
14368 -+ * when the queue is split from the shared queue and its
14369 -+ * weight is raised again
14370 -+ * @saved_idle_window: same purpose as the previous field for the idle window
14371 - */
14372 - struct bfq_io_cq {
14373 - struct io_cq icq; /* must be the first member */
14374 - struct bfq_queue *bfqq[2];
14375 - struct bfq_ttime ttime;
14376 - int ioprio;
14377 -+
14378 -+ unsigned int raising_time_left;
14379 -+ unsigned int saved_idle_window;
14380 - };
14381 -
14382 - /**
14383 -@@ -411,6 +425,7 @@ enum bfqq_state_flags {
14384 - BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
14385 - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
14386 - 	BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */
14387 -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
14388 - BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
14389 - };
14390 -
14391 -@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
14392 - BFQ_BFQQ_FNS(budget_new);
14393 - BFQ_BFQQ_FNS(coop);
14394 - BFQ_BFQQ_FNS(split_coop);
14395 -+BFQ_BFQQ_FNS(just_split);
14396 - BFQ_BFQQ_FNS(softrt_update);
14397 - #undef BFQ_BFQQ_FNS
14398 -
14399 ---
14400 -1.8.5.2
14401 -
14402
14403 Deleted: genpatches-2.6/trunk/3.14/5000_enable-additional-cpu-optimizations-for-gcc.patch
14404 ===================================================================
14405 --- genpatches-2.6/trunk/3.14/5000_enable-additional-cpu-optimizations-for-gcc.patch 2014-03-26 23:50:52 UTC (rev 2715)
14406 +++ genpatches-2.6/trunk/3.14/5000_enable-additional-cpu-optimizations-for-gcc.patch 2014-03-31 12:03:14 UTC (rev 2716)
14407 @@ -1,325 +0,0 @@
14408 -This patch has been tested on and known to work with kernel versions from 3.2
14409 -up to the latest git version (pulled on 12/14/2013).
14410 -
14411 -This patch will expand the number of microarchitectures to include new
14412 -processors including: AMD K10-family, AMD Family 10h (Barcelona), AMD Family
14413 -14h (Bobcat), AMD Family 15h (Bulldozer), AMD Family 15h (Piledriver), AMD
14414 -Family 16h (Jaguar), Intel 1st Gen Core i3/i5/i7 (Nehalem), Intel 2nd Gen Core
14415 -i3/i5/i7 (Sandybridge), Intel 3rd Gen Core i3/i5/i7 (Ivybridge), and Intel 4th
14416 -Gen Core i3/i5/i7 (Haswell). It also offers the compiler the 'native' flag.
14417 -
14418 -Small but real speed increases are measurable using a make-based benchmark comparing
14419 -a generic kernel to one built with one of the respective microarchitectures.
14420 -
14421 -See the following experimental evidence supporting this statement:
14422 -https://github.com/graysky2/kernel_gcc_patch
14423 -
14424 ----
14425 -diff -uprN a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
14426 ---- a/arch/x86/include/asm/module.h 2013-11-03 18:41:51.000000000 -0500
14427 -+++ b/arch/x86/include/asm/module.h 2013-12-15 06:21:24.351122516 -0500
14428 -@@ -15,6 +15,16 @@
14429 - #define MODULE_PROC_FAMILY "586MMX "
14430 - #elif defined CONFIG_MCORE2
14431 - #define MODULE_PROC_FAMILY "CORE2 "
14432 -+#elif defined CONFIG_MNATIVE
14433 -+#define MODULE_PROC_FAMILY "NATIVE "
14434 -+#elif defined CONFIG_MCOREI7
14435 -+#define MODULE_PROC_FAMILY "COREI7 "
14436 -+#elif defined CONFIG_MCOREI7AVX
14437 -+#define MODULE_PROC_FAMILY "COREI7AVX "
14438 -+#elif defined CONFIG_MCOREAVXI
14439 -+#define MODULE_PROC_FAMILY "COREAVXI "
14440 -+#elif defined CONFIG_MCOREAVX2
14441 -+#define MODULE_PROC_FAMILY "COREAVX2 "
14442 - #elif defined CONFIG_MATOM
14443 - #define MODULE_PROC_FAMILY "ATOM "
14444 - #elif defined CONFIG_M686
14445 -@@ -33,6 +43,18 @@
14446 - #define MODULE_PROC_FAMILY "K7 "
14447 - #elif defined CONFIG_MK8
14448 - #define MODULE_PROC_FAMILY "K8 "
14449 -+#elif defined CONFIG_MK10
14450 -+#define MODULE_PROC_FAMILY "K10 "
14451 -+#elif defined CONFIG_MBARCELONA
14452 -+#define MODULE_PROC_FAMILY "BARCELONA "
14453 -+#elif defined CONFIG_MBOBCAT
14454 -+#define MODULE_PROC_FAMILY "BOBCAT "
14455 -+#elif defined CONFIG_MBULLDOZER
14456 -+#define MODULE_PROC_FAMILY "BULLDOZER "
14457 -+#elif defined CONFIG_MPILEDRIVER
14458 -+#define MODULE_PROC_FAMILY "PILEDRIVER "
14459 -+#elif defined CONFIG_MJAGUAR
14460 -+#define MODULE_PROC_FAMILY "JAGUAR "
14461 - #elif defined CONFIG_MELAN
14462 - #define MODULE_PROC_FAMILY "ELAN "
14463 - #elif defined CONFIG_MCRUSOE
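The module.h hunk above simply extends a compile-time #elif dispatch that picks exactly one processor-family string, depending on which CONFIG_* option Kconfig selected. A self-contained illustration of that pattern, using a hypothetical hard-coded CONFIG_MCOREI7 define in place of a real Kconfig-generated one, is:

    #include <stdio.h>

    #define CONFIG_MCOREI7 1	/* stand-in for a Kconfig-selected option */

    /* Same dispatch pattern as above: exactly one branch of the chain
     * defines MODULE_PROC_FAMILY, according to which CONFIG_* is set. */
    #if defined(CONFIG_MNATIVE)
    #define MODULE_PROC_FAMILY "NATIVE "
    #elif defined(CONFIG_MCOREI7)
    #define MODULE_PROC_FAMILY "COREI7 "
    #else
    #define MODULE_PROC_FAMILY "GENERIC "
    #endif

    int main(void)
    {
    	printf("MODULE_PROC_FAMILY = %s\n", MODULE_PROC_FAMILY);
    	return 0;
    }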
14464 -diff -uprN a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
14465 ---- a/arch/x86/Kconfig.cpu 2013-11-03 18:41:51.000000000 -0500
14466 -+++ b/arch/x86/Kconfig.cpu 2013-12-15 06:21:24.351122516 -0500
14467 -@@ -139,7 +139,7 @@ config MPENTIUM4
14468 -
14469 -
14470 - config MK6
14471 -- bool "K6/K6-II/K6-III"
14472 -+ bool "AMD K6/K6-II/K6-III"
14473 - depends on X86_32
14474 - ---help---
14475 - Select this for an AMD K6-family processor. Enables use of
14476 -@@ -147,7 +147,7 @@ config MK6
14477 - flags to GCC.
14478 -
14479 - config MK7
14480 -- bool "Athlon/Duron/K7"
14481 -+ bool "AMD Athlon/Duron/K7"
14482 - depends on X86_32
14483 - ---help---
14484 - Select this for an AMD Athlon K7-family processor. Enables use of
14485 -@@ -155,12 +155,55 @@ config MK7
14486 - flags to GCC.
14487 -
14488 - config MK8
14489 -- bool "Opteron/Athlon64/Hammer/K8"
14490 -+ bool "AMD Opteron/Athlon64/Hammer/K8"
14491 - ---help---
14492 - Select this for an AMD Opteron or Athlon64 Hammer-family processor.
14493 - Enables use of some extended instructions, and passes appropriate
14494 - optimization flags to GCC.
14495 -
14496 -+config MK10
14497 -+ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
14498 -+ ---help---
14499 -+ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
14500 -+ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
14501 -+ Enables use of some extended instructions, and passes appropriate
14502 -+ optimization flags to GCC.
14503 -+
14504 -+config MBARCELONA
14505 -+ bool "AMD Barcelona"
14506 -+ ---help---
14507 -+ Select this for AMD Barcelona and newer processors.
14508 -+
14509 -+ Enables -march=barcelona
14510 -+
14511 -+config MBOBCAT
14512 -+ bool "AMD Bobcat"
14513 -+ ---help---
14514 -+ Select this for AMD Bobcat processors.
14515 -+
14516 -+ Enables -march=btver1
14517 -+
14518 -+config MBULLDOZER
14519 -+ bool "AMD Bulldozer"
14520 -+ ---help---
14521 -+ Select this for AMD Bulldozer processors.
14522 -+
14523 -+ Enables -march=bdver1
14524 -+
14525 -+config MPILEDRIVER
14526 -+ bool "AMD Piledriver"
14527 -+ ---help---
14528 -+ Select this for AMD Piledriver processors.
14529 -+
14530 -+ Enables -march=bdver2
14531 -+
14532 -+config MJAGUAR
14533 -+ bool "AMD Jaguar"
14534 -+ ---help---
14535 -+ Select this for AMD Jaguar processors.
14536 -+
14537 -+ Enables -march=btver2
14538 -+
14539 - config MCRUSOE
14540 - bool "Crusoe"
14541 - depends on X86_32
14542 -@@ -251,8 +294,17 @@ config MPSC
14543 - using the cpu family field
14544 - in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
14545 -
14546 -+config MATOM
14547 -+ bool "Intel Atom"
14548 -+ ---help---
14549 -+
14550 -+ Select this for the Intel Atom platform. Intel Atom CPUs have an
14551 -+ in-order pipelining architecture and thus can benefit from
14552 -+ accordingly optimized code. Use a recent GCC with specific Atom
14553 -+ support in order to fully benefit from selecting this option.
14554 -+
14555 - config MCORE2
14556 -- bool "Core 2/newer Xeon"
14557 -+ bool "Intel Core 2"
14558 - ---help---
14559 -
14560 - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
14561 -@@ -260,14 +312,40 @@ config MCORE2
14562 - family in /proc/cpuinfo. Newer ones have 6 and older ones 15
14563 - (not a typo)
14564 -
14565 --config MATOM
14566 -- bool "Intel Atom"
14567 -+ Enables -march=core2
14568 -+
14569 -+config MCOREI7
14570 -+ bool "Intel Core i7"
14571 - ---help---
14572 -
14573 -- Select this for the Intel Atom platform. Intel Atom CPUs have an
14574 -- in-order pipelining architecture and thus can benefit from
14575 -- accordingly optimized code. Use a recent GCC with specific Atom
14576 -- support in order to fully benefit from selecting this option.
14577 -+	Select this for the Intel Nehalem platform. Intel Nehalem processors
14578 -+ include Core i3, i5, i7, Xeon: 34xx, 35xx, 55xx, 56xx, 75xx processors.
14579 -+
14580 -+ Enables -march=corei7
14581 -+
14582 -+config MCOREI7AVX
14583 -+ bool "Intel Core 2nd Gen AVX"
14584 -+ ---help---
14585 -+
14586 -+ Select this for 2nd Gen Core processors including Sandy Bridge.
14587 -+
14588 -+ Enables -march=corei7-avx
14589 -+
14590 -+config MCOREAVXI
14591 -+ bool "Intel Core 3rd Gen AVX"
14592 -+ ---help---
14593 -+
14594 -+ Select this for 3rd Gen Core processors including Ivy Bridge.
14595 -+
14596 -+ Enables -march=core-avx-i
14597 -+
14598 -+config MCOREAVX2
14599 -+ bool "Intel Core AVX2"
14600 -+ ---help---
14601 -+
14602 -+ Select this for AVX2 enabled processors including Haswell.
14603 -+
14604 -+ Enables -march=core-avx2
14605 -
14606 - config GENERIC_CPU
14607 - bool "Generic-x86-64"
14608 -@@ -276,6 +354,19 @@ config GENERIC_CPU
14609 - Generic x86-64 CPU.
14610 - Run equally well on all x86-64 CPUs.
14611 -
14612 -+config MNATIVE
14613 -+ bool "Native optimizations autodetected by GCC"
14614 -+ ---help---
14615 -+
14616 -+ GCC 4.2 and above support -march=native, which automatically detects
14617 -+ the optimum settings to use based on your processor. -march=native
14618 -+ also detects and applies additional settings beyond -march specific
14619 -+	to your CPU (e.g. -msse4). Unless you have a specific reason not to
14620 -+ (e.g. distcc cross-compiling), you should probably be using
14621 -+ -march=native rather than anything listed below.
14622 -+
14623 -+ Enables -march=native
14624 -+
14625 - endchoice
14626 -
14627 - config X86_GENERIC
14628 -@@ -300,7 +391,7 @@ config X86_INTERNODE_CACHE_SHIFT
14629 - config X86_L1_CACHE_SHIFT
14630 - int
14631 - default "7" if MPENTIUM4 || MPSC
14632 -- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
14633 -+ default "6" if MK7 || MK8 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MPENTIUMM || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2 || MATOM || MVIAC7 || X86_GENERIC || MNATIVE || GENERIC_CPU
14634 - default "4" if MELAN || M486 || MGEODEGX1
14635 - default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
14636 -
14637 -@@ -331,11 +422,11 @@ config X86_ALIGNMENT_16
14638 -
14639 - config X86_INTEL_USERCOPY
14640 - def_bool y
14641 -- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
14642 -+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || MNATIVE || X86_GENERIC || MK8 || MK7 || MK10 || MBARCELONA || MEFFICEON || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2
14643 -
14644 - config X86_USE_PPRO_CHECKSUM
14645 - def_bool y
14646 -- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
14647 -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2 || MATOM || MNATIVE
14648 -
14649 - config X86_USE_3DNOW
14650 - def_bool y
14651 -@@ -363,17 +454,17 @@ config X86_P6_NOP
14652 -
14653 - config X86_TSC
14654 - def_bool y
14655 -- depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64
14656 -+	depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MCOREI7 || MCOREI7AVX || MATOM) && !X86_NUMAQ) || X86_64 || MNATIVE
14657 -
14658 - config X86_CMPXCHG64
14659 - def_bool y
14660 -- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
14661 -+ depends on X86_PAE || X86_64 || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE
14662 -
14663 - # this should be set for all -march=.. options where the compiler
14664 - # generates cmov.
14665 - config X86_CMOV
14666 - def_bool y
14667 -- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
14668 -+ depends on (MK8 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MJAGUAR || MK7 || MCORE2 || MCOREI7 || MCOREI7AVX || MCOREAVXI || MCOREAVX2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX)
14669 -
14670 - config X86_MINIMUM_CPU_FAMILY
14671 - int
14672 -diff -uprN a/arch/x86/Makefile b/arch/x86/Makefile
14673 ---- a/arch/x86/Makefile 2013-11-03 18:41:51.000000000 -0500
14674 -+++ b/arch/x86/Makefile 2013-12-15 06:21:24.354455723 -0500
14675 -@@ -61,11 +61,26 @@ else
14676 - KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3)
14677 -
14678 - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
14679 -+ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
14680 - cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
14681 -+ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10)
14682 -+ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona)
14683 -+ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1)
14684 -+ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1)
14685 -+ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2)
14686 -+ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2)
14687 - cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
14688 -
14689 - cflags-$(CONFIG_MCORE2) += \
14690 -- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
14691 -+ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2))
14692 -+ cflags-$(CONFIG_MCOREI7) += \
14693 -+ $(call cc-option,-march=corei7,$(call cc-option,-mtune=corei7))
14694 -+ cflags-$(CONFIG_MCOREI7AVX) += \
14695 -+ $(call cc-option,-march=corei7-avx,$(call cc-option,-mtune=corei7-avx))
14696 -+ cflags-$(CONFIG_MCOREAVXI) += \
14697 -+ $(call cc-option,-march=core-avx-i,$(call cc-option,-mtune=core-avx-i))
14698 -+ cflags-$(CONFIG_MCOREAVX2) += \
14699 -+ $(call cc-option,-march=core-avx2,$(call cc-option,-mtune=core-avx2))
14700 - cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
14701 - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
14702 - cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
14703 -diff -uprN a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
14704 ---- a/arch/x86/Makefile_32.cpu 2013-11-03 18:41:51.000000000 -0500
14705 -+++ b/arch/x86/Makefile_32.cpu 2013-12-15 06:21:24.354455723 -0500
14706 -@@ -23,7 +23,14 @@ cflags-$(CONFIG_MK6) += -march=k6
14707 - # Please note that patches that add -march=athlon-xp and friends are pointless.
14708 - # They make zero difference whatsoever to performance at this time.
14709 - cflags-$(CONFIG_MK7) += -march=athlon
14710 -+cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
14711 - cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
14712 -+cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon)
14713 -+cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon)
14714 -+cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon)
14715 -+cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon)
14716 -+cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon)
14717 -+cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon)
14718 - cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
14719 - cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
14720 - cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
14721 -@@ -32,6 +39,10 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc-
14722 - cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
14723 - cflags-$(CONFIG_MVIAC7) += -march=i686
14724 - cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2)
14725 -+cflags-$(CONFIG_MCOREI7) += -march=i686 $(call tune,corei7)
14726 -+cflags-$(CONFIG_MCOREI7AVX) += -march=i686 $(call tune,corei7-avx)
14727 -+cflags-$(CONFIG_MCOREAVXI) += -march=i686 $(call tune,core-avx-i)
14728 -+cflags-$(CONFIG_MCOREAVX2) += -march=i686 $(call tune,core-avx2)
14729 - cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
14730 - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
14731 -
14732 -