Author: tomwij
Date: 2014-02-07 15:42:35 +0000 (Fri, 07 Feb 2014)
New Revision: 2666

Added:
genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch
genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1
genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch
Removed:
genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch
genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1
genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch
Modified:
genpatches-2.6/trunk/3.13/0000_README
genpatches-2.6/trunk/3.14/0000_README
Log:
Updated experimental BFQ patches to new revision v7r1.

Modified: genpatches-2.6/trunk/3.13/0000_README
===================================================================
--- genpatches-2.6/trunk/3.13/0000_README	2014-02-07 14:46:59 UTC (rev 2665)
+++ genpatches-2.6/trunk/3.13/0000_README	2014-02-07 15:42:35 UTC (rev 2666)
@@ -91,17 +91,17 @@
 From: Tom Wijsman <TomWij@g.o>
 Desc: Add Gentoo Linux support config settings and defaults.

-Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7-3.13.patch
+Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r1-3.13.patch
 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
-Desc: BFQ v7 patch 1 for 3.13: Build, cgroups and kconfig bits
+Desc: BFQ v7r1 patch 1 for 3.13: Build, cgroups and kconfig bits

-Patch: 5000_BFQ-2-block-introduce-the-v7-I-O-sched-for-3.13.patch1
+Patch: 5000_BFQ-2-block-introduce-the-v7r1-I-O-sched-for-3.13.patch1
 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
-Desc: BFQ v7 patch 2 for 3.13: BFQ Scheduler
+Desc: BFQ v7r1 patch 2 for 3.13: BFQ Scheduler

-Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7-for-3.13.0.patch
+Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r1-for-3.13.0.patch
 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
-Desc: BFQ v7 patch 3 for 3.13: Early Queue Merge (EQM)
+Desc: BFQ v7r1 patch 3 for 3.13: Early Queue Merge (EQM)

 Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch
 From: https://github.com/graysky2/kernel_gcc_patch/

Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch |
52 |
=================================================================== |
53 |
--- genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-02-07 14:46:59 UTC (rev 2665) |
54 |
+++ genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666) |
55 |
@@ -1,104 +0,0 @@ |
56 |
-From 7f029ed2a02bea57b791c032d6242129c3372a84 Mon Sep 17 00:00:00 2001 |
57 |
-From: Paolo Valente <paolo.valente@×××××××.it> |
58 |
-Date: Tue, 3 Sep 2013 16:50:42 +0200 |
59 |
-Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7-3.13 |
60 |
- |
61 |
-Update Kconfig.iosched and do the related Makefile changes to include |
62 |
-kernel configuration options for BFQ. Also add the bfqio controller |
63 |
-to the cgroups subsystem. |
64 |
- |
65 |
-Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
66 |
-Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
67 |
---- |
68 |
- block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++ |
69 |
- block/Makefile | 1 + |
70 |
- include/linux/cgroup_subsys.h | 4 ++++ |
71 |
- 3 files changed, 37 insertions(+) |
72 |
- |
73 |
-diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched |
74 |
-index 421bef9..8f552ba 100644 |
75 |
---- a/block/Kconfig.iosched |
76 |
-+++ b/block/Kconfig.iosched |
77 |
-@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED |
78 |
- ---help--- |
79 |
- Enable group IO scheduling in CFQ. |
80 |
- |
81 |
-+config IOSCHED_BFQ |
82 |
-+ tristate "BFQ I/O scheduler" |
83 |
-+ default n |
84 |
-+ ---help--- |
85 |
-+ The BFQ I/O scheduler tries to distribute bandwidth among |
86 |
-+ all processes according to their weights. |
87 |
-+ It aims at distributing the bandwidth as desired, independently of |
88 |
-+ the disk parameters and with any workload. It also tries to |
89 |
-+ guarantee low latency to interactive and soft real-time |
90 |
-+ applications. If compiled built-in (saying Y here), BFQ can |
91 |
-+ be configured to support hierarchical scheduling. |
92 |
-+ |
93 |
-+config CGROUP_BFQIO |
94 |
-+ bool "BFQ hierarchical scheduling support" |
95 |
-+ depends on CGROUPS && IOSCHED_BFQ=y |
96 |
-+ default n |
97 |
-+ ---help--- |
98 |
-+ Enable hierarchical scheduling in BFQ, using the cgroups |
99 |
-+ filesystem interface. The name of the subsystem will be |
100 |
-+ bfqio. |
101 |
-+ |
102 |
- choice |
103 |
- prompt "Default I/O scheduler" |
104 |
- default DEFAULT_CFQ |
105 |
-@@ -52,6 +73,16 @@ choice |
106 |
- config DEFAULT_CFQ |
107 |
- bool "CFQ" if IOSCHED_CFQ=y |
108 |
- |
109 |
-+ config DEFAULT_BFQ |
110 |
-+ bool "BFQ" if IOSCHED_BFQ=y |
111 |
-+ help |
112 |
-+ Selects BFQ as the default I/O scheduler which will be |
113 |
-+ used by default for all block devices. |
114 |
-+ The BFQ I/O scheduler aims at distributing the bandwidth |
115 |
-+ as desired, independently of the disk parameters and with |
116 |
-+ any workload. It also tries to guarantee low latency to |
117 |
-+ interactive and soft real-time applications. |
118 |
-+ |
119 |
- config DEFAULT_NOOP |
120 |
- bool "No-op" |
121 |
- |
122 |
-@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED |
123 |
- string |
124 |
- default "deadline" if DEFAULT_DEADLINE |
125 |
- default "cfq" if DEFAULT_CFQ |
126 |
-+ default "bfq" if DEFAULT_BFQ |
127 |
- default "noop" if DEFAULT_NOOP |
128 |
- |
129 |
- endmenu |
130 |
-diff --git a/block/Makefile b/block/Makefile |
131 |
-index 20645e8..cbd83fb 100644 |
132 |
---- a/block/Makefile |
133 |
-+++ b/block/Makefile |
134 |
-@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o |
135 |
- obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o |
136 |
- obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o |
137 |
- obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o |
138 |
-+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o |
139 |
- |
140 |
- obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o |
141 |
- obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o |
142 |
-diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h |
143 |
-index b613ffd..43c5dc9 100644 |
144 |
---- a/include/linux/cgroup_subsys.h |
145 |
-+++ b/include/linux/cgroup_subsys.h |
146 |
-@@ -39,6 +39,10 @@ SUBSYS(net_cls) |
147 |
- SUBSYS(blkio) |
148 |
- #endif |
149 |
- |
150 |
-+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO) |
151 |
-+SUBSYS(bfqio) |
152 |
-+#endif |
153 |
-+ |
154 |
- #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF) |
155 |
- SUBSYS(perf) |
156 |
- #endif |
157 |
--- |
158 |
-1.8.5.2 |
159 |
- |
160 |
|
161 |
Added: genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch |
162 |
=================================================================== |
163 |
--- genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch (rev 0) |
164 |
+++ genpatches-2.6/trunk/3.13/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666) |
165 |
@@ -0,0 +1,104 @@ |
166 |
+From ae1b820a5286601aa9d5426459f8f3de658342b4 Mon Sep 17 00:00:00 2001 |
167 |
+From: Paolo Valente <paolo.valente@×××××××.it> |
168 |
+Date: Tue, 3 Sep 2013 16:50:42 +0200 |
169 |
+Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r1-3.13 |
170 |
+ |
171 |
+Update Kconfig.iosched and do the related Makefile changes to include |
172 |
+kernel configuration options for BFQ. Also add the bfqio controller |
173 |
+to the cgroups subsystem. |
174 |
+ |
175 |
+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
176 |
+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
177 |
+--- |
178 |
+ block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++ |
179 |
+ block/Makefile | 1 + |
180 |
+ include/linux/cgroup_subsys.h | 4 ++++ |
181 |
+ 3 files changed, 37 insertions(+) |
182 |
+ |
183 |
+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched |
184 |
+index 421bef9..8f552ba 100644 |
185 |
+--- a/block/Kconfig.iosched |
186 |
++++ b/block/Kconfig.iosched |
187 |
+@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED |
188 |
+ ---help--- |
189 |
+ Enable group IO scheduling in CFQ. |
190 |
+ |
191 |
++config IOSCHED_BFQ |
192 |
++ tristate "BFQ I/O scheduler" |
193 |
++ default n |
194 |
++ ---help--- |
195 |
++ The BFQ I/O scheduler tries to distribute bandwidth among |
196 |
++ all processes according to their weights. |
197 |
++ It aims at distributing the bandwidth as desired, independently of |
198 |
++ the disk parameters and with any workload. It also tries to |
199 |
++ guarantee low latency to interactive and soft real-time |
200 |
++ applications. If compiled built-in (saying Y here), BFQ can |
201 |
++ be configured to support hierarchical scheduling. |
202 |
++ |
203 |
++config CGROUP_BFQIO |
204 |
++ bool "BFQ hierarchical scheduling support" |
205 |
++ depends on CGROUPS && IOSCHED_BFQ=y |
206 |
++ default n |
207 |
++ ---help--- |
208 |
++ Enable hierarchical scheduling in BFQ, using the cgroups |
209 |
++ filesystem interface. The name of the subsystem will be |
210 |
++ bfqio. |
211 |
++ |
212 |
+ choice |
213 |
+ prompt "Default I/O scheduler" |
214 |
+ default DEFAULT_CFQ |
215 |
+@@ -52,6 +73,16 @@ choice |
216 |
+ config DEFAULT_CFQ |
217 |
+ bool "CFQ" if IOSCHED_CFQ=y |
218 |
+ |
219 |
++ config DEFAULT_BFQ |
220 |
++ bool "BFQ" if IOSCHED_BFQ=y |
221 |
++ help |
222 |
++ Selects BFQ as the default I/O scheduler which will be |
223 |
++ used by default for all block devices. |
224 |
++ The BFQ I/O scheduler aims at distributing the bandwidth |
225 |
++ as desired, independently of the disk parameters and with |
226 |
++ any workload. It also tries to guarantee low latency to |
227 |
++ interactive and soft real-time applications. |
228 |
++ |
229 |
+ config DEFAULT_NOOP |
230 |
+ bool "No-op" |
231 |
+ |
232 |
+@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED |
233 |
+ string |
234 |
+ default "deadline" if DEFAULT_DEADLINE |
235 |
+ default "cfq" if DEFAULT_CFQ |
236 |
++ default "bfq" if DEFAULT_BFQ |
237 |
+ default "noop" if DEFAULT_NOOP |
238 |
+ |
239 |
+ endmenu |
240 |
+diff --git a/block/Makefile b/block/Makefile |
241 |
+index 20645e8..cbd83fb 100644 |
242 |
+--- a/block/Makefile |
243 |
++++ b/block/Makefile |
244 |
+@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o |
245 |
+ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o |
246 |
+ obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o |
247 |
+ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o |
248 |
++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o |
249 |
+ |
250 |
+ obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o |
251 |
+ obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o |
252 |
+diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h |
253 |
+index b613ffd..43c5dc9 100644 |
254 |
+--- a/include/linux/cgroup_subsys.h |
255 |
++++ b/include/linux/cgroup_subsys.h |
256 |
+@@ -39,6 +39,10 @@ SUBSYS(net_cls) |
257 |
+ SUBSYS(blkio) |
258 |
+ #endif |
259 |
+ |
260 |
++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO) |
261 |
++SUBSYS(bfqio) |
262 |
++#endif |
263 |
++ |
264 |
+ #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF) |
265 |
+ SUBSYS(perf) |
266 |
+ #endif |
267 |
+-- |
268 |
+1.8.5.2 |
269 |
+ |
270 |
|
271 |
Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 |
272 |
=================================================================== |
273 |
--- genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-02-07 14:46:59 UTC (rev 2665) |
274 |
+++ genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666) |
275 |
@@ -1,6008 +0,0 @@ |
276 |
-From 3747f129106ce58fbad1b8f05cc836a6addd8588 Mon Sep 17 00:00:00 2001 |
277 |
-From: Paolo Valente <paolo.valente@×××××××.it> |
278 |
-Date: Thu, 9 May 2013 19:10:02 +0200 |
279 |
-Subject: [PATCH 2/3] block: introduce the BFQ-v7 I/O sched for 3.13 |
280 |
- |
281 |
-Add the BFQ-v7 I/O scheduler to 3.13. |
282 |
-The general structure is borrowed from CFQ, as much of the code for |
283 |
-handling I/O contexts Over time, several useful features have been |
284 |
-ported from CFQ as well (details in the changelog in README.BFQ). A |
285 |
-(bfq_)queue is associated to each task doing I/O on a device, and each |
286 |
-time a scheduling decision has to be made a queue is selected and served |
287 |
-until it expires. |
288 |
- |
289 |
- - Slices are given in the service domain: tasks are assigned |
290 |
- budgets, measured in number of sectors. Once got the disk, a task |
291 |
- must however consume its assigned budget within a configurable |
292 |
- maximum time (by default, the maximum possible value of the |
293 |
- budgets is automatically computed to comply with this timeout). |
294 |
- This allows the desired latency vs "throughput boosting" tradeoff |
295 |
- to be set. |
296 |
- |
297 |
- - Budgets are scheduled according to a variant of WF2Q+, implemented |
298 |
- using an augmented rb-tree to take eligibility into account while |
299 |
- preserving an O(log N) overall complexity. |
300 |
- |
301 |
- - A low-latency tunable is provided; if enabled, both interactive |
302 |
- and soft real-time applications are guaranteed a very low latency. |
303 |
- |
304 |
- - Latency guarantees are preserved also in the presence of NCQ. |
305 |
- |
306 |
- - Also with flash-based devices, a high throughput is achieved |
307 |
- while still preserving latency guarantees. |
308 |
- |
309 |
- - BFQ features Early Queue Merge (EQM), a sort of fusion of the |
310 |
- cooperating-queue-merging and the preemption mechanisms present |
311 |
- in CFQ. EQM is in fact a unified mechanism that tries to get a |
312 |
- sequential read pattern, and hence a high throughput, with any |
313 |
- set of processes performing interleaved I/O over a contiguous |
314 |
- sequence of sectors. |
315 |
- |
316 |
- - BFQ supports full hierarchical scheduling, exporting a cgroups |
317 |
- interface. Since each node has a full scheduler, each group can |
318 |
- be assigned its own weight. |
319 |
- |
320 |
- - If the cgroups interface is not used, only I/O priorities can be |
321 |
- assigned to processes, with ioprio values mapped to weights |
322 |
- with the relation weight = IOPRIO_BE_NR - ioprio. |
323 |
- |
324 |
- - ioprio classes are served in strict priority order, i.e., lower |
325 |
- priority queues are not served as long as there are higher |
326 |
- priority queues. Among queues in the same class the bandwidth is |
327 |
- distributed in proportion to the weight of each queue. A very |
328 |
- thin extra bandwidth is however guaranteed to the Idle class, to |
329 |
- prevent it from starving. |
330 |
- |
331 |
-Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
332 |
-Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
333 |
---- |
334 |
- block/bfq-cgroup.c | 910 ++++++++++++++ |
335 |
- block/bfq-ioc.c | 36 + |
336 |
- block/bfq-iosched.c | 3268 +++++++++++++++++++++++++++++++++++++++++++++++++++ |
337 |
- block/bfq-sched.c | 1077 +++++++++++++++++ |
338 |
- block/bfq.h | 614 ++++++++++ |
339 |
- 5 files changed, 5905 insertions(+) |
340 |
- create mode 100644 block/bfq-cgroup.c |
341 |
- create mode 100644 block/bfq-ioc.c |
342 |
- create mode 100644 block/bfq-iosched.c |
343 |
- create mode 100644 block/bfq-sched.c |
344 |
- create mode 100644 block/bfq.h |
345 |
- |
346 |
-diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c |
347 |
-new file mode 100644 |
348 |
-index 0000000..b889acf |
349 |
---- /dev/null |
350 |
-+++ b/block/bfq-cgroup.c |
351 |
-@@ -0,0 +1,910 @@ |
352 |
-+/* |
353 |
-+ * BFQ: CGROUPS support. |
354 |
-+ * |
355 |
-+ * Based on ideas and code from CFQ: |
356 |
-+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
357 |
-+ * |
358 |
-+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
359 |
-+ * Paolo Valente <paolo.valente@×××××××.it> |
360 |
-+ * |
361 |
-+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
362 |
-+ * |
363 |
-+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. |
364 |
-+ */ |
365 |
-+ |
366 |
-+#ifdef CONFIG_CGROUP_BFQIO |
367 |
-+ |
368 |
-+static DEFINE_MUTEX(bfqio_mutex); |
369 |
-+ |
370 |
-+static bool bfqio_is_removed(struct bfqio_cgroup *bgrp) |
371 |
-+{ |
372 |
-+ return bgrp ? !bgrp->online : false; |
373 |
-+} |
374 |
-+ |
375 |
-+static struct bfqio_cgroup bfqio_root_cgroup = { |
376 |
-+ .weight = BFQ_DEFAULT_GRP_WEIGHT, |
377 |
-+ .ioprio = BFQ_DEFAULT_GRP_IOPRIO, |
378 |
-+ .ioprio_class = BFQ_DEFAULT_GRP_CLASS, |
379 |
-+}; |
380 |
-+ |
381 |
-+static inline void bfq_init_entity(struct bfq_entity *entity, |
382 |
-+ struct bfq_group *bfqg) |
383 |
-+{ |
384 |
-+ entity->weight = entity->new_weight; |
385 |
-+ entity->orig_weight = entity->new_weight; |
386 |
-+ entity->ioprio = entity->new_ioprio; |
387 |
-+ entity->ioprio_class = entity->new_ioprio_class; |
388 |
-+ entity->parent = bfqg->my_entity; |
389 |
-+ entity->sched_data = &bfqg->sched_data; |
390 |
-+} |
391 |
-+ |
392 |
-+static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css) |
393 |
-+{ |
394 |
-+ return css ? container_of(css, struct bfqio_cgroup, css) : NULL; |
395 |
-+} |
396 |
-+ |
397 |
-+/* |
398 |
-+ * Search the bfq_group for bfqd into the hash table (by now only a list) |
399 |
-+ * of bgrp. Must be called under rcu_read_lock(). |
400 |
-+ */ |
401 |
-+static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, |
402 |
-+ struct bfq_data *bfqd) |
403 |
-+{ |
404 |
-+ struct bfq_group *bfqg; |
405 |
-+ void *key; |
406 |
-+ |
407 |
-+ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) { |
408 |
-+ key = rcu_dereference(bfqg->bfqd); |
409 |
-+ if (key == bfqd) |
410 |
-+ return bfqg; |
411 |
-+ } |
412 |
-+ |
413 |
-+ return NULL; |
414 |
-+} |
415 |
-+ |
416 |
-+static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, |
417 |
-+ struct bfq_group *bfqg) |
418 |
-+{ |
419 |
-+ struct bfq_entity *entity = &bfqg->entity; |
420 |
-+ |
421 |
-+ /* |
422 |
-+ * If the weight of the entity has never been set via the sysfs |
423 |
-+ * interface, then bgrp->weight == 0. In this case we initialize |
424 |
-+ * the weight from the current ioprio value. Otherwise, the group |
425 |
-+ * weight, if set, has priority over the ioprio value. |
426 |
-+ */ |
427 |
-+ if (bgrp->weight == 0) { |
428 |
-+ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); |
429 |
-+ entity->new_ioprio = bgrp->ioprio; |
430 |
-+ } else { |
431 |
-+ entity->new_weight = bgrp->weight; |
432 |
-+ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); |
433 |
-+ } |
434 |
-+ entity->orig_weight = entity->weight = entity->new_weight; |
435 |
-+ entity->ioprio = entity->new_ioprio; |
436 |
-+ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; |
437 |
-+ entity->my_sched_data = &bfqg->sched_data; |
438 |
-+} |
439 |
-+ |
440 |
-+static inline void bfq_group_set_parent(struct bfq_group *bfqg, |
441 |
-+ struct bfq_group *parent) |
442 |
-+{ |
443 |
-+ struct bfq_entity *entity; |
444 |
-+ |
445 |
-+ BUG_ON(parent == NULL); |
446 |
-+ BUG_ON(bfqg == NULL); |
447 |
-+ |
448 |
-+ entity = &bfqg->entity; |
449 |
-+ entity->parent = parent->my_entity; |
450 |
-+ entity->sched_data = &parent->sched_data; |
451 |
-+} |
452 |
-+ |
453 |
-+/** |
454 |
-+ * bfq_group_chain_alloc - allocate a chain of groups. |
455 |
-+ * @bfqd: queue descriptor. |
456 |
-+ * @css: the leaf cgroup_subsys_state this chain starts from. |
457 |
-+ * |
458 |
-+ * Allocate a chain of groups starting from the one belonging to |
459 |
-+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain |
460 |
-+ * to the root has already an allocated group on @bfqd. |
461 |
-+ */ |
462 |
-+static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, |
463 |
-+ struct cgroup_subsys_state *css) |
464 |
-+{ |
465 |
-+ struct bfqio_cgroup *bgrp; |
466 |
-+ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; |
467 |
-+ |
468 |
-+ for (; css != NULL; css = css->parent) { |
469 |
-+ bgrp = css_to_bfqio(css); |
470 |
-+ |
471 |
-+ bfqg = bfqio_lookup_group(bgrp, bfqd); |
472 |
-+ if (bfqg != NULL) { |
473 |
-+ /* |
474 |
-+ * All the cgroups in the path from there to the |
475 |
-+ * root must have a bfq_group for bfqd, so we don't |
476 |
-+ * need any more allocations. |
477 |
-+ */ |
478 |
-+ break; |
479 |
-+ } |
480 |
-+ |
481 |
-+ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); |
482 |
-+ if (bfqg == NULL) |
483 |
-+ goto cleanup; |
484 |
-+ |
485 |
-+ bfq_group_init_entity(bgrp, bfqg); |
486 |
-+ bfqg->my_entity = &bfqg->entity; |
487 |
-+ |
488 |
-+ if (leaf == NULL) { |
489 |
-+ leaf = bfqg; |
490 |
-+ prev = leaf; |
491 |
-+ } else { |
492 |
-+ bfq_group_set_parent(prev, bfqg); |
493 |
-+ /* |
494 |
-+ * Build a list of allocated nodes using the bfqd |
495 |
-+ * filed, that is still unused and will be initialized |
496 |
-+ * only after the node will be connected. |
497 |
-+ */ |
498 |
-+ prev->bfqd = bfqg; |
499 |
-+ prev = bfqg; |
500 |
-+ } |
501 |
-+ } |
502 |
-+ |
503 |
-+ return leaf; |
504 |
-+ |
505 |
-+cleanup: |
506 |
-+ while (leaf != NULL) { |
507 |
-+ prev = leaf; |
508 |
-+ leaf = leaf->bfqd; |
509 |
-+ kfree(prev); |
510 |
-+ } |
511 |
-+ |
512 |
-+ return NULL; |
513 |
-+} |
514 |
-+ |
515 |
-+/** |
516 |
-+ * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. |
517 |
-+ * @bfqd: the queue descriptor. |
518 |
-+ * @css: the leaf cgroup_subsys_state to start from. |
519 |
-+ * @leaf: the leaf group (to be associated to @cgroup). |
520 |
-+ * |
521 |
-+ * Try to link a chain of groups to a cgroup hierarchy, connecting the |
522 |
-+ * nodes bottom-up, so we can be sure that when we find a cgroup in the |
523 |
-+ * hierarchy that already as a group associated to @bfqd all the nodes |
524 |
-+ * in the path to the root cgroup have one too. |
525 |
-+ * |
526 |
-+ * On locking: the queue lock protects the hierarchy (there is a hierarchy |
527 |
-+ * per device) while the bfqio_cgroup lock protects the list of groups |
528 |
-+ * belonging to the same cgroup. |
529 |
-+ */ |
530 |
-+static void bfq_group_chain_link(struct bfq_data *bfqd, |
531 |
-+ struct cgroup_subsys_state *css, |
532 |
-+ struct bfq_group *leaf) |
533 |
-+{ |
534 |
-+ struct bfqio_cgroup *bgrp; |
535 |
-+ struct bfq_group *bfqg, *next, *prev = NULL; |
536 |
-+ unsigned long flags; |
537 |
-+ |
538 |
-+ assert_spin_locked(bfqd->queue->queue_lock); |
539 |
-+ |
540 |
-+ for (; css != NULL && leaf != NULL; css = css->parent) { |
541 |
-+ bgrp = css_to_bfqio(css); |
542 |
-+ next = leaf->bfqd; |
543 |
-+ |
544 |
-+ bfqg = bfqio_lookup_group(bgrp, bfqd); |
545 |
-+ BUG_ON(bfqg != NULL); |
546 |
-+ |
547 |
-+ spin_lock_irqsave(&bgrp->lock, flags); |
548 |
-+ |
549 |
-+ rcu_assign_pointer(leaf->bfqd, bfqd); |
550 |
-+ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); |
551 |
-+ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); |
552 |
-+ |
553 |
-+ spin_unlock_irqrestore(&bgrp->lock, flags); |
554 |
-+ |
555 |
-+ prev = leaf; |
556 |
-+ leaf = next; |
557 |
-+ } |
558 |
-+ |
559 |
-+ BUG_ON(css == NULL && leaf != NULL); |
560 |
-+ if (css != NULL && prev != NULL) { |
561 |
-+ bgrp = css_to_bfqio(css); |
562 |
-+ bfqg = bfqio_lookup_group(bgrp, bfqd); |
563 |
-+ bfq_group_set_parent(prev, bfqg); |
564 |
-+ } |
565 |
-+} |
566 |
-+ |
567 |
-+/** |
568 |
-+ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. |
569 |
-+ * @bfqd: queue descriptor. |
570 |
-+ * @cgroup: cgroup being searched for. |
571 |
-+ * |
572 |
-+ * Return a group associated to @bfqd in @cgroup, allocating one if |
573 |
-+ * necessary. When a group is returned all the cgroups in the path |
574 |
-+ * to the root have a group associated to @bfqd. |
575 |
-+ * |
576 |
-+ * If the allocation fails, return the root group: this breaks guarantees |
577 |
-+ * but is a safe fallbak. If this loss becames a problem it can be |
578 |
-+ * mitigated using the equivalent weight (given by the product of the |
579 |
-+ * weights of the groups in the path from @group to the root) in the |
580 |
-+ * root scheduler. |
581 |
-+ * |
582 |
-+ * We allocate all the missing nodes in the path from the leaf cgroup |
583 |
-+ * to the root and we connect the nodes only after all the allocations |
584 |
-+ * have been successful. |
585 |
-+ */ |
586 |
-+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
587 |
-+ struct cgroup_subsys_state *css) |
588 |
-+{ |
589 |
-+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
590 |
-+ struct bfq_group *bfqg; |
591 |
-+ |
592 |
-+ bfqg = bfqio_lookup_group(bgrp, bfqd); |
593 |
-+ if (bfqg != NULL) |
594 |
-+ return bfqg; |
595 |
-+ |
596 |
-+ bfqg = bfq_group_chain_alloc(bfqd, css); |
597 |
-+ if (bfqg != NULL) |
598 |
-+ bfq_group_chain_link(bfqd, css, bfqg); |
599 |
-+ else |
600 |
-+ bfqg = bfqd->root_group; |
601 |
-+ |
602 |
-+ return bfqg; |
603 |
-+} |
604 |
-+ |
605 |
-+/** |
606 |
-+ * bfq_bfqq_move - migrate @bfqq to @bfqg. |
607 |
-+ * @bfqd: queue descriptor. |
608 |
-+ * @bfqq: the queue to move. |
609 |
-+ * @entity: @bfqq's entity. |
610 |
-+ * @bfqg: the group to move to. |
611 |
-+ * |
612 |
-+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating |
613 |
-+ * it on the new one. Avoid putting the entity on the old group idle tree. |
614 |
-+ * |
615 |
-+ * Must be called under the queue lock; the cgroup owning @bfqg must |
616 |
-+ * not disappear (by now this just means that we are called under |
617 |
-+ * rcu_read_lock()). |
618 |
-+ */ |
619 |
-+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
620 |
-+ struct bfq_entity *entity, struct bfq_group *bfqg) |
621 |
-+{ |
622 |
-+ int busy, resume; |
623 |
-+ |
624 |
-+ busy = bfq_bfqq_busy(bfqq); |
625 |
-+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); |
626 |
-+ |
627 |
-+ BUG_ON(resume && !entity->on_st); |
628 |
-+ BUG_ON(busy && !resume && entity->on_st && |
629 |
-+ bfqq != bfqd->in_service_queue); |
630 |
-+ |
631 |
-+ if (busy) { |
632 |
-+ BUG_ON(atomic_read(&bfqq->ref) < 2); |
633 |
-+ |
634 |
-+ if (!resume) |
635 |
-+ bfq_del_bfqq_busy(bfqd, bfqq, 0); |
636 |
-+ else |
637 |
-+ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
638 |
-+ } else if (entity->on_st) |
639 |
-+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); |
640 |
-+ |
641 |
-+ /* |
642 |
-+ * Here we use a reference to bfqg. We don't need a refcounter |
643 |
-+ * as the cgroup reference will not be dropped, so that its |
644 |
-+ * destroy() callback will not be invoked. |
645 |
-+ */ |
646 |
-+ entity->parent = bfqg->my_entity; |
647 |
-+ entity->sched_data = &bfqg->sched_data; |
648 |
-+ |
649 |
-+ if (busy && resume) |
650 |
-+ bfq_activate_bfqq(bfqd, bfqq); |
651 |
-+ |
652 |
-+ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver) |
653 |
-+ bfq_schedule_dispatch(bfqd); |
654 |
-+} |
655 |
-+ |
656 |
-+/** |
657 |
-+ * __bfq_bic_change_cgroup - move @bic to @cgroup. |
658 |
-+ * @bfqd: the queue descriptor. |
659 |
-+ * @bic: the bic to move. |
660 |
-+ * @cgroup: the cgroup to move to. |
661 |
-+ * |
662 |
-+ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller |
663 |
-+ * has to make sure that the reference to cgroup is valid across the call. |
664 |
-+ * |
665 |
-+ * NOTE: an alternative approach might have been to store the current |
666 |
-+ * cgroup in bfqq and getting a reference to it, reducing the lookup |
667 |
-+ * time here, at the price of slightly more complex code. |
668 |
-+ */ |
669 |
-+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, |
670 |
-+ struct bfq_io_cq *bic, |
671 |
-+ struct cgroup_subsys_state *css) |
672 |
-+{ |
673 |
-+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); |
674 |
-+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); |
675 |
-+ struct bfq_entity *entity; |
676 |
-+ struct bfq_group *bfqg; |
677 |
-+ struct bfqio_cgroup *bgrp; |
678 |
-+ |
679 |
-+ bgrp = css_to_bfqio(css); |
680 |
-+ |
681 |
-+ bfqg = bfq_find_alloc_group(bfqd, css); |
682 |
-+ if (async_bfqq != NULL) { |
683 |
-+ entity = &async_bfqq->entity; |
684 |
-+ |
685 |
-+ if (entity->sched_data != &bfqg->sched_data) { |
686 |
-+ bic_set_bfqq(bic, NULL, 0); |
687 |
-+ bfq_log_bfqq(bfqd, async_bfqq, |
688 |
-+ "bic_change_group: %p %d", |
689 |
-+ async_bfqq, atomic_read(&async_bfqq->ref)); |
690 |
-+ bfq_put_queue(async_bfqq); |
691 |
-+ } |
692 |
-+ } |
693 |
-+ |
694 |
-+ if (sync_bfqq != NULL) { |
695 |
-+ entity = &sync_bfqq->entity; |
696 |
-+ if (entity->sched_data != &bfqg->sched_data) |
697 |
-+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); |
698 |
-+ } |
699 |
-+ |
700 |
-+ return bfqg; |
701 |
-+} |
702 |
-+ |
703 |
-+/** |
704 |
-+ * bfq_bic_change_cgroup - move @bic to @cgroup. |
705 |
-+ * @bic: the bic being migrated. |
706 |
-+ * @cgroup: the destination cgroup. |
707 |
-+ * |
708 |
-+ * When the task owning @bic is moved to @cgroup, @bic is immediately |
709 |
-+ * moved into its new parent group. |
710 |
-+ */ |
711 |
-+static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, |
712 |
-+ struct cgroup_subsys_state *css) |
713 |
-+{ |
714 |
-+ struct bfq_data *bfqd; |
715 |
-+ unsigned long uninitialized_var(flags); |
716 |
-+ |
717 |
-+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), |
718 |
-+ &flags); |
719 |
-+ if (bfqd != NULL) { |
720 |
-+ __bfq_bic_change_cgroup(bfqd, bic, css); |
721 |
-+ bfq_put_bfqd_unlock(bfqd, &flags); |
722 |
-+ } |
723 |
-+} |
724 |
-+ |
725 |
-+/** |
726 |
-+ * bfq_bic_update_cgroup - update the cgroup of @bic. |
727 |
-+ * @bic: the @bic to update. |
728 |
-+ * |
729 |
-+ * Make sure that @bic is enqueued in the cgroup of the current task. |
730 |
-+ * We need this in addition to moving bics during the cgroup attach |
731 |
-+ * phase because the task owning @bic could be at its first disk |
732 |
-+ * access or we may end up in the root cgroup as the result of a |
733 |
-+ * memory allocation failure and here we try to move to the right |
734 |
-+ * group. |
735 |
-+ * |
736 |
-+ * Must be called under the queue lock. It is safe to use the returned |
737 |
-+ * value even after the rcu_read_unlock() as the migration/destruction |
738 |
-+ * paths act under the queue lock too. IOW it is impossible to race with |
739 |
-+ * group migration/destruction and end up with an invalid group as: |
740 |
-+ * a) here cgroup has not yet been destroyed, nor its destroy callback |
741 |
-+ * has started execution, as current holds a reference to it, |
742 |
-+ * b) if it is destroyed after rcu_read_unlock() [after current is |
743 |
-+ * migrated to a different cgroup] its attach() callback will have |
744 |
-+ * taken care of remove all the references to the old cgroup data. |
745 |
-+ */ |
746 |
-+static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) |
747 |
-+{ |
748 |
-+ struct bfq_data *bfqd = bic_to_bfqd(bic); |
749 |
-+ struct bfq_group *bfqg; |
750 |
-+ struct cgroup_subsys_state *css; |
751 |
-+ |
752 |
-+ BUG_ON(bfqd == NULL); |
753 |
-+ |
754 |
-+ rcu_read_lock(); |
755 |
-+ css = task_css(current, bfqio_subsys_id); |
756 |
-+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css); |
757 |
-+ rcu_read_unlock(); |
758 |
-+ |
759 |
-+ return bfqg; |
760 |
-+} |
761 |
-+ |
762 |
-+/** |
763 |
-+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. |
764 |
-+ * @st: the service tree being flushed. |
765 |
-+ */ |
766 |
-+static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) |
767 |
-+{ |
768 |
-+ struct bfq_entity *entity = st->first_idle; |
769 |
-+ |
770 |
-+ for (; entity != NULL; entity = st->first_idle) |
771 |
-+ __bfq_deactivate_entity(entity, 0); |
772 |
-+} |
773 |
-+ |
774 |
-+/** |
775 |
-+ * bfq_reparent_leaf_entity - move leaf entity to the root_group. |
776 |
-+ * @bfqd: the device data structure with the root group. |
777 |
-+ * @entity: the entity to move. |
778 |
-+ */ |
779 |
-+static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, |
780 |
-+ struct bfq_entity *entity) |
781 |
-+{ |
782 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
783 |
-+ |
784 |
-+ BUG_ON(bfqq == NULL); |
785 |
-+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); |
786 |
-+ return; |
787 |
-+} |
788 |
-+ |
789 |
-+/** |
790 |
-+ * bfq_reparent_active_entities - move to the root group all active entities. |
791 |
-+ * @bfqd: the device data structure with the root group. |
792 |
-+ * @bfqg: the group to move from. |
793 |
-+ * @st: the service tree with the entities. |
794 |
-+ * |
795 |
-+ * Needs queue_lock to be taken and reference to be valid over the call. |
796 |
-+ */ |
797 |
-+static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, |
798 |
-+ struct bfq_group *bfqg, |
799 |
-+ struct bfq_service_tree *st) |
800 |
-+{ |
801 |
-+ struct rb_root *active = &st->active; |
802 |
-+ struct bfq_entity *entity = NULL; |
803 |
-+ |
804 |
-+ if (!RB_EMPTY_ROOT(&st->active)) |
805 |
-+ entity = bfq_entity_of(rb_first(active)); |
806 |
-+ |
807 |
-+ for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) |
808 |
-+ bfq_reparent_leaf_entity(bfqd, entity); |
809 |
-+ |
810 |
-+ if (bfqg->sched_data.active_entity != NULL) |
811 |
-+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); |
812 |
-+ |
813 |
-+ return; |
814 |
-+} |
815 |
-+ |
816 |
-+/** |
817 |
-+ * bfq_destroy_group - destroy @bfqg. |
818 |
-+ * @bgrp: the bfqio_cgroup containing @bfqg. |
819 |
-+ * @bfqg: the group being destroyed. |
820 |
-+ * |
821 |
-+ * Destroy @bfqg, making sure that it is not referenced from its parent. |
822 |
-+ */ |
823 |
-+static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) |
824 |
-+{ |
825 |
-+ struct bfq_data *bfqd; |
826 |
-+ struct bfq_service_tree *st; |
827 |
-+ struct bfq_entity *entity = bfqg->my_entity; |
828 |
-+ unsigned long uninitialized_var(flags); |
829 |
-+ int i; |
830 |
-+ |
831 |
-+ hlist_del(&bfqg->group_node); |
832 |
-+ |
833 |
-+ /* |
834 |
-+ * Empty all service_trees belonging to this group before deactivating |
835 |
-+ * the group itself. |
836 |
-+ */ |
837 |
-+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { |
838 |
-+ st = bfqg->sched_data.service_tree + i; |
839 |
-+ |
840 |
-+ /* |
841 |
-+ * The idle tree may still contain bfq_queues belonging |
842 |
-+ * to exited task because they never migrated to a different |
843 |
-+ * cgroup from the one being destroyed now. Noone else |
844 |
-+ * can access them so it's safe to act without any lock. |
845 |
-+ */ |
846 |
-+ bfq_flush_idle_tree(st); |
847 |
-+ |
848 |
-+ /* |
849 |
-+ * It may happen that some queues are still active |
850 |
-+ * (busy) upon group destruction (if the corresponding |
851 |
-+ * processes have been forced to terminate). We move |
852 |
-+ * all the leaf entities corresponding to these queues |
853 |
-+ * to the root_group. |
854 |
-+ * Also, it may happen that the group has an entity |
855 |
-+ * under service, which is disconnected from the active |
856 |
-+ * tree: it must be moved, too. |
857 |
-+ * There is no need to put the sync queues, as the |
858 |
-+ * scheduler has taken no reference. |
859 |
-+ */ |
860 |
-+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); |
861 |
-+ if (bfqd != NULL) { |
862 |
-+ bfq_reparent_active_entities(bfqd, bfqg, st); |
863 |
-+ bfq_put_bfqd_unlock(bfqd, &flags); |
864 |
-+ } |
865 |
-+ BUG_ON(!RB_EMPTY_ROOT(&st->active)); |
866 |
-+ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); |
867 |
-+ } |
868 |
-+ BUG_ON(bfqg->sched_data.next_active != NULL); |
869 |
-+ BUG_ON(bfqg->sched_data.active_entity != NULL); |
870 |
-+ |
871 |
-+ /* |
872 |
-+ * We may race with device destruction, take extra care when |
873 |
-+ * dereferencing bfqg->bfqd. |
874 |
-+ */ |
875 |
-+ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); |
876 |
-+ if (bfqd != NULL) { |
877 |
-+ hlist_del(&bfqg->bfqd_node); |
878 |
-+ __bfq_deactivate_entity(entity, 0); |
879 |
-+ bfq_put_async_queues(bfqd, bfqg); |
880 |
-+ bfq_put_bfqd_unlock(bfqd, &flags); |
881 |
-+ } |
882 |
-+ BUG_ON(entity->tree != NULL); |
883 |
-+ |
884 |
-+ /* |
885 |
-+ * No need to defer the kfree() to the end of the RCU grace |
886 |
-+ * period: we are called from the destroy() callback of our |
887 |
-+ * cgroup, so we can be sure that noone is a) still using |
888 |
-+ * this cgroup or b) doing lookups in it. |
889 |
-+ */ |
890 |
-+ kfree(bfqg); |
891 |
-+} |
892 |
-+ |
893 |
-+static void bfq_end_raising_async(struct bfq_data *bfqd) |
894 |
-+{ |
895 |
-+ struct hlist_node *tmp; |
896 |
-+ struct bfq_group *bfqg; |
897 |
-+ |
898 |
-+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) |
899 |
-+ bfq_end_raising_async_queues(bfqd, bfqg); |
900 |
-+ bfq_end_raising_async_queues(bfqd, bfqd->root_group); |
901 |
-+} |
902 |
-+ |
903 |
-+/** |
904 |
-+ * bfq_disconnect_groups - diconnect @bfqd from all its groups. |
905 |
-+ * @bfqd: the device descriptor being exited. |
906 |
-+ * |
907 |
-+ * When the device exits we just make sure that no lookup can return |
908 |
-+ * the now unused group structures. They will be deallocated on cgroup |
909 |
-+ * destruction. |
910 |
-+ */ |
911 |
-+static void bfq_disconnect_groups(struct bfq_data *bfqd) |
912 |
-+{ |
913 |
-+ struct hlist_node *tmp; |
914 |
-+ struct bfq_group *bfqg; |
915 |
-+ |
916 |
-+ bfq_log(bfqd, "disconnect_groups beginning"); |
917 |
-+ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) { |
918 |
-+ hlist_del(&bfqg->bfqd_node); |
919 |
-+ |
920 |
-+ __bfq_deactivate_entity(bfqg->my_entity, 0); |
921 |
-+ |
922 |
-+ /* |
923 |
-+ * Don't remove from the group hash, just set an |
924 |
-+ * invalid key. No lookups can race with the |
925 |
-+ * assignment as bfqd is being destroyed; this |
926 |
-+ * implies also that new elements cannot be added |
927 |
-+ * to the list. |
928 |
-+ */ |
929 |
-+ rcu_assign_pointer(bfqg->bfqd, NULL); |
930 |
-+ |
931 |
-+ bfq_log(bfqd, "disconnect_groups: put async for group %p", |
932 |
-+ bfqg); |
933 |
-+ bfq_put_async_queues(bfqd, bfqg); |
934 |
-+ } |
935 |
-+} |
936 |
-+ |
937 |
-+static inline void bfq_free_root_group(struct bfq_data *bfqd) |
938 |
-+{ |
939 |
-+ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; |
940 |
-+ struct bfq_group *bfqg = bfqd->root_group; |
941 |
-+ |
942 |
-+ bfq_put_async_queues(bfqd, bfqg); |
943 |
-+ |
944 |
-+ spin_lock_irq(&bgrp->lock); |
945 |
-+ hlist_del_rcu(&bfqg->group_node); |
946 |
-+ spin_unlock_irq(&bgrp->lock); |
947 |
-+ |
948 |
-+ /* |
949 |
-+ * No need to synchronize_rcu() here: since the device is gone |
950 |
-+ * there cannot be any read-side access to its root_group. |
951 |
-+ */ |
952 |
-+ kfree(bfqg); |
953 |
-+} |
954 |
-+ |
955 |
-+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) |
956 |
-+{ |
957 |
-+ struct bfq_group *bfqg; |
958 |
-+ struct bfqio_cgroup *bgrp; |
959 |
-+ int i; |
960 |
-+ |
961 |
-+ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); |
962 |
-+ if (bfqg == NULL) |
963 |
-+ return NULL; |
964 |
-+ |
965 |
-+ bfqg->entity.parent = NULL; |
966 |
-+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
967 |
-+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
968 |
-+ |
969 |
-+ bgrp = &bfqio_root_cgroup; |
970 |
-+ spin_lock_irq(&bgrp->lock); |
971 |
-+ rcu_assign_pointer(bfqg->bfqd, bfqd); |
972 |
-+ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); |
973 |
-+ spin_unlock_irq(&bgrp->lock); |
974 |
-+ |
975 |
-+ return bfqg; |
976 |
-+} |
977 |
-+ |
978 |
-+#define SHOW_FUNCTION(__VAR) \ |
979 |
-+static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \ |
980 |
-+ struct cftype *cftype) \ |
981 |
-+{ \ |
982 |
-+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ |
983 |
-+ u64 ret = -ENODEV; \ |
984 |
-+ \ |
985 |
-+ mutex_lock(&bfqio_mutex); \ |
986 |
-+ if (bfqio_is_removed(bgrp)) \ |
987 |
-+ goto out_unlock; \ |
988 |
-+ \ |
989 |
-+ spin_lock_irq(&bgrp->lock); \ |
990 |
-+ ret = bgrp->__VAR; \ |
991 |
-+ spin_unlock_irq(&bgrp->lock); \ |
992 |
-+ \ |
993 |
-+out_unlock: \ |
994 |
-+ mutex_unlock(&bfqio_mutex); \ |
995 |
-+ return ret; \ |
996 |
-+} |
997 |
-+ |
998 |
-+SHOW_FUNCTION(weight); |
999 |
-+SHOW_FUNCTION(ioprio); |
1000 |
-+SHOW_FUNCTION(ioprio_class); |
1001 |
-+#undef SHOW_FUNCTION |
1002 |
-+ |
1003 |
-+#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ |
1004 |
-+static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\ |
1005 |
-+ struct cftype *cftype, \ |
1006 |
-+ u64 val) \ |
1007 |
-+{ \ |
1008 |
-+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ |
1009 |
-+ struct bfq_group *bfqg; \ |
1010 |
-+ int ret = -EINVAL; \ |
1011 |
-+ \ |
1012 |
-+ if (val < (__MIN) || val > (__MAX)) \ |
1013 |
-+ return ret; \ |
1014 |
-+ \ |
1015 |
-+ ret = -ENODEV; \ |
1016 |
-+ mutex_lock(&bfqio_mutex); \ |
1017 |
-+ if (bfqio_is_removed(bgrp)) \ |
1018 |
-+ goto out_unlock; \ |
1019 |
-+ ret = 0; \ |
1020 |
-+ \ |
1021 |
-+ spin_lock_irq(&bgrp->lock); \ |
1022 |
-+ bgrp->__VAR = (unsigned short)val; \ |
1023 |
-+ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ |
1024 |
-+ /* \ |
1025 |
-+ * Setting the ioprio_changed flag of the entity \ |
1026 |
-+ * to 1 with new_##__VAR == ##__VAR would re-set \ |
1027 |
-+ * the value of the weight to its ioprio mapping. \ |
1028 |
-+ * Set the flag only if necessary. \ |
1029 |
-+ */ \ |
1030 |
-+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ |
1031 |
-+ bfqg->entity.new_##__VAR = (unsigned short)val; \ |
1032 |
-+ smp_wmb(); \ |
1033 |
-+ bfqg->entity.ioprio_changed = 1; \ |
1034 |
-+ } \ |
1035 |
-+ } \ |
1036 |
-+ spin_unlock_irq(&bgrp->lock); \ |
1037 |
-+ \ |
1038 |
-+out_unlock: \ |
1039 |
-+ mutex_unlock(&bfqio_mutex); \ |
1040 |
-+ return ret; \ |
1041 |
-+} |
1042 |
-+ |
1043 |
-+STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); |
1044 |
-+STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); |
1045 |
-+STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); |
1046 |
-+#undef STORE_FUNCTION |
1047 |
-+ |
1048 |
-+static struct cftype bfqio_files[] = { |
1049 |
-+ { |
1050 |
-+ .name = "weight", |
1051 |
-+ .read_u64 = bfqio_cgroup_weight_read, |
1052 |
-+ .write_u64 = bfqio_cgroup_weight_write, |
1053 |
-+ }, |
1054 |
-+ { |
1055 |
-+ .name = "ioprio", |
1056 |
-+ .read_u64 = bfqio_cgroup_ioprio_read, |
1057 |
-+ .write_u64 = bfqio_cgroup_ioprio_write, |
1058 |
-+ }, |
1059 |
-+ { |
1060 |
-+ .name = "ioprio_class", |
1061 |
-+ .read_u64 = bfqio_cgroup_ioprio_class_read, |
1062 |
-+ .write_u64 = bfqio_cgroup_ioprio_class_write, |
1063 |
-+ }, |
1064 |
-+ { }, /* terminate */ |
1065 |
-+}; |
1066 |
-+ |
1067 |
-+static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state |
1068 |
-+ *parent_css) |
1069 |
-+{ |
1070 |
-+ struct bfqio_cgroup *bgrp; |
1071 |
-+ |
1072 |
-+ if (parent_css != NULL) { |
1073 |
-+ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); |
1074 |
-+ if (bgrp == NULL) |
1075 |
-+ return ERR_PTR(-ENOMEM); |
1076 |
-+ } else |
1077 |
-+ bgrp = &bfqio_root_cgroup; |
1078 |
-+ |
1079 |
-+ spin_lock_init(&bgrp->lock); |
1080 |
-+ INIT_HLIST_HEAD(&bgrp->group_data); |
1081 |
-+ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; |
1082 |
-+ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; |
1083 |
-+ |
1084 |
-+ return &bgrp->css; |
1085 |
-+} |
1086 |
-+ |
1087 |
-+/* |
1088 |
-+ * We cannot support shared io contexts, as we have no means to support |
1089 |
-+ * two tasks with the same ioc in two different groups without major rework |
1090 |
-+ * of the main bic/bfqq data structures. By now we allow a task to change |
1091 |
-+ * its cgroup only if it's the only owner of its ioc; the drawback of this |
1092 |
-+ * behavior is that a group containing a task that forked using CLONE_IO |
1093 |
-+ * will not be destroyed until the tasks sharing the ioc die. |
1094 |
-+ */ |
1095 |
-+static int bfqio_can_attach(struct cgroup_subsys_state *css, |
1096 |
-+ struct cgroup_taskset *tset) |
1097 |
-+{ |
1098 |
-+ struct task_struct *task; |
1099 |
-+ struct io_context *ioc; |
1100 |
-+ int ret = 0; |
1101 |
-+ |
1102 |
-+ cgroup_taskset_for_each(task, css, tset) { |
1103 |
-+ /* |
1104 |
-+ * task_lock() is needed to avoid races with |
1105 |
-+ * exit_io_context() |
1106 |
-+ */ |
1107 |
-+ task_lock(task); |
1108 |
-+ ioc = task->io_context; |
1109 |
-+ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) |
1110 |
-+ /* |
1111 |
-+ * ioc == NULL means that the task is either too young |
1112 |
-+ * or exiting: if it has still no ioc the ioc can't be |
1113 |
-+ * shared, if the task is exiting the attach will fail |
1114 |
-+ * anyway, no matter what we return here. |
1115 |
-+ */ |
1116 |
-+ ret = -EINVAL; |
1117 |
-+ task_unlock(task); |
1118 |
-+ if (ret) |
1119 |
-+ break; |
1120 |
-+ } |
1121 |
-+ |
1122 |
-+ return ret; |
1123 |
-+} |
1124 |
-+ |
1125 |
-+static void bfqio_attach(struct cgroup_subsys_state *css, |
1126 |
-+ struct cgroup_taskset *tset) |
1127 |
-+{ |
1128 |
-+ struct task_struct *task; |
1129 |
-+ struct io_context *ioc; |
1130 |
-+ struct io_cq *icq; |
1131 |
-+ |
1132 |
-+ /* |
1133 |
-+ * IMPORTANT NOTE: The move of more than one process at a time to a |
1134 |
-+ * new group has not yet been tested. |
1135 |
-+ */ |
1136 |
-+ cgroup_taskset_for_each(task, css, tset) { |
1137 |
-+ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); |
1138 |
-+ if (ioc) { |
1139 |
-+ /* |
1140 |
-+ * Handle cgroup change here. |
1141 |
-+ */ |
1142 |
-+ rcu_read_lock(); |
1143 |
-+ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) |
1144 |
-+ if (!strncmp( |
1145 |
-+ icq->q->elevator->type->elevator_name, |
1146 |
-+ "bfq", ELV_NAME_MAX)) |
1147 |
-+ bfq_bic_change_cgroup(icq_to_bic(icq), |
1148 |
-+ css); |
1149 |
-+ rcu_read_unlock(); |
1150 |
-+ put_io_context(ioc); |
1151 |
-+ } |
1152 |
-+ } |
1153 |
-+} |
1154 |
-+ |
1155 |
-+static void bfqio_destroy(struct cgroup_subsys_state *css) |
1156 |
-+{ |
1157 |
-+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
1158 |
-+ struct hlist_node *tmp; |
1159 |
-+ struct bfq_group *bfqg; |
1160 |
-+ |
1161 |
-+ /* |
1162 |
-+ * Since we are destroying the cgroup, there are no more tasks |
1163 |
-+ * referencing it, and all the RCU grace periods that may have |
1164 |
-+ * referenced it are ended (as the destruction of the parent |
1165 |
-+ * cgroup is RCU-safe); bgrp->group_data will not be accessed by |
1166 |
-+ * anything else and we don't need any synchronization. |
1167 |
-+ */ |
1168 |
-+ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node) |
1169 |
-+ bfq_destroy_group(bgrp, bfqg); |
1170 |
-+ |
1171 |
-+ BUG_ON(!hlist_empty(&bgrp->group_data)); |
1172 |
-+ |
1173 |
-+ kfree(bgrp); |
1174 |
-+} |
1175 |
-+ |
1176 |
-+static int bfqio_css_online(struct cgroup_subsys_state *css) |
1177 |
-+{ |
1178 |
-+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
1179 |
-+ |
1180 |
-+ mutex_lock(&bfqio_mutex); |
1181 |
-+ bgrp->online = true; |
1182 |
-+ mutex_unlock(&bfqio_mutex); |
1183 |
-+ |
1184 |
-+ return 0; |
1185 |
-+} |
1186 |
-+ |
1187 |
-+static void bfqio_css_offline(struct cgroup_subsys_state *css) |
1188 |
-+{ |
1189 |
-+ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
1190 |
-+ |
1191 |
-+ mutex_lock(&bfqio_mutex); |
1192 |
-+ bgrp->online = false; |
1193 |
-+ mutex_unlock(&bfqio_mutex); |
1194 |
-+} |
1195 |
-+ |
1196 |
-+struct cgroup_subsys bfqio_subsys = { |
1197 |
-+ .name = "bfqio", |
1198 |
-+ .css_alloc = bfqio_create, |
1199 |
-+ .css_online = bfqio_css_online, |
1200 |
-+ .css_offline = bfqio_css_offline, |
1201 |
-+ .can_attach = bfqio_can_attach, |
1202 |
-+ .attach = bfqio_attach, |
1203 |
-+ .css_free = bfqio_destroy, |
1204 |
-+ .subsys_id = bfqio_subsys_id, |
1205 |
-+ .base_cftypes = bfqio_files, |
1206 |
-+}; |
1207 |
-+#else |
1208 |
-+static inline void bfq_init_entity(struct bfq_entity *entity, |
1209 |
-+ struct bfq_group *bfqg) |
1210 |
-+{ |
1211 |
-+ entity->weight = entity->new_weight; |
1212 |
-+ entity->orig_weight = entity->new_weight; |
1213 |
-+ entity->ioprio = entity->new_ioprio; |
1214 |
-+ entity->ioprio_class = entity->new_ioprio_class; |
1215 |
-+ entity->sched_data = &bfqg->sched_data; |
1216 |
-+} |
1217 |
-+ |
1218 |
-+static inline struct bfq_group * |
1219 |
-+bfq_bic_update_cgroup(struct bfq_io_cq *bic) |
1220 |
-+{ |
1221 |
-+ struct bfq_data *bfqd = bic_to_bfqd(bic); |
1222 |
-+ return bfqd->root_group; |
1223 |
-+} |
1224 |
-+ |
1225 |
-+static inline void bfq_bfqq_move(struct bfq_data *bfqd, |
1226 |
-+ struct bfq_queue *bfqq, |
1227 |
-+ struct bfq_entity *entity, |
1228 |
-+ struct bfq_group *bfqg) |
1229 |
-+{ |
1230 |
-+} |
1231 |
-+ |
1232 |
-+static void bfq_end_raising_async(struct bfq_data *bfqd) |
1233 |
-+{ |
1234 |
-+ bfq_end_raising_async_queues(bfqd, bfqd->root_group); |
1235 |
-+} |
1236 |
-+ |
1237 |
-+static inline void bfq_disconnect_groups(struct bfq_data *bfqd) |
1238 |
-+{ |
1239 |
-+ bfq_put_async_queues(bfqd, bfqd->root_group); |
1240 |
-+} |
1241 |
-+ |
1242 |
-+static inline void bfq_free_root_group(struct bfq_data *bfqd) |
1243 |
-+{ |
1244 |
-+ kfree(bfqd->root_group); |
1245 |
-+} |
1246 |
-+ |
1247 |
-+static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) |
1248 |
-+{ |
1249 |
-+ struct bfq_group *bfqg; |
1250 |
-+ int i; |
1251 |
-+ |
1252 |
-+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); |
1253 |
-+ if (bfqg == NULL) |
1254 |
-+ return NULL; |
1255 |
-+ |
1256 |
-+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
1257 |
-+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
1258 |
-+ |
1259 |
-+ return bfqg; |
1260 |
-+} |
1261 |
-+#endif |
1262 |
-diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c |
1263 |
-new file mode 100644 |
1264 |
-index 0000000..7f6b000 |
1265 |
---- /dev/null |
1266 |
-+++ b/block/bfq-ioc.c |
1267 |
-@@ -0,0 +1,36 @@ |
1268 |
-+/* |
1269 |
-+ * BFQ: I/O context handling. |
1270 |
-+ * |
1271 |
-+ * Based on ideas and code from CFQ: |
1272 |
-+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
1273 |
-+ * |
1274 |
-+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
1275 |
-+ * Paolo Valente <paolo.valente@×××××××.it> |
1276 |
-+ * |
1277 |
-+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
1278 |
-+ */ |
1279 |
-+ |
1280 |
-+/** |
1281 |
-+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. |
1282 |
-+ * @icq: the iocontext queue. |
1283 |
-+ */ |
1284 |
-+static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) |
1285 |
-+{ |
1286 |
-+ /* bic->icq is the first member, %NULL will convert to %NULL */ |
1287 |
-+ return container_of(icq, struct bfq_io_cq, icq); |
1288 |
-+} |
1289 |
-+ |
1290 |
-+/** |
1291 |
-+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. |
1292 |
-+ * @bfqd: the lookup key. |
1293 |
-+ * @ioc: the io_context of the process doing I/O. |
1294 |
-+ * |
1295 |
-+ * Queue lock must be held. |
1296 |
-+ */ |
1297 |
-+static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, |
1298 |
-+ struct io_context *ioc) |
1299 |
-+{ |
1300 |
-+ if (ioc) |
1301 |
-+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); |
1302 |
-+ return NULL; |
1303 |
-+} |
1304 |
-diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
1305 |
-new file mode 100644 |
1306 |
-index 0000000..7670400 |
1307 |
---- /dev/null |
1308 |
-+++ b/block/bfq-iosched.c |
1309 |
-@@ -0,0 +1,3268 @@ |
1310 |
-+/* |
1311 |
-+ * BFQ, or Budget Fair Queueing, disk scheduler. |
1312 |
-+ * |
1313 |
-+ * Based on ideas and code from CFQ: |
1314 |
-+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
1315 |
-+ * |
1316 |
-+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
1317 |
-+ * Paolo Valente <paolo.valente@×××××××.it> |
1318 |
-+ * |
1319 |
-+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
1320 |
-+ * |
1321 |
-+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. |
1322 |
-+ * |
1323 |
-+ * BFQ is a proportional share disk scheduling algorithm based on the |
1324 |
-+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in |
1325 |
-+ * number of sectors, to tasks instead of time slices. The disk is not granted |
1326 |
-+ * to the in-service task for a given time slice, but until it has exahusted |
1327 |
-+ * its assigned budget. This change from the time to the service domain allows |
1328 |
-+ * BFQ to distribute the disk bandwidth among tasks as desired, without any |
1329 |
-+ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an |
1330 |
-+ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to |
1331 |
-+ * their budgets (more precisely BFQ schedules queues associated to tasks). |
1332 |
-+ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to |
1333 |
-+ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low |
1334 |
-+ * latencies to interactive and soft real-time applications. |
1335 |
-+ * |
1336 |
-+ * BFQ is described in [1], where also a reference to the initial, more |
1337 |
-+ * theoretical paper on BFQ can be found. The interested reader can find in |
1338 |
-+ * the latter paper full details on the main algorithm as well as formulas of |
1339 |
-+ * the guarantees, plus formal proofs of all the properties. With respect to |
1340 |
-+ * the version of BFQ presented in these papers, this implementation adds a |
1341 |
-+ * few more heuristics, such as the one that guarantees a low latency to soft |
1342 |
-+ * real-time applications, and a hierarchical extension based on H-WF2Q+. |
1343 |
-+ * |
1344 |
-+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with |
1345 |
-+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) |
1346 |
-+ * complexity derives from the one introduced with EEVDF in [3]. |
1347 |
-+ * |
1348 |
-+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness |
1349 |
-+ * with the BFQ Disk I/O Scheduler'', |
1350 |
-+ * Proceedings of the 5th Annual International Systems and Storage |
1351 |
-+ * Conference (SYSTOR '12), June 2012. |
1352 |
-+ * |
1353 |
-+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf |
1354 |
-+ * |
1355 |
-+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing |
1356 |
-+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, |
1357 |
-+ * Oct 1997. |
1358 |
-+ * |
1359 |
-+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz |
1360 |
-+ * |
1361 |
-+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline |
1362 |
-+ * First: A Flexible and Accurate Mechanism for Proportional Share |
1363 |
-+ * Resource Allocation,'' technical report. |
1364 |
-+ * |
1365 |
-+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf |
1366 |
-+ */ |
1367 |
-+#include <linux/module.h> |
1368 |
-+#include <linux/slab.h> |
1369 |
-+#include <linux/blkdev.h> |
1370 |
-+#include <linux/cgroup.h> |
1371 |
-+#include <linux/elevator.h> |
1372 |
-+#include <linux/jiffies.h> |
1373 |
-+#include <linux/rbtree.h> |
1374 |
-+#include <linux/ioprio.h> |
1375 |
-+#include "bfq.h" |
1376 |
-+#include "blk.h" |
1377 |
-+ |
1378 |
-+/* Max number of dispatches in one round of service. */ |
1379 |
-+static const int bfq_quantum = 4; |
1380 |
-+ |
1381 |
-+/* Expiration time of sync (0) and async (1) requests, in jiffies. */ |
1382 |
-+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; |
1383 |
-+ |
1384 |
-+/* Maximum backwards seek, in KiB. */ |
1385 |
-+static const int bfq_back_max = 16 * 1024; |
1386 |
-+ |
1387 |
-+/* Penalty of a backwards seek, in number of sectors. */ |
1388 |
-+static const int bfq_back_penalty = 2; |
1389 |
-+ |
1390 |
-+/* Idling period duration, in jiffies. */ |
1391 |
-+static int bfq_slice_idle = HZ / 125; |
1392 |
-+ |
1393 |
-+/* Default maximum budget values, in sectors and number of requests. */ |
1394 |
-+static const int bfq_default_max_budget = 16 * 1024; |
1395 |
-+static const int bfq_max_budget_async_rq = 4; |
1396 |
-+ |
1397 |
-+/* |
1398 |
-+ * Async to sync throughput distribution is controlled as follows: |
1399 |
-+ * when an async request is served, the entity is charged the number |
1400 |
-+ * of sectors of the request, multiplied by the factor below |
1401 |
-+ */ |
1402 |
-+static const int bfq_async_charge_factor = 10; |
1403 |
-+ |
1404 |
-+/* Default timeout values, in jiffies, approximating CFQ defaults. */ |
1405 |
-+static const int bfq_timeout_sync = HZ / 8; |
1406 |
-+static int bfq_timeout_async = HZ / 25; |
1407 |
-+ |
1408 |
-+struct kmem_cache *bfq_pool; |
1409 |
-+ |
1410 |
-+/* Below this threshold (in ms), we consider thinktime immediate. */ |
1411 |
-+#define BFQ_MIN_TT 2 |
1412 |
-+ |
1413 |
-+/* hw_tag detection: parallel requests threshold and min samples needed. */ |
1414 |
-+#define BFQ_HW_QUEUE_THRESHOLD 4 |
1415 |
-+#define BFQ_HW_QUEUE_SAMPLES 32 |
1416 |
-+ |
1417 |
-+#define BFQQ_SEEK_THR (sector_t)(8 * 1024) |
1418 |
-+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) |
1419 |
-+ |
1420 |
-+/* Min samples used for peak rate estimation (for autotuning). */ |
1421 |
-+#define BFQ_PEAK_RATE_SAMPLES 32 |
1422 |
-+ |
1423 |
-+/* Shift used for peak rate fixed precision calculations. */ |
1424 |
-+#define BFQ_RATE_SHIFT 16 |
1425 |
-+ |
1426 |
-+/* |
1427 |
-+ * The duration of the weight raising for interactive applications is |
1428 |
-+ * computed automatically (as default behaviour), using the following |
1429 |
-+ * formula: duration = (R / r) * T, where r is the peak rate of the |
1430 |
-+ * disk, and R and T are two reference parameters. In particular, R is |
1431 |
-+ * the peak rate of a reference disk, and T is about the maximum time |
1432 |
-+ * for starting popular large applications on that disk, under BFQ and |
1433 |
-+ * while reading two files in parallel. Finally, BFQ uses two |
1434 |
-+ * different pairs (R, T) depending on whether the disk is rotational |
1435 |
-+ * or non-rotational. |
1436 |
-+ */ |
1437 |
-+#define T_rot (msecs_to_jiffies(5500)) |
1438 |
-+#define T_nonrot (msecs_to_jiffies(2000)) |
1439 |
-+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ |
1440 |
-+#define R_rot 17415 |
1441 |
-+#define R_nonrot 34791 |
1442 |
-+ |
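Side note (not part of the patch): the sketch below recomputes the duration = (R / r) * T rule in plain user-space C, mirroring the bfq_wrais_duration() helper further down in this hunk. HZ = 250 and the halved sample rate are assumptions chosen only to make the numbers concrete.

#include <stdio.h>

#define HZ 250                       /* assumed, only to turn msecs into jiffies */

#define T_ROT ((unsigned long long)(5500 * HZ / 1000))   /* ~5500 ms in jiffies */
#define R_ROT 17415ULL               /* reference rotational peak rate */

/* duration = (R / r) * T, computed as (R * T) / r like bfq_wrais_duration();
 * R and r carry the same fixed-point shift, so it cancels in the division. */
static unsigned long wrais_duration(unsigned long long peak_rate)
{
	return (unsigned long)(R_ROT * T_ROT / peak_rate);
}

int main(void)
{
	/* a disk matching the reference rate gets T_ROT jiffies of weight
	 * raising, a disk half as fast gets twice as much */
	printf("r = R_rot     -> %lu jiffies\n", wrais_duration(R_ROT));
	printf("r = R_rot / 2 -> %lu jiffies\n", wrais_duration(R_ROT / 2));
	return 0;
}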
1443 |
-+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ |
1444 |
-+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) |
1445 |
-+ |
1446 |
-+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) |
1447 |
-+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) |
1448 |
-+ |
1449 |
-+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); |
1450 |
-+ |
1451 |
-+#include "bfq-ioc.c" |
1452 |
-+#include "bfq-sched.c" |
1453 |
-+#include "bfq-cgroup.c" |
1454 |
-+ |
1455 |
-+#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ |
1456 |
-+ IOPRIO_CLASS_IDLE) |
1457 |
-+#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ |
1458 |
-+ IOPRIO_CLASS_RT) |
1459 |
-+ |
1460 |
-+#define bfq_sample_valid(samples) ((samples) > 80) |
1461 |
-+ |
1462 |
-+/* |
1463 |
-+ * We regard a request as SYNC, if either it's a read or has the SYNC bit |
1464 |
-+ * set (in which case it could also be a direct WRITE). |
1465 |
-+ */ |
1466 |
-+static inline int bfq_bio_sync(struct bio *bio) |
1467 |
-+{ |
1468 |
-+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) |
1469 |
-+ return 1; |
1470 |
-+ |
1471 |
-+ return 0; |
1472 |
-+} |
1473 |
-+ |
1474 |
-+/* |
1475 |
-+ * Scheduler run of queue, if there are requests pending and no one in the |
1476 |
-+ * driver that will restart queueing. |
1477 |
-+ */ |
1478 |
-+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) |
1479 |
-+{ |
1480 |
-+ if (bfqd->queued != 0) { |
1481 |
-+ bfq_log(bfqd, "schedule dispatch"); |
1482 |
-+ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); |
1483 |
-+ } |
1484 |
-+} |
1485 |
-+ |
1486 |
-+/* |
1487 |
-+ * Lifted from AS - choose which of rq1 and rq2 is best served now. |
1488 |
-+ * We choose the request that is closest to the head right now. Distance |
1489 |
-+ * behind the head is penalized and only allowed to a certain extent. |
1490 |
-+ */ |
1491 |
-+static struct request *bfq_choose_req(struct bfq_data *bfqd, |
1492 |
-+ struct request *rq1, |
1493 |
-+ struct request *rq2, |
1494 |
-+ sector_t last) |
1495 |
-+{ |
1496 |
-+ sector_t s1, s2, d1 = 0, d2 = 0; |
1497 |
-+ unsigned long back_max; |
1498 |
-+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ |
1499 |
-+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ |
1500 |
-+ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ |
1501 |
-+ |
1502 |
-+ if (rq1 == NULL || rq1 == rq2) |
1503 |
-+ return rq2; |
1504 |
-+ if (rq2 == NULL) |
1505 |
-+ return rq1; |
1506 |
-+ |
1507 |
-+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) |
1508 |
-+ return rq1; |
1509 |
-+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) |
1510 |
-+ return rq2; |
1511 |
-+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) |
1512 |
-+ return rq1; |
1513 |
-+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) |
1514 |
-+ return rq2; |
1515 |
-+ |
1516 |
-+ s1 = blk_rq_pos(rq1); |
1517 |
-+ s2 = blk_rq_pos(rq2); |
1518 |
-+ |
1519 |
-+ /* |
1520 |
-+ * By definition, 1KiB is 2 sectors. |
1521 |
-+ */ |
1522 |
-+ back_max = bfqd->bfq_back_max * 2; |
1523 |
-+ |
1524 |
-+ /* |
1525 |
-+ * Strict one way elevator _except_ in the case where we allow |
1526 |
-+ * short backward seeks which are biased as twice the cost of a |
1527 |
-+ * similar forward seek. |
1528 |
-+ */ |
1529 |
-+ if (s1 >= last) |
1530 |
-+ d1 = s1 - last; |
1531 |
-+ else if (s1 + back_max >= last) |
1532 |
-+ d1 = (last - s1) * bfqd->bfq_back_penalty; |
1533 |
-+ else |
1534 |
-+ wrap |= BFQ_RQ1_WRAP; |
1535 |
-+ |
1536 |
-+ if (s2 >= last) |
1537 |
-+ d2 = s2 - last; |
1538 |
-+ else if (s2 + back_max >= last) |
1539 |
-+ d2 = (last - s2) * bfqd->bfq_back_penalty; |
1540 |
-+ else |
1541 |
-+ wrap |= BFQ_RQ2_WRAP; |
1542 |
-+ |
1543 |
-+ /* Found required data */ |
1544 |
-+ |
1545 |
-+ /* |
1546 |
-+ * By doing switch() on the bit mask "wrap" we avoid having to |
1547 |
-+ * check two variables for all permutations: --> faster! |
1548 |
-+ */ |
1549 |
-+ switch (wrap) { |
1550 |
-+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ |
1551 |
-+ if (d1 < d2) |
1552 |
-+ return rq1; |
1553 |
-+ else if (d2 < d1) |
1554 |
-+ return rq2; |
1555 |
-+ else { |
1556 |
-+ if (s1 >= s2) |
1557 |
-+ return rq1; |
1558 |
-+ else |
1559 |
-+ return rq2; |
1560 |
-+ } |
1561 |
-+ |
1562 |
-+ case BFQ_RQ2_WRAP: |
1563 |
-+ return rq1; |
1564 |
-+ case BFQ_RQ1_WRAP: |
1565 |
-+ return rq2; |
1566 |
-+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ |
1567 |
-+ default: |
1568 |
-+ /* |
1569 |
-+ * Since both rqs are wrapped, |
1570 |
-+ * start with the one that's further behind head |
1571 |
-+ * (--> only *one* back seek required), |
1572 |
-+ * since back seek takes more time than forward. |
1573 |
-+ */ |
1574 |
-+ if (s1 <= s2) |
1575 |
-+ return rq1; |
1576 |
-+ else |
1577 |
-+ return rq2; |
1578 |
-+ } |
1579 |
-+} |
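For readers tracing bfq_choose_req(), here is a minimal user-space replica of just its distance rule; the sync/META preferences and the exact tie-breaking of the real function are left out, and the sample positions plus the default back_max/back_penalty values are only illustrative.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Forward distance is used as-is, a short backward seek (within back_max)
 * costs back_penalty times its distance, anything farther behind "wraps". */
static int choose_req(sector_t s1, sector_t s2, sector_t last,
		      sector_t back_max, unsigned int back_penalty)
{
	sector_t d1 = 0, d2 = 0;
	int wrap1 = 0, wrap2 = 0;

	if (s1 >= last)
		d1 = s1 - last;
	else if (s1 + back_max >= last)
		d1 = (last - s1) * back_penalty;
	else
		wrap1 = 1;

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2 + back_max >= last)
		d2 = (last - s2) * back_penalty;
	else
		wrap2 = 1;

	if (!wrap1 && !wrap2)
		return d1 <= d2 ? 1 : 2;	/* shortest effective distance wins */
	if (wrap1 && wrap2)
		return s1 <= s2 ? 1 : 2;	/* both behind: take the farthest back */
	return wrap1 ? 2 : 1;			/* prefer the request in front */
}

int main(void)
{
	/* default bfq_back_max (16*1024 KiB) is 32768 sectors, penalty 2:
	 * the forward request at 1000 beats the backward one at 900 */
	printf("chosen: rq%d\n", choose_req(1000, 900, 950, 32768, 2));
	return 0;
}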
1580 |
-+ |
1581 |
-+static struct bfq_queue * |
1582 |
-+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, |
1583 |
-+ sector_t sector, struct rb_node **ret_parent, |
1584 |
-+ struct rb_node ***rb_link) |
1585 |
-+{ |
1586 |
-+ struct rb_node **p, *parent; |
1587 |
-+ struct bfq_queue *bfqq = NULL; |
1588 |
-+ |
1589 |
-+ parent = NULL; |
1590 |
-+ p = &root->rb_node; |
1591 |
-+ while (*p) { |
1592 |
-+ struct rb_node **n; |
1593 |
-+ |
1594 |
-+ parent = *p; |
1595 |
-+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
1596 |
-+ |
1597 |
-+ /* |
1598 |
-+ * Sort strictly based on sector. Smallest to the left, |
1599 |
-+ * largest to the right. |
1600 |
-+ */ |
1601 |
-+ if (sector > blk_rq_pos(bfqq->next_rq)) |
1602 |
-+ n = &(*p)->rb_right; |
1603 |
-+ else if (sector < blk_rq_pos(bfqq->next_rq)) |
1604 |
-+ n = &(*p)->rb_left; |
1605 |
-+ else |
1606 |
-+ break; |
1607 |
-+ p = n; |
1608 |
-+ bfqq = NULL; |
1609 |
-+ } |
1610 |
-+ |
1611 |
-+ *ret_parent = parent; |
1612 |
-+ if (rb_link) |
1613 |
-+ *rb_link = p; |
1614 |
-+ |
1615 |
-+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", |
1616 |
-+ (long long unsigned)sector, |
1617 |
-+ bfqq != NULL ? bfqq->pid : 0); |
1618 |
-+ |
1619 |
-+ return bfqq; |
1620 |
-+} |
1621 |
-+ |
1622 |
-+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
1623 |
-+{ |
1624 |
-+ struct rb_node **p, *parent; |
1625 |
-+ struct bfq_queue *__bfqq; |
1626 |
-+ |
1627 |
-+ if (bfqq->pos_root != NULL) { |
1628 |
-+ rb_erase(&bfqq->pos_node, bfqq->pos_root); |
1629 |
-+ bfqq->pos_root = NULL; |
1630 |
-+ } |
1631 |
-+ |
1632 |
-+ if (bfq_class_idle(bfqq)) |
1633 |
-+ return; |
1634 |
-+ if (!bfqq->next_rq) |
1635 |
-+ return; |
1636 |
-+ |
1637 |
-+ bfqq->pos_root = &bfqd->rq_pos_tree; |
1638 |
-+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, |
1639 |
-+ blk_rq_pos(bfqq->next_rq), &parent, &p); |
1640 |
-+ if (__bfqq == NULL) { |
1641 |
-+ rb_link_node(&bfqq->pos_node, parent, p); |
1642 |
-+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); |
1643 |
-+ } else |
1644 |
-+ bfqq->pos_root = NULL; |
1645 |
-+} |
1646 |
-+ |
1647 |
-+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, |
1648 |
-+ struct bfq_queue *bfqq, |
1649 |
-+ struct request *last) |
1650 |
-+{ |
1651 |
-+ struct rb_node *rbnext = rb_next(&last->rb_node); |
1652 |
-+ struct rb_node *rbprev = rb_prev(&last->rb_node); |
1653 |
-+ struct request *next = NULL, *prev = NULL; |
1654 |
-+ |
1655 |
-+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); |
1656 |
-+ |
1657 |
-+ if (rbprev != NULL) |
1658 |
-+ prev = rb_entry_rq(rbprev); |
1659 |
-+ |
1660 |
-+ if (rbnext != NULL) |
1661 |
-+ next = rb_entry_rq(rbnext); |
1662 |
-+ else { |
1663 |
-+ rbnext = rb_first(&bfqq->sort_list); |
1664 |
-+ if (rbnext && rbnext != &last->rb_node) |
1665 |
-+ next = rb_entry_rq(rbnext); |
1666 |
-+ } |
1667 |
-+ |
1668 |
-+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); |
1669 |
-+} |
1670 |
-+ |
1671 |
-+static void bfq_del_rq_rb(struct request *rq) |
1672 |
-+{ |
1673 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
1674 |
-+ struct bfq_data *bfqd = bfqq->bfqd; |
1675 |
-+ const int sync = rq_is_sync(rq); |
1676 |
-+ |
1677 |
-+ BUG_ON(bfqq->queued[sync] == 0); |
1678 |
-+ bfqq->queued[sync]--; |
1679 |
-+ bfqd->queued--; |
1680 |
-+ |
1681 |
-+ elv_rb_del(&bfqq->sort_list, rq); |
1682 |
-+ |
1683 |
-+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
1684 |
-+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) |
1685 |
-+ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
1686 |
-+ /* |
1687 |
-+ * Remove queue from request-position tree as it is empty. |
1688 |
-+ */ |
1689 |
-+ if (bfqq->pos_root != NULL) { |
1690 |
-+ rb_erase(&bfqq->pos_node, bfqq->pos_root); |
1691 |
-+ bfqq->pos_root = NULL; |
1692 |
-+ } |
1693 |
-+ } |
1694 |
-+} |
1695 |
-+ |
1696 |
-+/* see the definition of bfq_async_charge_factor for details */ |
1697 |
-+static inline unsigned long bfq_serv_to_charge(struct request *rq, |
1698 |
-+ struct bfq_queue *bfqq) |
1699 |
-+{ |
1700 |
-+ return blk_rq_sectors(rq) * |
1701 |
-+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * |
1702 |
-+ bfq_async_charge_factor)); |
1703 |
-+} |
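A small user-space sketch of the charging rule above; the constants passed in main() are the defaults from this patch, used here only for illustration.

#include <stdio.h>

/* bfq_serv_to_charge() in isolation: sync queues and weight-raised queues
 * are charged the actual request size, plain async queues are charged
 * sectors * (1 + factor). */
static unsigned long serv_to_charge(unsigned long sectors, int sync,
				    unsigned long raising_coeff,
				    unsigned long async_charge_factor)
{
	return sectors * (1 + (!sync) * (raising_coeff == 1) * async_charge_factor);
}

int main(void)
{
	/* with the default factor of 10, an async 8-sector request is
	 * charged as if it were 88 sectors long */
	printf("sync : %lu\n", serv_to_charge(8, 1, 1, 10));
	printf("async: %lu\n", serv_to_charge(8, 0, 1, 10));
	return 0;
}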
1704 |
-+ |
1705 |
-+/** |
1706 |
-+ * bfq_updated_next_req - update the queue after a new next_rq selection. |
1707 |
-+ * @bfqd: the device data the queue belongs to. |
1708 |
-+ * @bfqq: the queue to update. |
1709 |
-+ * |
1710 |
-+ * If the first request of a queue changes we make sure that the queue |
1711 |
-+ * has enough budget to serve at least its first request (if the |
1712 |
-+ * request has grown). We do this because if the queue has not enough |
1713 |
-+ * budget for its first request, it has to go through two dispatch |
1714 |
-+ * rounds to actually get it dispatched. |
1715 |
-+ */ |
1716 |
-+static void bfq_updated_next_req(struct bfq_data *bfqd, |
1717 |
-+ struct bfq_queue *bfqq) |
1718 |
-+{ |
1719 |
-+ struct bfq_entity *entity = &bfqq->entity; |
1720 |
-+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
1721 |
-+ struct request *next_rq = bfqq->next_rq; |
1722 |
-+ unsigned long new_budget; |
1723 |
-+ |
1724 |
-+ if (next_rq == NULL) |
1725 |
-+ return; |
1726 |
-+ |
1727 |
-+ if (bfqq == bfqd->in_service_queue) |
1728 |
-+ /* |
1729 |
-+ * In order not to break guarantees, budgets cannot be |
1730 |
-+ * changed after an entity has been selected. |
1731 |
-+ */ |
1732 |
-+ return; |
1733 |
-+ |
1734 |
-+ BUG_ON(entity->tree != &st->active); |
1735 |
-+ BUG_ON(entity == entity->sched_data->active_entity); |
1736 |
-+ |
1737 |
-+ new_budget = max_t(unsigned long, bfqq->max_budget, |
1738 |
-+ bfq_serv_to_charge(next_rq, bfqq)); |
1739 |
-+ entity->budget = new_budget; |
1740 |
-+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); |
1741 |
-+ bfq_activate_bfqq(bfqd, bfqq); |
1742 |
-+} |
1743 |
-+ |
1744 |
-+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) |
1745 |
-+{ |
1746 |
-+ u64 dur; |
1747 |
-+ |
1748 |
-+ if (bfqd->bfq_raising_max_time > 0) |
1749 |
-+ return bfqd->bfq_raising_max_time; |
1750 |
-+ |
1751 |
-+ dur = bfqd->RT_prod; |
1752 |
-+ do_div(dur, bfqd->peak_rate); |
1753 |
-+ |
1754 |
-+ return dur; |
1755 |
-+} |
1756 |
-+ |
1757 |
-+static void bfq_add_rq_rb(struct request *rq) |
1758 |
-+{ |
1759 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
1760 |
-+ struct bfq_entity *entity = &bfqq->entity; |
1761 |
-+ struct bfq_data *bfqd = bfqq->bfqd; |
1762 |
-+ struct request *next_rq, *prev; |
1763 |
-+ unsigned long old_raising_coeff = bfqq->raising_coeff; |
1764 |
-+ int idle_for_long_time = 0; |
1765 |
-+ |
1766 |
-+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); |
1767 |
-+ bfqq->queued[rq_is_sync(rq)]++; |
1768 |
-+ bfqd->queued++; |
1769 |
-+ |
1770 |
-+ elv_rb_add(&bfqq->sort_list, rq); |
1771 |
-+ |
1772 |
-+ /* |
1773 |
-+ * Check if this request is a better next-serve candidate. |
1774 |
-+ */ |
1775 |
-+ prev = bfqq->next_rq; |
1776 |
-+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); |
1777 |
-+ BUG_ON(next_rq == NULL); |
1778 |
-+ bfqq->next_rq = next_rq; |
1779 |
-+ |
1780 |
-+ /* |
1781 |
-+ * Adjust priority tree position, if next_rq changes. |
1782 |
-+ */ |
1783 |
-+ if (prev != bfqq->next_rq) |
1784 |
-+ bfq_rq_pos_tree_add(bfqd, bfqq); |
1785 |
-+ |
1786 |
-+ if (!bfq_bfqq_busy(bfqq)) { |
1787 |
-+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && |
1788 |
-+ time_is_before_jiffies(bfqq->soft_rt_next_start); |
1789 |
-+ idle_for_long_time = time_is_before_jiffies( |
1790 |
-+ bfqq->budget_timeout + |
1791 |
-+ bfqd->bfq_raising_min_idle_time); |
1792 |
-+ entity->budget = max_t(unsigned long, bfqq->max_budget, |
1793 |
-+ bfq_serv_to_charge(next_rq, bfqq)); |
1794 |
-+ |
1795 |
-+ if (!bfqd->low_latency) |
1796 |
-+ goto add_bfqq_busy; |
1797 |
-+ |
1798 |
-+ /* |
1799 |
-+ * If the queue is not being boosted and has been idle |
1800 |
-+ * for enough time, start a weight-raising period |
1801 |
-+ */ |
1802 |
-+ if (old_raising_coeff == 1 && |
1803 |
-+ (idle_for_long_time || soft_rt)) { |
1804 |
-+ bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
1805 |
-+ if (idle_for_long_time) |
1806 |
-+ bfqq->raising_cur_max_time = |
1807 |
-+ bfq_wrais_duration(bfqd); |
1808 |
-+ else |
1809 |
-+ bfqq->raising_cur_max_time = |
1810 |
-+ bfqd->bfq_raising_rt_max_time; |
1811 |
-+ bfq_log_bfqq(bfqd, bfqq, |
1812 |
-+ "wrais starting at %llu msec," |
1813 |
-+ "rais_max_time %u", |
1814 |
-+ bfqq->last_rais_start_finish, |
1815 |
-+ jiffies_to_msecs(bfqq-> |
1816 |
-+ raising_cur_max_time)); |
1817 |
-+ } else if (old_raising_coeff > 1) { |
1818 |
-+ if (idle_for_long_time) |
1819 |
-+ bfqq->raising_cur_max_time = |
1820 |
-+ bfq_wrais_duration(bfqd); |
1821 |
-+ else if (bfqq->raising_cur_max_time == |
1822 |
-+ bfqd->bfq_raising_rt_max_time && |
1823 |
-+ !soft_rt) { |
1824 |
-+ bfqq->raising_coeff = 1; |
1825 |
-+ bfq_log_bfqq(bfqd, bfqq, |
1826 |
-+ "wrais ending at %llu msec," |
1827 |
-+ "rais_max_time %u", |
1828 |
-+ bfqq->last_rais_start_finish, |
1829 |
-+ jiffies_to_msecs(bfqq-> |
1830 |
-+ raising_cur_max_time)); |
1831 |
-+ } else if ((bfqq->last_rais_start_finish + |
1832 |
-+ bfqq->raising_cur_max_time < |
1833 |
-+ jiffies + bfqd->bfq_raising_rt_max_time) && |
1834 |
-+ soft_rt) { |
1835 |
-+ /* |
1836 |
-+ * |
1837 |
-+ * The remaining weight-raising time is lower |
1838 |
-+ * than bfqd->bfq_raising_rt_max_time, which |
1839 |
-+ * means that the application is enjoying |
1840 |
-+ * weight raising either because deemed soft rt |
1841 |
-+ * in the near past, or because deemed |
1842 |
-+ * interactive long ago. In both cases, |
1843 |
-+ * resetting now the current remaining weight- |
1844 |
-+ * raising time for the application to the |
1845 |
-+ * weight-raising duration for soft rt |
1846 |
-+ * applications would not cause any latency |
1847 |
-+ * increase for the application (as the new |
1848 |
-+ * duration would be higher than the remaining |
1849 |
-+ * time). |
1850 |
-+ * |
1851 |
-+ * In addition, the application is now meeting |
1852 |
-+ * the requirements for being deemed soft rt. |
1853 |
-+ * In the end we can correctly and safely |
1854 |
-+ * (re)charge the weight-raising duration for |
1855 |
-+ * the application with the weight-raising |
1856 |
-+ * duration for soft rt applications. |
1857 |
-+ * |
1858 |
-+ * In particular, doing this recharge now, i.e., |
1859 |
-+ * before the weight-raising period for the |
1860 |
-+ * application finishes, reduces the probability |
1861 |
-+ * of the following negative scenario: |
1862 |
-+ * 1) the weight of a soft rt application is |
1863 |
-+ * raised at startup (as for any newly |
1864 |
-+ * created application), |
1865 |
-+ * 2) since the application is not interactive, |
1866 |
-+ * at a certain time weight-raising is |
1867 |
-+ * stopped for the application, |
1868 |
-+ * 3) at that time the application happens to |
1869 |
-+ * still have pending requests, and hence |
1870 |
-+ * is destined to not have a chance to be |
1871 |
-+ * deemed soft rt before these requests are |
1872 |
-+ * completed (see the comments to the |
1873 |
-+ * function bfq_bfqq_softrt_next_start() |
1874 |
-+ * for details on soft rt detection), |
1875 |
-+ * 4) these pending requests experience a high |
1876 |
-+ * latency because the application is not |
1877 |
-+ * weight-raised while they are pending. |
1878 |
-+ */ |
1879 |
-+ bfqq->last_rais_start_finish = jiffies; |
1880 |
-+ bfqq->raising_cur_max_time = |
1881 |
-+ bfqd->bfq_raising_rt_max_time; |
1882 |
-+ } |
1883 |
-+ } |
1884 |
-+ if (old_raising_coeff != bfqq->raising_coeff) |
1885 |
-+ entity->ioprio_changed = 1; |
1886 |
-+add_bfqq_busy: |
1887 |
-+ bfqq->last_idle_bklogged = jiffies; |
1888 |
-+ bfqq->service_from_backlogged = 0; |
1889 |
-+ bfq_clear_bfqq_softrt_update(bfqq); |
1890 |
-+ bfq_add_bfqq_busy(bfqd, bfqq); |
1891 |
-+ } else { |
1892 |
-+ if (bfqd->low_latency && old_raising_coeff == 1 && |
1893 |
-+ !rq_is_sync(rq) && |
1894 |
-+ bfqq->last_rais_start_finish + |
1895 |
-+ time_is_before_jiffies( |
1896 |
-+ bfqd->bfq_raising_min_inter_arr_async)) { |
1897 |
-+ bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
1898 |
-+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); |
1899 |
-+ |
1900 |
-+ bfqd->raised_busy_queues++; |
1901 |
-+ entity->ioprio_changed = 1; |
1902 |
-+ bfq_log_bfqq(bfqd, bfqq, |
1903 |
-+ "non-idle wrais starting at %llu msec," |
1904 |
-+ "rais_max_time %u", |
1905 |
-+ bfqq->last_rais_start_finish, |
1906 |
-+ jiffies_to_msecs(bfqq-> |
1907 |
-+ raising_cur_max_time)); |
1908 |
-+ } |
1909 |
-+ bfq_updated_next_req(bfqd, bfqq); |
1910 |
-+ } |
1911 |
-+ |
1912 |
-+ if (bfqd->low_latency && |
1913 |
-+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || |
1914 |
-+ idle_for_long_time)) |
1915 |
-+ bfqq->last_rais_start_finish = jiffies; |
1916 |
-+} |
1917 |
-+ |
1918 |
-+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) |
1919 |
-+{ |
1920 |
-+ elv_rb_del(&bfqq->sort_list, rq); |
1921 |
-+ bfqq->queued[rq_is_sync(rq)]--; |
1922 |
-+ bfqq->bfqd->queued--; |
1923 |
-+ bfq_add_rq_rb(rq); |
1924 |
-+} |
1925 |
-+ |
1926 |
-+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, |
1927 |
-+ struct bio *bio) |
1928 |
-+{ |
1929 |
-+ struct task_struct *tsk = current; |
1930 |
-+ struct bfq_io_cq *bic; |
1931 |
-+ struct bfq_queue *bfqq; |
1932 |
-+ |
1933 |
-+ bic = bfq_bic_lookup(bfqd, tsk->io_context); |
1934 |
-+ if (bic == NULL) |
1935 |
-+ return NULL; |
1936 |
-+ |
1937 |
-+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
1938 |
-+ if (bfqq != NULL) |
1939 |
-+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); |
1940 |
-+ |
1941 |
-+ return NULL; |
1942 |
-+} |
1943 |
-+ |
1944 |
-+static void bfq_activate_request(struct request_queue *q, struct request *rq) |
1945 |
-+{ |
1946 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
1947 |
-+ |
1948 |
-+ bfqd->rq_in_driver++; |
1949 |
-+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); |
1950 |
-+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", |
1951 |
-+ (long long unsigned)bfqd->last_position); |
1952 |
-+} |
1953 |
-+ |
1954 |
-+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) |
1955 |
-+{ |
1956 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
1957 |
-+ |
1958 |
-+ WARN_ON(bfqd->rq_in_driver == 0); |
1959 |
-+ bfqd->rq_in_driver--; |
1960 |
-+} |
1961 |
-+ |
1962 |
-+static void bfq_remove_request(struct request *rq) |
1963 |
-+{ |
1964 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
1965 |
-+ struct bfq_data *bfqd = bfqq->bfqd; |
1966 |
-+ |
1967 |
-+ if (bfqq->next_rq == rq) { |
1968 |
-+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); |
1969 |
-+ bfq_updated_next_req(bfqd, bfqq); |
1970 |
-+ } |
1971 |
-+ |
1972 |
-+ list_del_init(&rq->queuelist); |
1973 |
-+ bfq_del_rq_rb(rq); |
1974 |
-+ |
1975 |
-+ if (rq->cmd_flags & REQ_META) { |
1976 |
-+ WARN_ON(bfqq->meta_pending == 0); |
1977 |
-+ bfqq->meta_pending--; |
1978 |
-+ } |
1979 |
-+} |
1980 |
-+ |
1981 |
-+static int bfq_merge(struct request_queue *q, struct request **req, |
1982 |
-+ struct bio *bio) |
1983 |
-+{ |
1984 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
1985 |
-+ struct request *__rq; |
1986 |
-+ |
1987 |
-+ __rq = bfq_find_rq_fmerge(bfqd, bio); |
1988 |
-+ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { |
1989 |
-+ *req = __rq; |
1990 |
-+ return ELEVATOR_FRONT_MERGE; |
1991 |
-+ } |
1992 |
-+ |
1993 |
-+ return ELEVATOR_NO_MERGE; |
1994 |
-+} |
1995 |
-+ |
1996 |
-+static void bfq_merged_request(struct request_queue *q, struct request *req, |
1997 |
-+ int type) |
1998 |
-+{ |
1999 |
-+ if (type == ELEVATOR_FRONT_MERGE) { |
2000 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(req); |
2001 |
-+ |
2002 |
-+ bfq_reposition_rq_rb(bfqq, req); |
2003 |
-+ } |
2004 |
-+} |
2005 |
-+ |
2006 |
-+static void bfq_merged_requests(struct request_queue *q, struct request *rq, |
2007 |
-+ struct request *next) |
2008 |
-+{ |
2009 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
2010 |
-+ |
2011 |
-+ /* |
2012 |
-+ * Reposition in fifo if next is older than rq. |
2013 |
-+ */ |
2014 |
-+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && |
2015 |
-+ time_before(rq_fifo_time(next), rq_fifo_time(rq))) { |
2016 |
-+ list_move(&rq->queuelist, &next->queuelist); |
2017 |
-+ rq_set_fifo_time(rq, rq_fifo_time(next)); |
2018 |
-+ } |
2019 |
-+ |
2020 |
-+ if (bfqq->next_rq == next) |
2021 |
-+ bfqq->next_rq = rq; |
2022 |
-+ |
2023 |
-+ bfq_remove_request(next); |
2024 |
-+} |
2025 |
-+ |
2026 |
-+/* Must be called with bfqq != NULL */ |
2027 |
-+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq) |
2028 |
-+{ |
2029 |
-+ BUG_ON(bfqq == NULL); |
2030 |
-+ if (bfq_bfqq_busy(bfqq)) |
2031 |
-+ bfqq->bfqd->raised_busy_queues--; |
2032 |
-+ bfqq->raising_coeff = 1; |
2033 |
-+ bfqq->raising_cur_max_time = 0; |
2034 |
-+ /* Trigger a weight change on the next activation of the queue */ |
2035 |
-+ bfqq->entity.ioprio_changed = 1; |
2036 |
-+} |
2037 |
-+ |
2038 |
-+static void bfq_end_raising_async_queues(struct bfq_data *bfqd, |
2039 |
-+ struct bfq_group *bfqg) |
2040 |
-+{ |
2041 |
-+ int i, j; |
2042 |
-+ |
2043 |
-+ for (i = 0; i < 2; i++) |
2044 |
-+ for (j = 0; j < IOPRIO_BE_NR; j++) |
2045 |
-+ if (bfqg->async_bfqq[i][j] != NULL) |
2046 |
-+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]); |
2047 |
-+ if (bfqg->async_idle_bfqq != NULL) |
2048 |
-+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq); |
2049 |
-+} |
2050 |
-+ |
2051 |
-+static void bfq_end_raising(struct bfq_data *bfqd) |
2052 |
-+{ |
2053 |
-+ struct bfq_queue *bfqq; |
2054 |
-+ |
2055 |
-+ spin_lock_irq(bfqd->queue->queue_lock); |
2056 |
-+ |
2057 |
-+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) |
2058 |
-+ bfq_bfqq_end_raising(bfqq); |
2059 |
-+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) |
2060 |
-+ bfq_bfqq_end_raising(bfqq); |
2061 |
-+ bfq_end_raising_async(bfqd); |
2062 |
-+ |
2063 |
-+ spin_unlock_irq(bfqd->queue->queue_lock); |
2064 |
-+} |
2065 |
-+ |
2066 |
-+static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
2067 |
-+ struct bio *bio) |
2068 |
-+{ |
2069 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
2070 |
-+ struct bfq_io_cq *bic; |
2071 |
-+ struct bfq_queue *bfqq; |
2072 |
-+ |
2073 |
-+ /* |
2074 |
-+ * Disallow merge of a sync bio into an async request. |
2075 |
-+ */ |
2076 |
-+ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
2077 |
-+ return 0; |
2078 |
-+ |
2079 |
-+ /* |
2080 |
-+ * Lookup the bfqq that this bio will be queued with. Allow |
2081 |
-+ * merge only if rq is queued there. |
2082 |
-+ * Queue lock is held here. |
2083 |
-+ */ |
2084 |
-+ bic = bfq_bic_lookup(bfqd, current->io_context); |
2085 |
-+ if (bic == NULL) |
2086 |
-+ return 0; |
2087 |
-+ |
2088 |
-+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
2089 |
-+ return bfqq == RQ_BFQQ(rq); |
2090 |
-+} |
2091 |
-+ |
2092 |
-+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
2093 |
-+ struct bfq_queue *bfqq) |
2094 |
-+{ |
2095 |
-+ if (bfqq != NULL) { |
2096 |
-+ bfq_mark_bfqq_must_alloc(bfqq); |
2097 |
-+ bfq_mark_bfqq_budget_new(bfqq); |
2098 |
-+ bfq_clear_bfqq_fifo_expire(bfqq); |
2099 |
-+ |
2100 |
-+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
2101 |
-+ |
2102 |
-+ bfq_log_bfqq(bfqd, bfqq, |
2103 |
-+ "set_in_service_queue, cur-budget = %lu", |
2104 |
-+ bfqq->entity.budget); |
2105 |
-+ } |
2106 |
-+ |
2107 |
-+ bfqd->in_service_queue = bfqq; |
2108 |
-+} |
2109 |
-+ |
2110 |
-+/* |
2111 |
-+ * Get and set a new queue for service. |
2112 |
-+ */ |
2113 |
-+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd, |
2114 |
-+ struct bfq_queue *bfqq) |
2115 |
-+{ |
2116 |
-+ if (!bfqq) |
2117 |
-+ bfqq = bfq_get_next_queue(bfqd); |
2118 |
-+ else |
2119 |
-+ bfq_get_next_queue_forced(bfqd, bfqq); |
2120 |
-+ |
2121 |
-+ __bfq_set_in_service_queue(bfqd, bfqq); |
2122 |
-+ return bfqq; |
2123 |
-+} |
2124 |
-+ |
2125 |
-+static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, |
2126 |
-+ struct request *rq) |
2127 |
-+{ |
2128 |
-+ if (blk_rq_pos(rq) >= bfqd->last_position) |
2129 |
-+ return blk_rq_pos(rq) - bfqd->last_position; |
2130 |
-+ else |
2131 |
-+ return bfqd->last_position - blk_rq_pos(rq); |
2132 |
-+} |
2133 |
-+ |
2134 |
-+/* |
2135 |
-+ * Return true if bfqq has no request pending and rq is close enough to |
2136 |
-+ * bfqd->last_position, or if rq is closer to bfqd->last_position than |
2137 |
-+ * bfqq->next_rq |
2138 |
-+ */ |
2139 |
-+static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) |
2140 |
-+{ |
2141 |
-+ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; |
2142 |
-+} |
2143 |
-+ |
2144 |
-+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
2145 |
-+{ |
2146 |
-+ struct rb_root *root = &bfqd->rq_pos_tree; |
2147 |
-+ struct rb_node *parent, *node; |
2148 |
-+ struct bfq_queue *__bfqq; |
2149 |
-+ sector_t sector = bfqd->last_position; |
2150 |
-+ |
2151 |
-+ if (RB_EMPTY_ROOT(root)) |
2152 |
-+ return NULL; |
2153 |
-+ |
2154 |
-+ /* |
2155 |
-+ * First, if we find a request starting at the end of the last |
2156 |
-+ * request, choose it. |
2157 |
-+ */ |
2158 |
-+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); |
2159 |
-+ if (__bfqq != NULL) |
2160 |
-+ return __bfqq; |
2161 |
-+ |
2162 |
-+ /* |
2163 |
-+ * If the exact sector wasn't found, the parent of the NULL leaf |
2164 |
-+ * will contain the closest sector (rq_pos_tree sorted by next_request |
2165 |
-+ * position). |
2166 |
-+ */ |
2167 |
-+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
2168 |
-+ if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
2169 |
-+ return __bfqq; |
2170 |
-+ |
2171 |
-+ if (blk_rq_pos(__bfqq->next_rq) < sector) |
2172 |
-+ node = rb_next(&__bfqq->pos_node); |
2173 |
-+ else |
2174 |
-+ node = rb_prev(&__bfqq->pos_node); |
2175 |
-+ if (node == NULL) |
2176 |
-+ return NULL; |
2177 |
-+ |
2178 |
-+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); |
2179 |
-+ if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
2180 |
-+ return __bfqq; |
2181 |
-+ |
2182 |
-+ return NULL; |
2183 |
-+} |
2184 |
-+ |
2185 |
-+/* |
2186 |
-+ * bfqd - obvious |
2187 |
-+ * cur_bfqq - passed in so that we don't decide that the current queue |
2188 |
-+ * is closely cooperating with itself. |
2189 |
-+ * |
2190 |
-+ * We are assuming that cur_bfqq has dispatched at least one request, |
2191 |
-+ * and that bfqd->last_position reflects a position on the disk associated |
2192 |
-+ * with the I/O issued by cur_bfqq. |
2193 |
-+ */ |
2194 |
-+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
2195 |
-+ struct bfq_queue *cur_bfqq) |
2196 |
-+{ |
2197 |
-+ struct bfq_queue *bfqq; |
2198 |
-+ |
2199 |
-+ if (bfq_class_idle(cur_bfqq)) |
2200 |
-+ return NULL; |
2201 |
-+ if (!bfq_bfqq_sync(cur_bfqq)) |
2202 |
-+ return NULL; |
2203 |
-+ if (BFQQ_SEEKY(cur_bfqq)) |
2204 |
-+ return NULL; |
2205 |
-+ |
2206 |
-+ /* If device has only one backlogged bfq_queue, don't search. */ |
2207 |
-+ if (bfqd->busy_queues == 1) |
2208 |
-+ return NULL; |
2209 |
-+ |
2210 |
-+ /* |
2211 |
-+ * We should notice if some of the queues are cooperating, e.g. |
2212 |
-+ * working closely on the same area of the disk. In that case, |
2213 |
-+ * we can group them together and not waste time idling. |
2214 |
-+ */ |
2215 |
-+ bfqq = bfqq_close(bfqd); |
2216 |
-+ if (bfqq == NULL || bfqq == cur_bfqq) |
2217 |
-+ return NULL; |
2218 |
-+ |
2219 |
-+ /* |
2220 |
-+ * Do not merge queues from different bfq_groups. |
2221 |
-+ */ |
2222 |
-+ if (bfqq->entity.parent != cur_bfqq->entity.parent) |
2223 |
-+ return NULL; |
2224 |
-+ |
2225 |
-+ /* |
2226 |
-+ * It only makes sense to merge sync queues. |
2227 |
-+ */ |
2228 |
-+ if (!bfq_bfqq_sync(bfqq)) |
2229 |
-+ return NULL; |
2230 |
-+ if (BFQQ_SEEKY(bfqq)) |
2231 |
-+ return NULL; |
2232 |
-+ |
2233 |
-+ /* |
2234 |
-+ * Do not merge queues of different priority classes. |
2235 |
-+ */ |
2236 |
-+ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) |
2237 |
-+ return NULL; |
2238 |
-+ |
2239 |
-+ return bfqq; |
2240 |
-+} |
2241 |
-+ |
2242 |
-+/* |
2243 |
-+ * If enough samples have been computed, return the current max budget |
2244 |
-+ * stored in bfqd, which is dynamically updated according to the |
2245 |
-+ * estimated disk peak rate; otherwise return the default max budget |
2246 |
-+ */ |
2247 |
-+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) |
2248 |
-+{ |
2249 |
-+ if (bfqd->budgets_assigned < 194) |
2250 |
-+ return bfq_default_max_budget; |
2251 |
-+ else |
2252 |
-+ return bfqd->bfq_max_budget; |
2253 |
-+} |
2254 |
-+ |
2255 |
-+/* |
2256 |
-+ * Return min budget, which is a fraction of the current or default |
2257 |
-+ * max budget (trying with 1/32) |
2258 |
-+ */ |
2259 |
-+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) |
2260 |
-+{ |
2261 |
-+ if (bfqd->budgets_assigned < 194) |
2262 |
-+ return bfq_default_max_budget / 32; |
2263 |
-+ else |
2264 |
-+ return bfqd->bfq_max_budget / 32; |
2265 |
-+} |
2266 |
-+ |
2267 |
-+/* |
2268 |
-+ * Decides whether idling should be done for given device and |
2269 |
-+ * given in-service queue. |
2270 |
-+ */ |
2271 |
-+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, |
2272 |
-+ struct bfq_queue *in_service_bfqq) |
2273 |
-+{ |
2274 |
-+ if (in_service_bfqq == NULL) |
2275 |
-+ return false; |
2276 |
-+ /* |
2277 |
-+ * If device is SSD it has no seek penalty, disable idling; but |
2278 |
-+ * do so only if: |
2279 |
-+ * - device does not support queuing, otherwise we still have |
2280 |
-+ * a problem with sync vs async workloads; |
2281 |
-+ * - the queue is not weight-raised, to preserve guarantees. |
2282 |
-+ */ |
2283 |
-+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && |
2284 |
-+ in_service_bfqq->raising_coeff == 1); |
2285 |
-+} |
2286 |
-+ |
2287 |
-+static void bfq_arm_slice_timer(struct bfq_data *bfqd) |
2288 |
-+{ |
2289 |
-+ struct bfq_queue *bfqq = bfqd->in_service_queue; |
2290 |
-+ struct bfq_io_cq *bic; |
2291 |
-+ unsigned long sl; |
2292 |
-+ |
2293 |
-+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
2294 |
-+ |
2295 |
-+ /* Tasks have exited, don't wait. */ |
2296 |
-+ bic = bfqd->in_service_bic; |
2297 |
-+ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) |
2298 |
-+ return; |
2299 |
-+ |
2300 |
-+ bfq_mark_bfqq_wait_request(bfqq); |
2301 |
-+ |
2302 |
-+ /* |
2303 |
-+ * We don't want to idle for seeks, but we do want to allow |
2304 |
-+ * fair distribution of slice time for a process doing back-to-back |
2305 |
-+ * seeks. So allow a little bit of time for it to submit a new rq. |
2306 |
-+ * |
2307 |
-+ * To prevent processes with (partly) seeky workloads from |
2308 |
-+ * being too ill-treated, grant them a small fraction of the |
2309 |
-+ * assigned budget before reducing the waiting time to |
2310 |
-+ * BFQ_MIN_TT. This happened to help reduce latency. |
2311 |
-+ */ |
2312 |
-+ sl = bfqd->bfq_slice_idle; |
2313 |
-+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && |
2314 |
-+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 && |
2315 |
-+ bfqq->raising_coeff == 1) |
2316 |
-+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); |
2317 |
-+ else if (bfqq->raising_coeff > 1) |
2318 |
-+ sl = sl * 3; |
2319 |
-+ bfqd->last_idling_start = ktime_get(); |
2320 |
-+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); |
2321 |
-+ bfq_log(bfqd, "arm idle: %u/%u ms", |
2322 |
-+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); |
2323 |
-+} |
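A rough user-space sketch of how the idle window above is shortened or stretched; HZ = 1000 and the sample service figures are assumptions, not values from the patch.

#include <stdio.h>

#define HZ         1000 /* assumed */
#define BFQ_MIN_TT 2    /* ms, as defined earlier in this file */

static unsigned long idle_slice(unsigned long slice_idle, int seeky,
				unsigned long service, unsigned long max_budget,
				unsigned long raising_coeff)
{
	unsigned long sl = slice_idle;
	unsigned long min_tt = BFQ_MIN_TT * HZ / 1000; /* msecs_to_jiffies(2) */

	if (seeky && service > max_budget / 8 && raising_coeff == 1)
		sl = sl < min_tt ? sl : min_tt;   /* seeky past 1/8 budget: shrink */
	else if (raising_coeff > 1)
		sl = sl * 3;                      /* weight-raised: idle longer */
	return sl;
}

int main(void)
{
	unsigned long slice_idle = HZ / 125;      /* 8 jiffies at HZ = 1000 */

	printf("seeky, past 1/8 budget: %lu jiffies\n",
	       idle_slice(slice_idle, 1, 3000, 16 * 1024, 1));
	printf("weight-raised:          %lu jiffies\n",
	       idle_slice(slice_idle, 0, 0, 16 * 1024, 3));
	return 0;
}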
2324 |
-+ |
2325 |
-+/* |
2326 |
-+ * Set the maximum time for the in-service queue to consume its |
2327 |
-+ * budget. This prevents seeky processes from lowering the disk |
2328 |
-+ * throughput (always guaranteed with a time slice scheme as in CFQ). |
2329 |
-+ */ |
2330 |
-+static void bfq_set_budget_timeout(struct bfq_data *bfqd) |
2331 |
-+{ |
2332 |
-+ struct bfq_queue *bfqq = bfqd->in_service_queue; |
2333 |
-+ unsigned int timeout_coeff; |
2334 |
-+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) |
2335 |
-+ timeout_coeff = 1; |
2336 |
-+ else |
2337 |
-+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; |
2338 |
-+ |
2339 |
-+ bfqd->last_budget_start = ktime_get(); |
2340 |
-+ |
2341 |
-+ bfq_clear_bfqq_budget_new(bfqq); |
2342 |
-+ bfqq->budget_timeout = jiffies + |
2343 |
-+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; |
2344 |
-+ |
2345 |
-+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", |
2346 |
-+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * |
2347 |
-+ timeout_coeff)); |
2348 |
-+} |
2349 |
-+ |
2350 |
-+/* |
2351 |
-+ * Move request from internal lists to the request queue dispatch list. |
2352 |
-+ */ |
2353 |
-+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) |
2354 |
-+{ |
2355 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
2356 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
2357 |
-+ |
2358 |
-+ bfq_remove_request(rq); |
2359 |
-+ bfqq->dispatched++; |
2360 |
-+ elv_dispatch_sort(q, rq); |
2361 |
-+ |
2362 |
-+ if (bfq_bfqq_sync(bfqq)) |
2363 |
-+ bfqd->sync_flight++; |
2364 |
-+} |
2365 |
-+ |
2366 |
-+/* |
2367 |
-+ * Return expired entry, or NULL to just start from scratch in rbtree. |
2368 |
-+ */ |
2369 |
-+static struct request *bfq_check_fifo(struct bfq_queue *bfqq) |
2370 |
-+{ |
2371 |
-+ struct request *rq = NULL; |
2372 |
-+ |
2373 |
-+ if (bfq_bfqq_fifo_expire(bfqq)) |
2374 |
-+ return NULL; |
2375 |
-+ |
2376 |
-+ bfq_mark_bfqq_fifo_expire(bfqq); |
2377 |
-+ |
2378 |
-+ if (list_empty(&bfqq->fifo)) |
2379 |
-+ return NULL; |
2380 |
-+ |
2381 |
-+ rq = rq_entry_fifo(bfqq->fifo.next); |
2382 |
-+ |
2383 |
-+ if (time_before(jiffies, rq_fifo_time(rq))) |
2384 |
-+ return NULL; |
2385 |
-+ |
2386 |
-+ return rq; |
2387 |
-+} |
2388 |
-+ |
2389 |
-+/* |
2390 |
-+ * Must be called with the queue_lock held. |
2391 |
-+ */ |
2392 |
-+static int bfqq_process_refs(struct bfq_queue *bfqq) |
2393 |
-+{ |
2394 |
-+ int process_refs, io_refs; |
2395 |
-+ |
2396 |
-+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
2397 |
-+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
2398 |
-+ BUG_ON(process_refs < 0); |
2399 |
-+ return process_refs; |
2400 |
-+} |
2401 |
-+ |
2402 |
-+static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
2403 |
-+{ |
2404 |
-+ int process_refs, new_process_refs; |
2405 |
-+ struct bfq_queue *__bfqq; |
2406 |
-+ |
2407 |
-+ /* |
2408 |
-+ * If there are no process references on the new_bfqq, then it is |
2409 |
-+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
2410 |
-+ * may have dropped their last reference (not just their last process |
2411 |
-+ * reference). |
2412 |
-+ */ |
2413 |
-+ if (!bfqq_process_refs(new_bfqq)) |
2414 |
-+ return; |
2415 |
-+ |
2416 |
-+ /* Avoid a circular list and skip interim queue merges. */ |
2417 |
-+ while ((__bfqq = new_bfqq->new_bfqq)) { |
2418 |
-+ if (__bfqq == bfqq) |
2419 |
-+ return; |
2420 |
-+ new_bfqq = __bfqq; |
2421 |
-+ } |
2422 |
-+ |
2423 |
-+ process_refs = bfqq_process_refs(bfqq); |
2424 |
-+ new_process_refs = bfqq_process_refs(new_bfqq); |
2425 |
-+ /* |
2426 |
-+ * If the process for the bfqq has gone away, there is no |
2427 |
-+ * sense in merging the queues. |
2428 |
-+ */ |
2429 |
-+ if (process_refs == 0 || new_process_refs == 0) |
2430 |
-+ return; |
2431 |
-+ |
2432 |
-+ /* |
2433 |
-+ * Merge in the direction of the lesser amount of work. |
2434 |
-+ */ |
2435 |
-+ if (new_process_refs >= process_refs) { |
2436 |
-+ bfqq->new_bfqq = new_bfqq; |
2437 |
-+ atomic_add(process_refs, &new_bfqq->ref); |
2438 |
-+ } else { |
2439 |
-+ new_bfqq->new_bfqq = bfqq; |
2440 |
-+ atomic_add(new_process_refs, &bfqq->ref); |
2441 |
-+ } |
2442 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
2443 |
-+ new_bfqq->pid); |
2444 |
-+} |
2445 |
-+ |
2446 |
-+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
2447 |
-+{ |
2448 |
-+ struct bfq_entity *entity = &bfqq->entity; |
2449 |
-+ return entity->budget - entity->service; |
2450 |
-+} |
2451 |
-+ |
2452 |
-+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
2453 |
-+{ |
2454 |
-+ BUG_ON(bfqq != bfqd->in_service_queue); |
2455 |
-+ |
2456 |
-+ __bfq_bfqd_reset_in_service(bfqd); |
2457 |
-+ |
2458 |
-+ /* |
2459 |
-+ * If this bfqq is shared between multiple processes, check |
2460 |
-+ * to make sure that those processes are still issuing I/Os |
2461 |
-+ * within the mean seek distance. If not, it may be time to |
2462 |
-+ * break the queues apart again. |
2463 |
-+ */ |
2464 |
-+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) |
2465 |
-+ bfq_mark_bfqq_split_coop(bfqq); |
2466 |
-+ |
2467 |
-+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
2468 |
-+ /* |
2469 |
-+ * overloading budget_timeout field to store when |
2470 |
-+ * the queue remains with no backlog, used by |
2471 |
-+ * the weight-raising mechanism |
2472 |
-+ */ |
2473 |
-+ bfqq->budget_timeout = jiffies; |
2474 |
-+ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
2475 |
-+ } else { |
2476 |
-+ bfq_activate_bfqq(bfqd, bfqq); |
2477 |
-+ /* |
2478 |
-+ * Resort priority tree of potential close cooperators. |
2479 |
-+ */ |
2480 |
-+ bfq_rq_pos_tree_add(bfqd, bfqq); |
2481 |
-+ } |
2482 |
-+} |
2483 |
-+ |
2484 |
-+/** |
2485 |
-+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. |
2486 |
-+ * @bfqd: device data. |
2487 |
-+ * @bfqq: queue to update. |
2488 |
-+ * @reason: reason for expiration. |
2489 |
-+ * |
2490 |
-+ * Handle the feedback on @bfqq budget. See the body for detailed |
2491 |
-+ * comments. |
2492 |
-+ */ |
2493 |
-+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
2494 |
-+ struct bfq_queue *bfqq, |
2495 |
-+ enum bfqq_expiration reason) |
2496 |
-+{ |
2497 |
-+ struct request *next_rq; |
2498 |
-+ unsigned long budget, min_budget; |
2499 |
-+ |
2500 |
-+ budget = bfqq->max_budget; |
2501 |
-+ min_budget = bfq_min_budget(bfqd); |
2502 |
-+ |
2503 |
-+ BUG_ON(bfqq != bfqd->in_service_queue); |
2504 |
-+ |
2505 |
-+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", |
2506 |
-+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); |
2507 |
-+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", |
2508 |
-+ budget, bfq_min_budget(bfqd)); |
2509 |
-+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", |
2510 |
-+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); |
2511 |
-+ |
2512 |
-+ if (bfq_bfqq_sync(bfqq)) { |
2513 |
-+ switch (reason) { |
2514 |
-+ /* |
2515 |
-+ * Caveat: in all the following cases we trade latency |
2516 |
-+ * for throughput. |
2517 |
-+ */ |
2518 |
-+ case BFQ_BFQQ_TOO_IDLE: |
2519 |
-+ /* |
2520 |
-+ * This is the only case where we may reduce |
2521 |
-+ * the budget: if there are no requests of the |
2522 |
-+ * process still waiting for completion, then |
2523 |
-+ * we assume (tentatively) that the timer has |
2524 |
-+ * expired because the batch of requests of |
2525 |
-+ * the process could have been served with a |
2526 |
-+ * smaller budget. Hence, betting that |
2527 |
-+ * the process will behave in the same way when it |
2528 |
-+ * becomes backlogged again, we reduce its |
2529 |
-+ * next budget. As long as we guess right, |
2530 |
-+ * this budget cut reduces the latency |
2531 |
-+ * experienced by the process. |
2532 |
-+ * |
2533 |
-+ * However, if there are still outstanding |
2534 |
-+ * requests, then the process may have not yet |
2535 |
-+ * issued its next request just because it is |
2536 |
-+ * still waiting for the completion of some of |
2537 |
-+ * the still outstanding ones. So in this |
2538 |
-+ * subcase we do not reduce its budget, on the |
2539 |
-+ * contrary we increase it to possibly boost |
2540 |
-+ * the throughput, as discussed in the |
2541 |
-+ * comments to the BUDGET_TIMEOUT case. |
2542 |
-+ */ |
2543 |
-+ if (bfqq->dispatched > 0) /* still outstanding reqs */ |
2544 |
-+ budget = min(budget * 2, bfqd->bfq_max_budget); |
2545 |
-+ else { |
2546 |
-+ if (budget > 5 * min_budget) |
2547 |
-+ budget -= 4 * min_budget; |
2548 |
-+ else |
2549 |
-+ budget = min_budget; |
2550 |
-+ } |
2551 |
-+ break; |
2552 |
-+ case BFQ_BFQQ_BUDGET_TIMEOUT: |
2553 |
-+ /* |
2554 |
-+ * We double the budget here because: 1) it |
2555 |
-+ * gives the chance to boost the throughput if |
2556 |
-+ * this is not a seeky process (which may have |
2557 |
-+ * bumped into this timeout because of, e.g., |
2558 |
-+ * ZBR), 2) together with charge_full_budget |
2559 |
-+ * it helps give seeky processes higher |
2560 |
-+ * timestamps, and hence be served less |
2561 |
-+ * frequently. |
2562 |
-+ */ |
2563 |
-+ budget = min(budget * 2, bfqd->bfq_max_budget); |
2564 |
-+ break; |
2565 |
-+ case BFQ_BFQQ_BUDGET_EXHAUSTED: |
2566 |
-+ /* |
2567 |
-+ * The process still has backlog, and did not |
2568 |
-+ * let either the budget timeout or the disk |
2569 |
-+ * idling timeout expire. Hence it is not |
2570 |
-+ * seeky, has a short thinktime and may be |
2571 |
-+ * happy with a higher budget too. So |
2572 |
-+ * definitely increase the budget of this good |
2573 |
-+ * candidate to boost the disk throughput. |
2574 |
-+ */ |
2575 |
-+ budget = min(budget * 4, bfqd->bfq_max_budget); |
2576 |
-+ break; |
2577 |
-+ case BFQ_BFQQ_NO_MORE_REQUESTS: |
2578 |
-+ /* |
2579 |
-+ * Leave the budget unchanged. |
2580 |
-+ */ |
2581 |
-+ default: |
2582 |
-+ return; |
2583 |
-+ } |
2584 |
-+ } else /* async queue */ |
2585 |
-+ /* async queues always get the maximum possible budget |
2586 |
-+ * (their ability to dispatch is limited by |
2587 |
-+ * @bfqd->bfq_max_budget_async_rq). |
2588 |
-+ */ |
2589 |
-+ budget = bfqd->bfq_max_budget; |
2590 |
-+ |
2591 |
-+ bfqq->max_budget = budget; |
2592 |
-+ |
2593 |
-+ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && |
2594 |
-+ bfqq->max_budget > bfqd->bfq_max_budget) |
2595 |
-+ bfqq->max_budget = bfqd->bfq_max_budget; |
2596 |
-+ |
2597 |
-+ /* |
2598 |
-+ * Make sure that we have enough budget for the next request. |
2599 |
-+ * Since the finish time of the bfqq must be kept in sync with |
2600 |
-+ * the budget, be sure to call __bfq_bfqq_expire() after the |
2601 |
-+ * update. |
2602 |
-+ */ |
2603 |
-+ next_rq = bfqq->next_rq; |
2604 |
-+ if (next_rq != NULL) |
2605 |
-+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, |
2606 |
-+ bfq_serv_to_charge(next_rq, bfqq)); |
2607 |
-+ else |
2608 |
-+ bfqq->entity.budget = bfqq->max_budget; |
2609 |
-+ |
2610 |
-+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", |
2611 |
-+ next_rq != NULL ? blk_rq_sectors(next_rq) : 0, |
2612 |
-+ bfqq->entity.budget); |
2613 |
-+} |
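The budget feedback above can be condensed into a few lines of user-space C; only the sync-queue cases are sketched, and the default 16384-sector max budget (min budget = max/32) is assumed purely for illustration.

#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED };

/* Feedback for sync queues: shrink only when the queue went idle with no
 * request in flight, otherwise grow (x2 on timeout, x4 on exhaustion),
 * always capped at the device-wide max budget. */
static unsigned long next_budget(unsigned long budget, unsigned long max_budget,
				 unsigned long min_budget, enum reason r,
				 int reqs_in_flight)
{
	switch (r) {
	case TOO_IDLE:
		if (reqs_in_flight)
			return budget * 2 < max_budget ? budget * 2 : max_budget;
		return budget > 5 * min_budget ? budget - 4 * min_budget : min_budget;
	case BUDGET_TIMEOUT:
		return budget * 2 < max_budget ? budget * 2 : max_budget;
	case BUDGET_EXHAUSTED:
		return budget * 4 < max_budget ? budget * 4 : max_budget;
	}
	return budget;
}

int main(void)
{
	unsigned long max = 16 * 1024, min = max / 32;   /* 16384 and 512 sectors */

	printf("too idle, nothing in flight: 4096 -> %lu\n",
	       next_budget(4096, max, min, TOO_IDLE, 0));
	printf("budget exhausted:            4096 -> %lu\n",
	       next_budget(4096, max, min, BUDGET_EXHAUSTED, 0));
	return 0;
}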
2614 |
-+ |
2615 |
-+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) |
2616 |
-+{ |
2617 |
-+ unsigned long max_budget; |
2618 |
-+ |
2619 |
-+ /* |
2620 |
-+ * The max_budget calculated when autotuning is equal to the |
2621 |
-+ * number of sectors transferred in timeout_sync at the |
2622 |
-+ * estimated peak rate. |
2623 |
-+ */ |
2624 |
-+ max_budget = (unsigned long)(peak_rate * 1000 * |
2625 |
-+ timeout >> BFQ_RATE_SHIFT); |
2626 |
-+ |
2627 |
-+ return max_budget; |
2628 |
-+} |
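A user-space sketch of the autotuning rule above; the 125 ms value corresponds to the default sync timeout (HZ/8 jiffies), and the sample peak rate is only an assumption.

#include <stdio.h>
#include <stdint.h>

#define BFQ_RATE_SHIFT 16

/* max_budget = sectors transferable in timeout_sync at the estimated peak
 * rate, where the rate is kept in (sectors/usec) << BFQ_RATE_SHIFT. */
static unsigned long calc_max_budget(uint64_t peak_rate, uint64_t timeout_ms)
{
	return (unsigned long)(peak_rate * 1000 * timeout_ms >> BFQ_RATE_SHIFT);
}

int main(void)
{
	/* 34791 is the non-rotational reference rate used earlier in this
	 * file; 125 ms is the default sync timeout */
	printf("max budget: %lu sectors\n", calc_max_budget(34791, 125));
	return 0;
}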
2629 |
-+ |
2630 |
-+/* |
2631 |
-+ * In addition to updating the peak rate, checks whether the process |
2632 |
-+ * is "slow", and returns 1 if so. This slow flag is used, in addition |
2633 |
-+ * to the budget timeout, to reduce the amount of service provided to |
2634 |
-+ * seeky processes, and hence reduce their chances to lower the |
2635 |
-+ * throughput. See the code for more details. |
2636 |
-+ */ |
2637 |
-+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
2638 |
-+ int compensate, enum bfqq_expiration reason) |
2639 |
-+{ |
2640 |
-+ u64 bw, usecs, expected, timeout; |
2641 |
-+ ktime_t delta; |
2642 |
-+ int update = 0; |
2643 |
-+ |
2644 |
-+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) |
2645 |
-+ return 0; |
2646 |
-+ |
2647 |
-+ if (compensate) |
2648 |
-+ delta = bfqd->last_idling_start; |
2649 |
-+ else |
2650 |
-+ delta = ktime_get(); |
2651 |
-+ delta = ktime_sub(delta, bfqd->last_budget_start); |
2652 |
-+ usecs = ktime_to_us(delta); |
2653 |
-+ |
2654 |
-+ /* Don't trust short/unrealistic values. */ |
2655 |
-+ if (usecs < 100 || usecs >= LONG_MAX) |
2656 |
-+ return 0; |
2657 |
-+ |
2658 |
-+ /* |
2659 |
-+ * Calculate the bandwidth for the last slice. We use a 64 bit |
2660 |
-+ * value to store the peak rate, in sectors per usec in fixed |
2661 |
-+ * point math. We do so to have enough precision in the estimate |
2662 |
-+ * and to avoid overflows. |
2663 |
-+ */ |
2664 |
-+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; |
2665 |
-+ do_div(bw, (unsigned long)usecs); |
2666 |
-+ |
2667 |
-+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
2668 |
-+ |
2669 |
-+ /* |
2670 |
-+ * Use only long (> 20ms) intervals to filter out spikes for |
2671 |
-+ * the peak rate estimation. |
2672 |
-+ */ |
2673 |
-+ if (usecs > 20000) { |
2674 |
-+ if (bw > bfqd->peak_rate || |
2675 |
-+ (!BFQQ_SEEKY(bfqq) && |
2676 |
-+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { |
2677 |
-+ bfq_log(bfqd, "measured bw =%llu", bw); |
2678 |
-+ /* |
2679 |
-+ * To smooth oscillations use a low-pass filter with |
2680 |
-+ * alpha=7/8, i.e., |
2681 |
-+ * new_rate = (7/8) * old_rate + (1/8) * bw |
2682 |
-+ */ |
2683 |
-+ do_div(bw, 8); |
2684 |
-+ if (bw == 0) |
2685 |
-+ return 0; |
2686 |
-+ bfqd->peak_rate *= 7; |
2687 |
-+ do_div(bfqd->peak_rate, 8); |
2688 |
-+ bfqd->peak_rate += bw; |
2689 |
-+ update = 1; |
2690 |
-+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); |
2691 |
-+ } |
2692 |
-+ |
2693 |
-+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; |
2694 |
-+ |
2695 |
-+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) |
2696 |
-+ bfqd->peak_rate_samples++; |
2697 |
-+ |
2698 |
-+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && |
2699 |
-+ update && bfqd->bfq_user_max_budget == 0) { |
2700 |
-+ bfqd->bfq_max_budget = |
2701 |
-+ bfq_calc_max_budget(bfqd->peak_rate, timeout); |
2702 |
-+ bfq_log(bfqd, "new max_budget=%lu", |
2703 |
-+ bfqd->bfq_max_budget); |
2704 |
-+ } |
2705 |
-+ } |
2706 |
-+ |
2707 |
-+ /* |
2708 |
-+ * If the process has been served for a too short time |
2709 |
-+ * interval to let its possible sequential accesses prevail on |
2710 |
-+ * the initial seek time needed to move the disk head on the |
2711 |
-+ * first sector it requested, then give the process a chance |
2712 |
-+ * and for the moment return false. |
2713 |
-+ */ |
2714 |
-+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) |
2715 |
-+ return 0; |
2716 |
-+ |
2717 |
-+ /* |
2718 |
-+ * A process is considered ``slow'' (i.e., seeky, so that we |
2719 |
-+ * cannot treat it fairly in the service domain, as it would |
2720 |
-+ * slow down the other processes too much) if, when a slice |
2721 |
-+ * ends for whatever reason, it has received service at a |
2722 |
-+ * rate that would not be high enough to complete the budget |
2723 |
-+ * before the budget timeout expiration. |
2724 |
-+ */ |
2725 |
-+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; |
2726 |
-+ |
2727 |
-+ /* |
2728 |
-+ * Caveat: processes doing IO in the slower disk zones will |
2729 |
-+ * tend to be slow(er) even if not seeky. And the estimated |
2730 |
-+ * peak rate will actually be an average over the disk |
2731 |
-+ * surface. Hence, to not be too harsh with unlucky processes, |
2732 |
-+ * we keep a budget/3 margin of safety before declaring a |
2733 |
-+ * process slow. |
2734 |
-+ */ |
2735 |
-+ return expected > (4 * bfqq->entity.budget) / 3; |
2736 |
-+} |
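A user-space sketch of the two computations in bfq_update_peak_rate() above: the fixed-point bandwidth of the last slice and the 7/8-1/8 low-pass filter used to smooth the estimate. All the sample figures are assumptions for illustration.

#include <stdio.h>
#include <stdint.h>

#define BFQ_RATE_SHIFT 16

int main(void)
{
	uint64_t service = 2048, usecs = 20000;  /* sectors served, slice length */
	uint64_t peak_rate = 5000;               /* previous fixed-point estimate */

	/* bandwidth of the last slice, fixed point: (service << 16) / usecs */
	uint64_t bw = (service << BFQ_RATE_SHIFT) / usecs;

	/* low-pass filter: new_rate = (7/8) * old_rate + (1/8) * bw */
	uint64_t new_rate = peak_rate * 7 / 8 + bw / 8;

	printf("bw = %llu, peak_rate %llu -> %llu\n",
	       (unsigned long long)bw,
	       (unsigned long long)peak_rate,
	       (unsigned long long)new_rate);
	return 0;
}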
2737 |
-+ |
2738 |
-+/* |
2739 |
-+ * To be deemed as soft real-time, an application must meet two requirements. |
2740 |
-+ * The first is that the application must not require an average bandwidth |
2741 |
-+ * higher than the approximate bandwidth required to play back or record a |
2742 |
-+ * compressed high-definition video. |
2743 |
-+ * The next function is invoked on the completion of the last request of a |
2744 |
-+ * batch, to compute the next-start time instant, soft_rt_next_start, such |
2745 |
-+ * that, if the next request of the application does not arrive before |
2746 |
-+ * soft_rt_next_start, then the above requirement on the bandwidth is met. |
2747 |
-+ * |
2748 |
-+ * The second requirement is that the request pattern of the application is |
2749 |
-+ * isochronous, i.e., that, after issuing a request or a batch of requests, the |
2750 |
-+ * application stops for a while, then issues a new batch, and so on. For this |
2751 |
-+ * reason the next function is invoked to compute soft_rt_next_start only for |
2752 |
-+ * applications that meet this requirement, whereas soft_rt_next_start is set |
2753 |
-+ * to infinity for applications that do not. |
2754 |
-+ * |
2755 |
-+ * Unfortunately, even a greedy application may happen to behave in an |
2756 |
-+ * isochronous way if several processes are competing for the CPUs. In fact, |
2757 |
-+ * in this scenario the application stops issuing requests while the CPUs are |
2758 |
-+ * busy serving other processes, then restarts, then stops again for a while, |
2759 |
-+ * and so on. In addition, if the disk achieves a low enough throughput with |
2760 |
-+ * the request pattern issued by the application, then the above bandwidth |
2761 |
-+ * requirement may happen to be met too. To prevent such a greedy application |
2762 |
-+ * from being deemed soft real-time, a further rule is used in the computation |
2763 |
-+ * of soft_rt_next_start: soft_rt_next_start must be higher than the current |
2764 |
-+ * time plus the maximum time for which the arrival of a request is waited |
2765 |
-+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This |
2766 |
-+ * filters out greedy applications, as the latter issue instead their next |
2767 |
-+ * request as soon as possible after the last one has been completed (in |
2768 |
-+ * contrast, when a batch of requests is completed, a soft real-time |
2769 |
-+ * application spends some time processing data). |
2770 |
-+ * |
2771 |
-+ * Actually, the last filter may easily generate false positives if: only |
2772 |
-+ * bfqd->bfq_slice_idle is used as a reference time interval, and one or |
2773 |
-+ * both the following two cases occur: |
2774 |
-+ * 1) HZ is so low that the duration of a jiffie is comparable to or higher |
2775 |
-+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with |
2776 |
-+ * HZ=100. |
2777 |
-+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing |
2778 |
-+ * for a while, then suddenly 'jump' by several units to recover the lost |
2779 |
-+ * increments. This seems to happen, e.g., inside virtual machines. |
2780 |
-+ * To address this issue, we do not use as a reference time interval just |
2781 |
-+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In |
2782 |
-+ * particular we add the minimum number of jiffies for which the filter seems |
2783 |
-+ * to be quite precise also in embedded systems and KVM/QEMU virtual machines. |
2784 |
-+ */ |
2785 |
-+static inline u64 bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, |
2786 |
-+ struct bfq_queue *bfqq) |
2787 |
-+{ |
2788 |
-+ return max(bfqq->last_idle_bklogged + |
2789 |
-+ HZ * bfqq->service_from_backlogged / |
2790 |
-+ bfqd->bfq_raising_max_softrt_rate, |
2791 |
-+ (u64)jiffies + bfqq->bfqd->bfq_slice_idle + 4); |
2792 |
-+} |
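A user-space sketch of the next-start rule implemented just above; HZ = 1000 and the sample figures are assumptions chosen so that the bandwidth bound dominates.

#include <stdio.h>
#include <stdint.h>

#define HZ 1000 /* assumed */

/* next start = max(bandwidth bound, now + slice_idle + 4), as in
 * bfq_bfqq_softrt_next_start() above */
static uint64_t softrt_next_start(uint64_t last_idle_bklogged,
				  uint64_t service_from_backlogged,
				  uint64_t max_softrt_rate,  /* sectors/sec */
				  uint64_t now, uint64_t slice_idle)
{
	uint64_t bw_bound = last_idle_bklogged +
			    HZ * service_from_backlogged / max_softrt_rate;
	uint64_t idle_bound = now + slice_idle + 4;

	return bw_bound > idle_bound ? bw_bound : idle_bound;
}

int main(void)
{
	/* 7000 sectors served since jiffy 10000 at an allowed 7000 sectors/s:
	 * the bandwidth bound lands exactly one second (HZ jiffies) later */
	printf("next start: %llu\n", (unsigned long long)
	       softrt_next_start(10000, 7000, 7000, 10500, 8));
	return 0;
}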
2793 |
-+ |
2794 |
-+/** |
2795 |
-+ * bfq_bfqq_expire - expire a queue. |
2796 |
-+ * @bfqd: device owning the queue. |
2797 |
-+ * @bfqq: the queue to expire. |
2798 |
-+ * @compensate: if true, compensate for the time spent idling. |
2799 |
-+ * @reason: the reason causing the expiration. |
2800 |
-+ * |
2801 |
-+ * |
2802 |
-+ * If the process associated to the queue is slow (i.e., seeky), or in |
2803 |
-+ * case of budget timeout, or, finally, if it is async, we |
2804 |
-+ * artificially charge it an entire budget (independently of the |
2805 |
-+ * actual service it received). As a consequence, the queue will get |
2806 |
-+ * higher timestamps than the correct ones upon reactivation, and |
2807 |
-+ * hence it will be rescheduled as if it had received more service |
2808 |
-+ * than what it actually received. In the end, this class of processes |
2809 |
-+ * will receive less service in proportion to how slowly they consume |
2810 |
-+ * their budgets (and hence how seriously they tend to lower the |
2811 |
-+ * throughput). |
2812 |
-+ * |
2813 |
-+ * In contrast, when a queue expires because it has been idling for |
2814 |
-+ * too much or because it exhausted its budget, we do not touch the |
2815 |
-+ * amount of service it has received. Hence when the queue will be |
2816 |
-+ * reactivated and its timestamps updated, the latter will be in sync |
2817 |
-+ * with the actual service received by the queue until expiration. |
2818 |
-+ * |
2819 |
-+ * Charging a full budget to the first type of queues and the exact |
2820 |
-+ * service to the others has the effect of using the WF2Q+ policy to |
2821 |
-+ * schedule the former on a timeslice basis, without violating the |
2822 |
-+ * service domain guarantees of the latter. |
2823 |
-+ */ |
2824 |
-+static void bfq_bfqq_expire(struct bfq_data *bfqd, |
2825 |
-+ struct bfq_queue *bfqq, |
2826 |
-+ int compensate, |
2827 |
-+ enum bfqq_expiration reason) |
2828 |
-+{ |
2829 |
-+ int slow; |
2830 |
-+ BUG_ON(bfqq != bfqd->in_service_queue); |
2831 |
-+ |
2832 |
-+ /* Update disk peak rate for autotuning and check whether the |
2833 |
-+ * process is slow (see bfq_update_peak_rate). |
2834 |
-+ */ |
2835 |
-+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); |
2836 |
-+ |
2837 |
-+ /* |
2838 |
-+ * As above explained, 'punish' slow (i.e., seeky), timed-out |
2839 |
-+ * and async queues, to favor sequential sync workloads. |
2840 |
-+ * |
2841 |
-+ * Processes doing IO in the slower disk zones will tend to be |
2842 |
-+ * slow(er) even if not seeky. Hence, since the estimated peak |
2843 |
-+ * rate is actually an average over the disk surface, these |
2844 |
-+ * processes may timeout just for bad luck. To avoid punishing |
2845 |
-+ * them we do not charge a full budget to a process that |
2846 |
-+ * succeeded in consuming at least 2/3 of its budget. |
2847 |
-+ */ |
2848 |
-+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && |
2849 |
-+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) |
2850 |
-+ bfq_bfqq_charge_full_budget(bfqq); |
2851 |
-+ |
2852 |
-+ bfqq->service_from_backlogged += bfqq->entity.service; |
2853 |
-+ |
2854 |
-+ if (bfqd->low_latency && bfqq->raising_coeff == 1) |
2855 |
-+ bfqq->last_rais_start_finish = jiffies; |
2856 |
-+ |
2857 |
-+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { |
2858 |
-+ if (reason != BFQ_BFQQ_BUDGET_TIMEOUT && |
2859 |
-+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { |
2860 |
-+ /* |
2861 |
-+ * If we get here, then the request pattern is |
2862 |
-+ * isochronous (see the comments to the function |
2863 |
-+ * bfq_bfqq_softrt_next_start()). However, if the |
2864 |
-+ * queue still has in-flight requests, then it is |
2865 |
-+ * better to postpone the computation of next_start |
2866 |
-+ * to the next request completion. In fact, if we |
2867 |
-+ * computed it now, then the application might pass |
2868 |
-+ * the greedy-application filter improperly, because |
2869 |
-+ * the arrival of its next request may happen to be |
2870 |
-+ * higher than (jiffies + bfqq->bfqd->bfq_slice_idle) |
2871 |
-+ * not because the application is truly soft real- |
2872 |
-+ * time, but just because the application is currently |
2873 |
-+ * waiting for the completion of some request before |
2874 |
-+ * issuing, as quickly as possible, its next request. |
2875 |
-+ */ |
2876 |
-+ if (bfqq->dispatched > 0) { |
2877 |
-+ bfqq->soft_rt_next_start = -1; |
2878 |
-+ bfq_mark_bfqq_softrt_update(bfqq); |
2879 |
-+ } else |
2880 |
-+ bfqq->soft_rt_next_start = |
2881 |
-+ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
2882 |
-+ } else |
2883 |
-+ bfqq->soft_rt_next_start = -1; /* infinity */ |
2884 |
-+ } |
2885 |
-+ |
2886 |
-+ bfq_log_bfqq(bfqd, bfqq, |
2887 |
-+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, |
2888 |
-+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); |
2889 |
-+ |
2890 |
-+ /* Increase, decrease or leave budget unchanged according to reason */ |
2891 |
-+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); |
2892 |
-+ __bfq_bfqq_expire(bfqd, bfqq); |
2893 |
-+} |
2894 |
-+ |
2895 |
-+/* |
2896 |
-+ * Budget timeout is not implemented through a dedicated timer, but |
2897 |
-+ * just checked on request arrivals and completions, as well as on |
2898 |
-+ * idle timer expirations. |
2899 |
-+ */ |
2900 |
-+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) |
2901 |
-+{ |
2902 |
-+ if (bfq_bfqq_budget_new(bfqq)) |
2903 |
-+ return 0; |
2904 |
-+ |
2905 |
-+ if (time_before(jiffies, bfqq->budget_timeout)) |
2906 |
-+ return 0; |
2907 |
-+ |
2908 |
-+ return 1; |
2909 |
-+} |
2910 |
-+ |
2911 |
-+/* |
2912 |
-+ * If we expire a queue that is waiting for the arrival of a new |
2913 |
-+ * request, we may prevent the fictitious timestamp backshifting that |
2914 |
-+ * allows the guarantees of the queue to be preserved (see [1] for |
2915 |
-+ * this tricky aspect). Hence we return true only if this condition |
2916 |
-+ * does not hold, or if the queue is slow enough to deserve only to be |
2917 |
-+ * kicked off for preserving a high throughput. |
2918 |
-+*/ |
2919 |
-+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) |
2920 |
-+{ |
2921 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, |
2922 |
-+ "may_budget_timeout: wr %d left %d timeout %d", |
2923 |
-+ bfq_bfqq_wait_request(bfqq), |
2924 |
-+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, |
2925 |
-+ bfq_bfqq_budget_timeout(bfqq)); |
2926 |
-+ |
2927 |
-+ return (!bfq_bfqq_wait_request(bfqq) || |
2928 |
-+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) |
2929 |
-+ && |
2930 |
-+ bfq_bfqq_budget_timeout(bfqq); |
2931 |
-+} |
2932 |
-+ |
2933 |
-+/* |
2934 |
-+ * For weight-raised queues issuing sync requests, idling is always performed, |
2935 |
-+ * as this is instrumental in guaranteeing a high fraction of the throughput |
2936 |
-+ * to these queues, and hence in guaranteeing a lower latency for their |
2937 |
-+ * requests. See [1] for details. |
2938 |
-+ * |
2939 |
-+ * For non-weight-raised queues, idling is instead disabled if the device is |
2940 |
-+ * NCQ-enabled and non-rotational, as this boosts the throughput on such |
2941 |
-+ * devices. |
2942 |
-+ */ |
2943 |
-+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) |
2944 |
-+{ |
2945 |
-+ struct bfq_data *bfqd = bfqq->bfqd; |
2946 |
-+ |
2947 |
-+ return bfq_bfqq_sync(bfqq) && ( |
2948 |
-+ bfqq->raising_coeff > 1 || |
2949 |
-+ (bfq_bfqq_idle_window(bfqq) && |
2950 |
-+ !(bfqd->hw_tag && |
2951 |
-+ (blk_queue_nonrot(bfqd->queue) || |
2952 |
-+ /* |
2953 |
-+ * If there are weight-raised busy queues, then do not idle |
2954 |
-+ * the disk for a sync non-weight-raised queue, and hence |
2955 |
-+ * expire the queue immediately if empty. Combined with the |
2956 |
-+ * timestamping rules of BFQ (see [1] for details), this |
2957 |
-+ * causes sync non-weight-raised queues to get a lower |
2958 |
-+ * fraction of the disk throughput, and hence reduces the rate |
2959 |
-+ * at which the processes associated to these queues ask for |
2960 |
-+ * requests from the request pool. |
2961 |
-+ * |
2962 |
-+ * This is beneficial for weight-raised processes, when the |
2963 |
-+ * system operates in request-pool saturation conditions |
2964 |
-+ * (e.g., in the presence of write hogs). In fact, if |
2965 |
-+ * non-weight-raised processes ask for requests at a lower |
2966 |
-+ * rate, then weight-raised processes have a higher |
2967 |
-+ * probability to get a request from the pool immediately |
2968 |
-+ * (or at least soon) when they need one. Hence they have a |
2969 |
-+ * higher probability to actually get a fraction of the disk |
2970 |
-+ * throughput proportional to their high weight. This is |
2971 |
-+ * especially true with NCQ-enabled drives, which enqueue |
2972 |
-+ * several requests in advance and further reorder |
2973 |
-+ * internally-queued requests. |
2974 |
-+ * |
2975 |
-+ * Mistreating non-weight-raised queues in the above-described |
2976 |
-+ * way, when there are busy weight-raised queues, seems to |
2977 |
-+ * mitigate starvation problems in the presence of heavy write |
2978 |
-+ * workloads and NCQ, and hence to guarantee a higher |
2979 |
-+ * application and system responsiveness in these hostile |
2980 |
-+ * scenarios. |
2981 |
-+ */ |
2982 |
-+ bfqd->raised_busy_queues > 0) |
2983 |
-+ ) |
2984 |
-+ ) |
2985 |
-+ ); |
2986 |
-+} |
2987 |
-+ |
2988 |
-+/* |
2989 |
-+ * If the in-service queue is empty, but it is sync and either of the following |
2990 |
-+ * conditions holds, then: 1) the queue must remain in service and cannot be |
2991 |
-+ * expired, and 2) the disk must be idled to wait for the possible arrival |
2992 |
-+ * of a new request for the queue. The conditions are: |
2993 |
-+ * - the device is rotational and not performing NCQ, and the queue has its |
2994 |
-+ * idle window set (in this case, waiting for a new request for the queue |
2995 |
-+ * is likely to boost the disk throughput); |
2996 |
-+ * - the queue is weight-raised (waiting for the request is necessary to |
2997 |
-+ * provide the queue with fairness and latency guarantees, see [1] for |
2998 |
-+ * details). |
2999 |
-+ */ |
3000 |
-+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) |
3001 |
-+{ |
3002 |
-+ struct bfq_data *bfqd = bfqq->bfqd; |
3003 |
-+ |
3004 |
-+ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && |
3005 |
-+ bfq_bfqq_must_not_expire(bfqq) && |
3006 |
-+ !bfq_queue_nonrot_noidle(bfqd, bfqq)); |
3007 |
-+} |
3008 |
-+ |
3009 |
-+/* |
3010 |
-+ * Select a queue for service. If we have a current queue in service, |
3011 |
-+ * check whether to continue servicing it, or retrieve and set a new one. |
3012 |
-+ */ |
3013 |
-+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
3014 |
-+{ |
3015 |
-+ struct bfq_queue *bfqq, *new_bfqq = NULL; |
3016 |
-+ struct request *next_rq; |
3017 |
-+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
3018 |
-+ |
3019 |
-+ bfqq = bfqd->in_service_queue; |
3020 |
-+ if (bfqq == NULL) |
3021 |
-+ goto new_queue; |
3022 |
-+ |
3023 |
-+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); |
3024 |
-+ |
3025 |
-+ /* |
3026 |
-+ * If another queue has a request waiting within our mean seek |
3027 |
-+ * distance, let it run. The expire code will check for close |
3028 |
-+ * cooperators and put the close queue at the front of the |
3029 |
-+ * service tree. If possible, merge the expiring queue with the |
3030 |
-+ * new bfqq. |
3031 |
-+ */ |
3032 |
-+ new_bfqq = bfq_close_cooperator(bfqd, bfqq); |
3033 |
-+ if (new_bfqq != NULL && bfqq->new_bfqq == NULL) |
3034 |
-+ bfq_setup_merge(bfqq, new_bfqq); |
3035 |
-+ |
3036 |
-+ if (bfq_may_expire_for_budg_timeout(bfqq) && |
3037 |
-+ !timer_pending(&bfqd->idle_slice_timer) && |
3038 |
-+ !bfq_bfqq_must_idle(bfqq)) |
3039 |
-+ goto expire; |
3040 |
-+ |
3041 |
-+ next_rq = bfqq->next_rq; |
3042 |
-+ /* |
3043 |
-+ * If bfqq has requests queued and it has enough budget left to |
3044 |
-+ * serve them, keep the queue, otherwise expire it. |
3045 |
-+ */ |
3046 |
-+ if (next_rq != NULL) { |
3047 |
-+ if (bfq_serv_to_charge(next_rq, bfqq) > |
3048 |
-+ bfq_bfqq_budget_left(bfqq)) { |
3049 |
-+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; |
3050 |
-+ goto expire; |
3051 |
-+ } else { |
3052 |
-+ /* |
3053 |
-+ * The idle timer may be pending because we may not |
3054 |
-+ * disable disk idling even when a new request arrives |
3055 |
-+ */ |
3056 |
-+ if (timer_pending(&bfqd->idle_slice_timer)) { |
3057 |
-+ /* |
3058 |
-+ * If we get here: 1) at least a new request |
3059 |
-+ * has arrived but we have not disabled the |
3060 |
-+ * timer because the request was too small, |
3061 |
-+ * 2) then the block layer has unplugged the |
3062 |
-+ * device, causing the dispatch to be invoked. |
3063 |
-+ * |
3064 |
-+ * Since the device is unplugged, now the |
3065 |
-+ * requests are probably large enough to |
3066 |
-+ * provide a reasonable throughput. |
3067 |
-+ * So we disable idling. |
3068 |
-+ */ |
3069 |
-+ bfq_clear_bfqq_wait_request(bfqq); |
3070 |
-+ del_timer(&bfqd->idle_slice_timer); |
3071 |
-+ } |
3072 |
-+ if (new_bfqq == NULL) |
3073 |
-+ goto keep_queue; |
3074 |
-+ else |
3075 |
-+ goto expire; |
3076 |
-+ } |
3077 |
-+ } |
3078 |
-+ |
3079 |
-+ /* |
3080 |
-+ * No requests pending. If the in-service queue has no cooperator and |
3081 |
-+ * still has requests in flight (possibly waiting for a completion) |
3082 |
-+ * or is idling for a new request, then keep it. |
3083 |
-+ */ |
3084 |
-+ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || |
3085 |
-+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) { |
3086 |
-+ bfqq = NULL; |
3087 |
-+ goto keep_queue; |
3088 |
-+ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { |
3089 |
-+ /* |
3090 |
-+ * Expiring the queue because there is a close cooperator, |
3091 |
-+ * cancel timer. |
3092 |
-+ */ |
3093 |
-+ bfq_clear_bfqq_wait_request(bfqq); |
3094 |
-+ del_timer(&bfqd->idle_slice_timer); |
3095 |
-+ } |
3096 |
-+ |
3097 |
-+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; |
3098 |
-+expire: |
3099 |
-+ bfq_bfqq_expire(bfqd, bfqq, 0, reason); |
3100 |
-+new_queue: |
3101 |
-+ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq); |
3102 |
-+ bfq_log(bfqd, "select_queue: new queue %d returned", |
3103 |
-+ bfqq != NULL ? bfqq->pid : 0); |
3104 |
-+keep_queue: |
3105 |
-+ return bfqq; |
3106 |
-+} |
3107 |
-+ |
3108 |
-+static void bfq_update_raising_data(struct bfq_data *bfqd, |
3109 |
-+ struct bfq_queue *bfqq) |
3110 |
-+{ |
3111 |
-+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ |
3112 |
-+ struct bfq_entity *entity = &bfqq->entity; |
3113 |
-+ |
3114 |
-+ bfq_log_bfqq(bfqd, bfqq, |
3115 |
-+ "raising period dur %u/%u msec, " |
3116 |
-+ "old raising coeff %u, w %d(%d)", |
3117 |
-+ jiffies_to_msecs(jiffies - |
3118 |
-+ bfqq->last_rais_start_finish), |
3119 |
-+ jiffies_to_msecs(bfqq->raising_cur_max_time), |
3120 |
-+ bfqq->raising_coeff, |
3121 |
-+ bfqq->entity.weight, bfqq->entity.orig_weight); |
3122 |
-+ |
3123 |
-+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != |
3124 |
-+ entity->orig_weight * bfqq->raising_coeff); |
3125 |
-+ if (entity->ioprio_changed) |
3126 |
-+ bfq_log_bfqq(bfqd, bfqq, |
3127 |
-+ "WARN: pending prio change"); |
3128 |
-+ /* |
3129 |
-+ * If too much time has elapsed from the beginning |
3130 |
-+ * of this weight-raising, stop it. |
3131 |
-+ */ |
3132 |
-+ if (jiffies - bfqq->last_rais_start_finish > |
3133 |
-+ bfqq->raising_cur_max_time) { |
3134 |
-+ bfqq->last_rais_start_finish = jiffies; |
3135 |
-+ bfq_log_bfqq(bfqd, bfqq, |
3136 |
-+ "wrais ending at %llu msec," |
3137 |
-+ "rais_max_time %u", |
3138 |
-+ bfqq->last_rais_start_finish, |
3139 |
-+ jiffies_to_msecs(bfqq-> |
3140 |
-+ raising_cur_max_time)); |
3141 |
-+ bfq_bfqq_end_raising(bfqq); |
3142 |
-+ __bfq_entity_update_weight_prio( |
3143 |
-+ bfq_entity_service_tree(entity), |
3144 |
-+ entity); |
3145 |
-+ } |
3146 |
-+ } |
3147 |
-+} |
3148 |
-+ |
3149 |
-+/* |
3150 |
-+ * Dispatch one request from bfqq, moving it to the request queue |
3151 |
-+ * dispatch list. |
3152 |
-+ */ |
3153 |
-+static int bfq_dispatch_request(struct bfq_data *bfqd, |
3154 |
-+ struct bfq_queue *bfqq) |
3155 |
-+{ |
3156 |
-+ int dispatched = 0; |
3157 |
-+ struct request *rq; |
3158 |
-+ unsigned long service_to_charge; |
3159 |
-+ |
3160 |
-+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); |
3161 |
-+ |
3162 |
-+ /* Follow expired path, else get first next available. */ |
3163 |
-+ rq = bfq_check_fifo(bfqq); |
3164 |
-+ if (rq == NULL) |
3165 |
-+ rq = bfqq->next_rq; |
3166 |
-+ service_to_charge = bfq_serv_to_charge(rq, bfqq); |
3167 |
-+ |
3168 |
-+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { |
3169 |
-+ /* |
3170 |
-+ * This may happen if the next rq is chosen |
3171 |
-+ * in fifo order instead of sector order. |
3172 |
-+ * The budget is properly dimensioned |
3173 |
-+ * to be always sufficient to serve the next request |
3174 |
-+ * only if it is chosen in sector order. The reason is |
3175 |
-+ * that it would be quite inefficient and little useful |
3176 |
-+ * to always make sure that the budget is large enough |
3177 |
-+ * to serve even the possible next rq in fifo order. |
3178 |
-+ * In fact, requests are seldom served in fifo order. |
3179 |
-+ * |
3180 |
-+ * Expire the queue for budget exhaustion, and |
3181 |
-+ * make sure that the next act_budget is enough |
3182 |
-+ * to serve the next request, even if it comes |
3183 |
-+ * from the fifo expired path. |
3184 |
-+ */ |
3185 |
-+ bfqq->next_rq = rq; |
3186 |
-+ /* |
3187 |
-+ * Since this dispatch is failed, make sure that |
3188 |
-+ * a new one will be performed |
3189 |
-+ */ |
3190 |
-+ if (!bfqd->rq_in_driver) |
3191 |
-+ bfq_schedule_dispatch(bfqd); |
3192 |
-+ goto expire; |
3193 |
-+ } |
3194 |
-+ |
3195 |
-+ /* Finally, insert request into driver dispatch list. */ |
3196 |
-+ bfq_bfqq_served(bfqq, service_to_charge); |
3197 |
-+ bfq_dispatch_insert(bfqd->queue, rq); |
3198 |
-+ |
3199 |
-+ bfq_update_raising_data(bfqd, bfqq); |
3200 |
-+ |
3201 |
-+ bfq_log_bfqq(bfqd, bfqq, |
3202 |
-+ "dispatched %u sec req (%llu), budg left %lu", |
3203 |
-+ blk_rq_sectors(rq), |
3204 |
-+ (long long unsigned)blk_rq_pos(rq), |
3205 |
-+ bfq_bfqq_budget_left(bfqq)); |
3206 |
-+ |
3207 |
-+ dispatched++; |
3208 |
-+ |
3209 |
-+ if (bfqd->in_service_bic == NULL) { |
3210 |
-+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); |
3211 |
-+ bfqd->in_service_bic = RQ_BIC(rq); |
3212 |
-+ } |
3213 |
-+ |
3214 |
-+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && |
3215 |
-+ dispatched >= bfqd->bfq_max_budget_async_rq) || |
3216 |
-+ bfq_class_idle(bfqq))) |
3217 |
-+ goto expire; |
3218 |
-+ |
3219 |
-+ return dispatched; |
3220 |
-+ |
3221 |
-+expire: |
3222 |
-+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); |
3223 |
-+ return dispatched; |
3224 |
-+} |
3225 |
-+ |
3226 |
-+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) |
3227 |
-+{ |
3228 |
-+ int dispatched = 0; |
3229 |
-+ |
3230 |
-+ while (bfqq->next_rq != NULL) { |
3231 |
-+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); |
3232 |
-+ dispatched++; |
3233 |
-+ } |
3234 |
-+ |
3235 |
-+ BUG_ON(!list_empty(&bfqq->fifo)); |
3236 |
-+ return dispatched; |
3237 |
-+} |
3238 |
-+ |
3239 |
-+/* |
3240 |
-+ * Drain our current requests. Used for barriers and when switching |
3241 |
-+ * io schedulers on-the-fly. |
3242 |
-+ */ |
3243 |
-+static int bfq_forced_dispatch(struct bfq_data *bfqd) |
3244 |
-+{ |
3245 |
-+ struct bfq_queue *bfqq, *n; |
3246 |
-+ struct bfq_service_tree *st; |
3247 |
-+ int dispatched = 0; |
3248 |
-+ |
3249 |
-+ bfqq = bfqd->in_service_queue; |
3250 |
-+ if (bfqq != NULL) |
3251 |
-+ __bfq_bfqq_expire(bfqd, bfqq); |
3252 |
-+ |
3253 |
-+ /* |
3254 |
-+ * Loop through classes, and be careful to leave the scheduler |
3255 |
-+ * in a consistent state, as feedback mechanisms and vtime |
3256 |
-+ * updates cannot be disabled during the process. |
3257 |
-+ */ |
3258 |
-+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { |
3259 |
-+ st = bfq_entity_service_tree(&bfqq->entity); |
3260 |
-+ |
3261 |
-+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); |
3262 |
-+ bfqq->max_budget = bfq_max_budget(bfqd); |
3263 |
-+ |
3264 |
-+ bfq_forget_idle(st); |
3265 |
-+ } |
3266 |
-+ |
3267 |
-+ BUG_ON(bfqd->busy_queues != 0); |
3268 |
-+ |
3269 |
-+ return dispatched; |
3270 |
-+} |
3271 |
-+ |
3272 |
-+static int bfq_dispatch_requests(struct request_queue *q, int force) |
3273 |
-+{ |
3274 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
3275 |
-+ struct bfq_queue *bfqq; |
3276 |
-+ int max_dispatch; |
3277 |
-+ |
3278 |
-+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); |
3279 |
-+ if (bfqd->busy_queues == 0) |
3280 |
-+ return 0; |
3281 |
-+ |
3282 |
-+ if (unlikely(force)) |
3283 |
-+ return bfq_forced_dispatch(bfqd); |
3284 |
-+ |
3285 |
-+ bfqq = bfq_select_queue(bfqd); |
3286 |
-+ if (bfqq == NULL) |
3287 |
-+ return 0; |
3288 |
-+ |
3289 |
-+ max_dispatch = bfqd->bfq_quantum; |
3290 |
-+ if (bfq_class_idle(bfqq)) |
3291 |
-+ max_dispatch = 1; |
3292 |
-+ |
3293 |
-+ if (!bfq_bfqq_sync(bfqq)) |
3294 |
-+ max_dispatch = bfqd->bfq_max_budget_async_rq; |
3295 |
-+ |
3296 |
-+ if (bfqq->dispatched >= max_dispatch) { |
3297 |
-+ if (bfqd->busy_queues > 1) |
3298 |
-+ return 0; |
3299 |
-+ if (bfqq->dispatched >= 4 * max_dispatch) |
3300 |
-+ return 0; |
3301 |
-+ } |
3302 |
-+ |
3303 |
-+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) |
3304 |
-+ return 0; |
3305 |
-+ |
3306 |
-+ bfq_clear_bfqq_wait_request(bfqq); |
3307 |
-+ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
3308 |
-+ |
3309 |
-+ if (!bfq_dispatch_request(bfqd, bfqq)) |
3310 |
-+ return 0; |
3311 |
-+ |
3312 |
-+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)", |
3313 |
-+ bfqq->pid, max_dispatch); |
3314 |
-+ |
3315 |
-+ return 1; |
3316 |
-+} |
3317 |
-+ |
3318 |
-+/* |
3319 |
-+ * Task holds one reference to the queue, dropped when task exits. Each rq |
3320 |
-+ * in-flight on this queue also holds a reference, dropped when rq is freed. |
3321 |
-+ * |
3322 |
-+ * Queue lock must be held here. |
3323 |
-+ */ |
3324 |
-+static void bfq_put_queue(struct bfq_queue *bfqq) |
3325 |
-+{ |
3326 |
-+ struct bfq_data *bfqd = bfqq->bfqd; |
3327 |
-+ |
3328 |
-+ BUG_ON(atomic_read(&bfqq->ref) <= 0); |
3329 |
-+ |
3330 |
-+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, |
3331 |
-+ atomic_read(&bfqq->ref)); |
3332 |
-+ if (!atomic_dec_and_test(&bfqq->ref)) |
3333 |
-+ return; |
3334 |
-+ |
3335 |
-+ BUG_ON(rb_first(&bfqq->sort_list) != NULL); |
3336 |
-+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); |
3337 |
-+ BUG_ON(bfqq->entity.tree != NULL); |
3338 |
-+ BUG_ON(bfq_bfqq_busy(bfqq)); |
3339 |
-+ BUG_ON(bfqd->in_service_queue == bfqq); |
3340 |
-+ |
3341 |
-+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); |
3342 |
-+ |
3343 |
-+ kmem_cache_free(bfq_pool, bfqq); |
3344 |
-+} |
3345 |
-+ |
3346 |
-+static void bfq_put_cooperator(struct bfq_queue *bfqq) |
3347 |
-+{ |
3348 |
-+ struct bfq_queue *__bfqq, *next; |
3349 |
-+ |
3350 |
-+ /* |
3351 |
-+ * If this queue was scheduled to merge with another queue, be |
3352 |
-+ * sure to drop the reference taken on that queue (and others in |
3353 |
-+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. |
3354 |
-+ */ |
3355 |
-+ __bfqq = bfqq->new_bfqq; |
3356 |
-+ while (__bfqq) { |
3357 |
-+ if (__bfqq == bfqq) { |
3358 |
-+ WARN(1, "bfqq->new_bfqq loop detected.\n"); |
3359 |
-+ break; |
3360 |
-+ } |
3361 |
-+ next = __bfqq->new_bfqq; |
3362 |
-+ bfq_put_queue(__bfqq); |
3363 |
-+ __bfqq = next; |
3364 |
-+ } |
3365 |
-+} |
3366 |
-+ |
3367 |
-+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
3368 |
-+{ |
3369 |
-+ if (bfqq == bfqd->in_service_queue) { |
3370 |
-+ __bfq_bfqq_expire(bfqd, bfqq); |
3371 |
-+ bfq_schedule_dispatch(bfqd); |
3372 |
-+ } |
3373 |
-+ |
3374 |
-+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, |
3375 |
-+ atomic_read(&bfqq->ref)); |
3376 |
-+ |
3377 |
-+ bfq_put_cooperator(bfqq); |
3378 |
-+ |
3379 |
-+ bfq_put_queue(bfqq); |
3380 |
-+} |
3381 |
-+ |
3382 |
-+static void bfq_init_icq(struct io_cq *icq) |
3383 |
-+{ |
3384 |
-+ struct bfq_io_cq *bic = icq_to_bic(icq); |
3385 |
-+ |
3386 |
-+ bic->ttime.last_end_request = jiffies; |
3387 |
-+} |
3388 |
-+ |
3389 |
-+static void bfq_exit_icq(struct io_cq *icq) |
3390 |
-+{ |
3391 |
-+ struct bfq_io_cq *bic = icq_to_bic(icq); |
3392 |
-+ struct bfq_data *bfqd = bic_to_bfqd(bic); |
3393 |
-+ |
3394 |
-+ if (bic->bfqq[BLK_RW_ASYNC]) { |
3395 |
-+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); |
3396 |
-+ bic->bfqq[BLK_RW_ASYNC] = NULL; |
3397 |
-+ } |
3398 |
-+ |
3399 |
-+ if (bic->bfqq[BLK_RW_SYNC]) { |
3400 |
-+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); |
3401 |
-+ bic->bfqq[BLK_RW_SYNC] = NULL; |
3402 |
-+ } |
3403 |
-+} |
3404 |
-+ |
3405 |
-+/* |
3406 |
-+ * Update the entity prio values; note that the new values will not |
3407 |
-+ * be used until the next (re)activation. |
3408 |
-+ */ |
3409 |
-+static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
3410 |
-+{ |
3411 |
-+ struct task_struct *tsk = current; |
3412 |
-+ int ioprio_class; |
3413 |
-+ |
3414 |
-+ if (!bfq_bfqq_prio_changed(bfqq)) |
3415 |
-+ return; |
3416 |
-+ |
3417 |
-+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
3418 |
-+ switch (ioprio_class) { |
3419 |
-+ default: |
3420 |
-+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev, |
3421 |
-+ "bfq: bad prio %x\n", ioprio_class); |
3422 |
-+ case IOPRIO_CLASS_NONE: |
3423 |
-+ /* |
3424 |
-+ * No prio set, inherit CPU scheduling settings. |
3425 |
-+ */ |
3426 |
-+ bfqq->entity.new_ioprio = task_nice_ioprio(tsk); |
3427 |
-+ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); |
3428 |
-+ break; |
3429 |
-+ case IOPRIO_CLASS_RT: |
3430 |
-+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
3431 |
-+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; |
3432 |
-+ break; |
3433 |
-+ case IOPRIO_CLASS_BE: |
3434 |
-+ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
3435 |
-+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; |
3436 |
-+ break; |
3437 |
-+ case IOPRIO_CLASS_IDLE: |
3438 |
-+ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; |
3439 |
-+ bfqq->entity.new_ioprio = 7; |
3440 |
-+ bfq_clear_bfqq_idle_window(bfqq); |
3441 |
-+ break; |
3442 |
-+ } |
3443 |
-+ |
3444 |
-+ bfqq->entity.ioprio_changed = 1; |
3445 |
-+ |
3446 |
-+ /* |
3447 |
-+ * Keep track of original prio settings in case we have to temporarily |
3448 |
-+ * elevate the priority of this queue. |
3449 |
-+ */ |
3450 |
-+ bfqq->org_ioprio = bfqq->entity.new_ioprio; |
3451 |
-+ bfq_clear_bfqq_prio_changed(bfqq); |
3452 |
-+} |
3453 |
-+ |
3454 |
-+static void bfq_changed_ioprio(struct bfq_io_cq *bic) |
3455 |
-+{ |
3456 |
-+ struct bfq_data *bfqd; |
3457 |
-+ struct bfq_queue *bfqq, *new_bfqq; |
3458 |
-+ struct bfq_group *bfqg; |
3459 |
-+ unsigned long uninitialized_var(flags); |
3460 |
-+ int ioprio = bic->icq.ioc->ioprio; |
3461 |
-+ |
3462 |
-+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), |
3463 |
-+ &flags); |
3464 |
-+ /* |
3465 |
-+ * This condition may trigger on a newly created bic, be sure to drop |
3466 |
-+ * the lock before returning. |
3467 |
-+ */ |
3468 |
-+ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) |
3469 |
-+ goto out; |
3470 |
-+ |
3471 |
-+ bfqq = bic->bfqq[BLK_RW_ASYNC]; |
3472 |
-+ if (bfqq != NULL) { |
3473 |
-+ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, |
3474 |
-+ sched_data); |
3475 |
-+ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, |
3476 |
-+ GFP_ATOMIC); |
3477 |
-+ if (new_bfqq != NULL) { |
3478 |
-+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; |
3479 |
-+ bfq_log_bfqq(bfqd, bfqq, |
3480 |
-+ "changed_ioprio: bfqq %p %d", |
3481 |
-+ bfqq, atomic_read(&bfqq->ref)); |
3482 |
-+ bfq_put_queue(bfqq); |
3483 |
-+ } |
3484 |
-+ } |
3485 |
-+ |
3486 |
-+ bfqq = bic->bfqq[BLK_RW_SYNC]; |
3487 |
-+ if (bfqq != NULL) |
3488 |
-+ bfq_mark_bfqq_prio_changed(bfqq); |
3489 |
-+ |
3490 |
-+ bic->ioprio = ioprio; |
3491 |
-+ |
3492 |
-+out: |
3493 |
-+ bfq_put_bfqd_unlock(bfqd, &flags); |
3494 |
-+} |
3495 |
-+ |
3496 |
-+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
3497 |
-+ pid_t pid, int is_sync) |
3498 |
-+{ |
3499 |
-+ RB_CLEAR_NODE(&bfqq->entity.rb_node); |
3500 |
-+ INIT_LIST_HEAD(&bfqq->fifo); |
3501 |
-+ |
3502 |
-+ atomic_set(&bfqq->ref, 0); |
3503 |
-+ bfqq->bfqd = bfqd; |
3504 |
-+ |
3505 |
-+ bfq_mark_bfqq_prio_changed(bfqq); |
3506 |
-+ |
3507 |
-+ if (is_sync) { |
3508 |
-+ if (!bfq_class_idle(bfqq)) |
3509 |
-+ bfq_mark_bfqq_idle_window(bfqq); |
3510 |
-+ bfq_mark_bfqq_sync(bfqq); |
3511 |
-+ } |
3512 |
-+ |
3513 |
-+ /* Tentative initial value to trade off between thr and lat */ |
3514 |
-+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; |
3515 |
-+ bfqq->pid = pid; |
3516 |
-+ |
3517 |
-+ bfqq->raising_coeff = 1; |
3518 |
-+ bfqq->last_rais_start_finish = 0; |
3519 |
-+ bfqq->soft_rt_next_start = -1; |
3520 |
-+} |
3521 |
-+ |
3522 |
-+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, |
3523 |
-+ struct bfq_group *bfqg, |
3524 |
-+ int is_sync, |
3525 |
-+ struct bfq_io_cq *bic, |
3526 |
-+ gfp_t gfp_mask) |
3527 |
-+{ |
3528 |
-+ struct bfq_queue *bfqq, *new_bfqq = NULL; |
3529 |
-+ |
3530 |
-+retry: |
3531 |
-+ /* bic always exists here */ |
3532 |
-+ bfqq = bic_to_bfqq(bic, is_sync); |
3533 |
-+ |
3534 |
-+ /* |
3535 |
-+ * Always try a new alloc if we fall back to the OOM bfqq |
3536 |
-+ * originally, since it should just be a temporary situation. |
3537 |
-+ */ |
3538 |
-+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { |
3539 |
-+ bfqq = NULL; |
3540 |
-+ if (new_bfqq != NULL) { |
3541 |
-+ bfqq = new_bfqq; |
3542 |
-+ new_bfqq = NULL; |
3543 |
-+ } else if (gfp_mask & __GFP_WAIT) { |
3544 |
-+ spin_unlock_irq(bfqd->queue->queue_lock); |
3545 |
-+ new_bfqq = kmem_cache_alloc_node(bfq_pool, |
3546 |
-+ gfp_mask | __GFP_ZERO, |
3547 |
-+ bfqd->queue->node); |
3548 |
-+ spin_lock_irq(bfqd->queue->queue_lock); |
3549 |
-+ if (new_bfqq != NULL) |
3550 |
-+ goto retry; |
3551 |
-+ } else { |
3552 |
-+ bfqq = kmem_cache_alloc_node(bfq_pool, |
3553 |
-+ gfp_mask | __GFP_ZERO, |
3554 |
-+ bfqd->queue->node); |
3555 |
-+ } |
3556 |
-+ |
3557 |
-+ if (bfqq != NULL) { |
3558 |
-+ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); |
3559 |
-+ bfq_log_bfqq(bfqd, bfqq, "allocated"); |
3560 |
-+ } else { |
3561 |
-+ bfqq = &bfqd->oom_bfqq; |
3562 |
-+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); |
3563 |
-+ } |
3564 |
-+ |
3565 |
-+ bfq_init_prio_data(bfqq, bic); |
3566 |
-+ bfq_init_entity(&bfqq->entity, bfqg); |
3567 |
-+ } |
3568 |
-+ |
3569 |
-+ if (new_bfqq != NULL) |
3570 |
-+ kmem_cache_free(bfq_pool, new_bfqq); |
3571 |
-+ |
3572 |
-+ return bfqq; |
3573 |
-+} |
3574 |
-+ |
3575 |
-+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, |
3576 |
-+ struct bfq_group *bfqg, |
3577 |
-+ int ioprio_class, int ioprio) |
3578 |
-+{ |
3579 |
-+ switch (ioprio_class) { |
3580 |
-+ case IOPRIO_CLASS_RT: |
3581 |
-+ return &bfqg->async_bfqq[0][ioprio]; |
3582 |
-+ case IOPRIO_CLASS_NONE: |
3583 |
-+ ioprio = IOPRIO_NORM; |
3584 |
-+ /* fall through */ |
3585 |
-+ case IOPRIO_CLASS_BE: |
3586 |
-+ return &bfqg->async_bfqq[1][ioprio]; |
3587 |
-+ case IOPRIO_CLASS_IDLE: |
3588 |
-+ return &bfqg->async_idle_bfqq; |
3589 |
-+ default: |
3590 |
-+ BUG(); |
3591 |
-+ } |
3592 |
-+} |
3593 |
-+ |
3594 |
-+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
3595 |
-+ struct bfq_group *bfqg, int is_sync, |
3596 |
-+ struct bfq_io_cq *bic, gfp_t gfp_mask) |
3597 |
-+{ |
3598 |
-+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
3599 |
-+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
3600 |
-+ struct bfq_queue **async_bfqq = NULL; |
3601 |
-+ struct bfq_queue *bfqq = NULL; |
3602 |
-+ |
3603 |
-+ if (!is_sync) { |
3604 |
-+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, |
3605 |
-+ ioprio); |
3606 |
-+ bfqq = *async_bfqq; |
3607 |
-+ } |
3608 |
-+ |
3609 |
-+ if (bfqq == NULL) |
3610 |
-+ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); |
3611 |
-+ |
3612 |
-+ /* |
3613 |
-+ * Pin the queue now that it's allocated, scheduler exit will prune it. |
3614 |
-+ */ |
3615 |
-+ if (!is_sync && *async_bfqq == NULL) { |
3616 |
-+ atomic_inc(&bfqq->ref); |
3617 |
-+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", |
3618 |
-+ bfqq, atomic_read(&bfqq->ref)); |
3619 |
-+ *async_bfqq = bfqq; |
3620 |
-+ } |
3621 |
-+ |
3622 |
-+ atomic_inc(&bfqq->ref); |
3623 |
-+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, |
3624 |
-+ atomic_read(&bfqq->ref)); |
3625 |
-+ return bfqq; |
3626 |
-+} |
3627 |
-+ |
3628 |
-+static void bfq_update_io_thinktime(struct bfq_data *bfqd, |
3629 |
-+ struct bfq_io_cq *bic) |
3630 |
-+{ |
3631 |
-+ unsigned long elapsed = jiffies - bic->ttime.last_end_request; |
3632 |
-+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); |
3633 |
-+ |
3634 |
-+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; |
3635 |
-+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; |
3636 |
-+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / |
3637 |
-+ bic->ttime.ttime_samples; |
3638 |
-+} |
3639 |
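A small standalone sketch (hypothetical sample values, not part of the original patch) of the fixed-point running average used by bfq_update_io_thinktime() above: old data decays by 7/8 on every sample, new data enters with weight 256, and the mean is the rounded ratio of the two accumulators.

#include <stdio.h>

int main(void)
{
	unsigned long samples = 0, total = 0, mean;
	unsigned long ttime[] = { 4, 6, 5, 40 };   /* hypothetical think times, in jiffies */

	for (int i = 0; i < 4; i++) {
		/* Same update rules as above: decay by 7/8, add the new value with weight 256. */
		samples = (7 * samples + 256) / 8;
		total = (7 * total + 256 * ttime[i]) / 8;
		mean = (total + 128) / samples;
		printf("sample %d: mean think time = %lu jiffies\n", i, mean);
	}
	return 0;
}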
-+ |
3640 |
-+static void bfq_update_io_seektime(struct bfq_data *bfqd, |
3641 |
-+ struct bfq_queue *bfqq, |
3642 |
-+ struct request *rq) |
3643 |
-+{ |
3644 |
-+ sector_t sdist; |
3645 |
-+ u64 total; |
3646 |
-+ |
3647 |
-+ if (bfqq->last_request_pos < blk_rq_pos(rq)) |
3648 |
-+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; |
3649 |
-+ else |
3650 |
-+ sdist = bfqq->last_request_pos - blk_rq_pos(rq); |
3651 |
-+ |
3652 |
-+ /* |
3653 |
-+ * Don't allow the seek distance to get too large from the |
3654 |
-+ * odd fragment, pagein, etc. |
3655 |
-+ */ |
3656 |
-+ if (bfqq->seek_samples == 0) /* first request, not really a seek */ |
3657 |
-+ sdist = 0; |
3658 |
-+ else if (bfqq->seek_samples <= 60) /* second & third seek */ |
3659 |
-+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); |
3660 |
-+ else |
3661 |
-+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); |
3662 |
-+ |
3663 |
-+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; |
3664 |
-+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; |
3665 |
-+ total = bfqq->seek_total + (bfqq->seek_samples/2); |
3666 |
-+ do_div(total, bfqq->seek_samples); |
3667 |
-+ bfqq->seek_mean = (sector_t)total; |
3668 |
-+ |
3669 |
-+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, |
3670 |
-+ (u64)bfqq->seek_mean); |
3671 |
-+} |
3672 |
-+ |
3673 |
-+/* |
3674 |
-+ * Disable idle window if the process thinks too long or seeks so much that |
3675 |
-+ * it doesn't matter. |
3676 |
-+ */ |
3677 |
-+static void bfq_update_idle_window(struct bfq_data *bfqd, |
3678 |
-+ struct bfq_queue *bfqq, |
3679 |
-+ struct bfq_io_cq *bic) |
3680 |
-+{ |
3681 |
-+ int enable_idle; |
3682 |
-+ |
3683 |
-+ /* Don't idle for async or idle io prio class. */ |
3684 |
-+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) |
3685 |
-+ return; |
3686 |
-+ |
3687 |
-+ enable_idle = bfq_bfqq_idle_window(bfqq); |
3688 |
-+ |
3689 |
-+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || |
3690 |
-+ bfqd->bfq_slice_idle == 0 || |
3691 |
-+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && |
3692 |
-+ bfqq->raising_coeff == 1)) |
3693 |
-+ enable_idle = 0; |
3694 |
-+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { |
3695 |
-+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && |
3696 |
-+ bfqq->raising_coeff == 1) |
3697 |
-+ enable_idle = 0; |
3698 |
-+ else |
3699 |
-+ enable_idle = 1; |
3700 |
-+ } |
3701 |
-+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", |
3702 |
-+ enable_idle); |
3703 |
-+ |
3704 |
-+ if (enable_idle) |
3705 |
-+ bfq_mark_bfqq_idle_window(bfqq); |
3706 |
-+ else |
3707 |
-+ bfq_clear_bfqq_idle_window(bfqq); |
3708 |
-+} |
3709 |
-+ |
3710 |
-+/* |
3711 |
-+ * Called when a new fs request (rq) is added to bfqq. Check if there's |
3712 |
-+ * something we should do about it. |
3713 |
-+ */ |
3714 |
-+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
3715 |
-+ struct request *rq) |
3716 |
-+{ |
3717 |
-+ struct bfq_io_cq *bic = RQ_BIC(rq); |
3718 |
-+ |
3719 |
-+ if (rq->cmd_flags & REQ_META) |
3720 |
-+ bfqq->meta_pending++; |
3721 |
-+ |
3722 |
-+ bfq_update_io_thinktime(bfqd, bic); |
3723 |
-+ bfq_update_io_seektime(bfqd, bfqq, rq); |
3724 |
-+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
3725 |
-+ !BFQQ_SEEKY(bfqq)) |
3726 |
-+ bfq_update_idle_window(bfqd, bfqq, bic); |
3727 |
-+ |
3728 |
-+ bfq_log_bfqq(bfqd, bfqq, |
3729 |
-+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", |
3730 |
-+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), |
3731 |
-+ (long long unsigned)bfqq->seek_mean); |
3732 |
-+ |
3733 |
-+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); |
3734 |
-+ |
3735 |
-+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { |
3736 |
-+ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && |
3737 |
-+ blk_rq_sectors(rq) < 32; |
3738 |
-+ int budget_timeout = bfq_bfqq_budget_timeout(bfqq); |
3739 |
-+ |
3740 |
-+ /* |
3741 |
-+ * There is just this request queued: if the request |
3742 |
-+ * is small and the queue is not to be expired, then |
3743 |
-+ * just exit. |
3744 |
-+ * |
3745 |
-+ * In this way, if the disk is being idled to wait for |
3746 |
-+ * a new request from the in-service queue, we avoid |
3747 |
-+ * unplugging the device and committing the disk to serve |
3748 |
-+ * just a small request. On the contrary, we wait for |
3749 |
-+ * the block layer to decide when to unplug the device: |
3750 |
-+ * hopefully, new requests will be merged to this one |
3751 |
-+ * quickly, then the device will be unplugged and |
3752 |
-+ * larger requests will be dispatched. |
3753 |
-+ */ |
3754 |
-+ if (small_req && !budget_timeout) |
3755 |
-+ return; |
3756 |
-+ |
3757 |
-+ /* |
3758 |
-+ * A large enough request arrived, or the queue is to |
3759 |
-+ * be expired: in both cases disk idling is to be |
3760 |
-+ * stopped, so clear wait_request flag and reset |
3761 |
-+ * timer. |
3762 |
-+ */ |
3763 |
-+ bfq_clear_bfqq_wait_request(bfqq); |
3764 |
-+ del_timer(&bfqd->idle_slice_timer); |
3765 |
-+ |
3766 |
-+ /* |
3767 |
-+ * The queue is not empty, because a new request just |
3768 |
-+ * arrived. Hence we can safely expire the queue, in |
3769 |
-+ * case of budget timeout, without risking that the |
3770 |
-+ * timestamps of the queue are not updated correctly. |
3771 |
-+ * See [1] for more details. |
3772 |
-+ */ |
3773 |
-+ if (budget_timeout) |
3774 |
-+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); |
3775 |
-+ |
3776 |
-+ /* |
3777 |
-+ * Let the request rip immediately, or let a new queue be |
3778 |
-+ * selected if bfqq has just been expired. |
3779 |
-+ */ |
3780 |
-+ __blk_run_queue(bfqd->queue); |
3781 |
-+ } |
3782 |
-+} |
3783 |
-+ |
3784 |
-+static void bfq_insert_request(struct request_queue *q, struct request *rq) |
3785 |
-+{ |
3786 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
3787 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
3788 |
-+ |
3789 |
-+ assert_spin_locked(bfqd->queue->queue_lock); |
3790 |
-+ bfq_init_prio_data(bfqq, RQ_BIC(rq)); |
3791 |
-+ |
3792 |
-+ bfq_add_rq_rb(rq); |
3793 |
-+ |
3794 |
-+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); |
3795 |
-+ list_add_tail(&rq->queuelist, &bfqq->fifo); |
3796 |
-+ |
3797 |
-+ bfq_rq_enqueued(bfqd, bfqq, rq); |
3798 |
-+} |
3799 |
-+ |
3800 |
-+static void bfq_update_hw_tag(struct bfq_data *bfqd) |
3801 |
-+{ |
3802 |
-+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, |
3803 |
-+ bfqd->rq_in_driver); |
3804 |
-+ |
3805 |
-+ if (bfqd->hw_tag == 1) |
3806 |
-+ return; |
3807 |
-+ |
3808 |
-+ /* |
3809 |
-+ * This sample is valid if the number of outstanding requests |
3810 |
-+ * is large enough to allow a queueing behavior. Note that the |
3811 |
-+ * sum is not exact, as it's not taking into account deactivated |
3812 |
-+ * requests. |
3813 |
-+ */ |
3814 |
-+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) |
3815 |
-+ return; |
3816 |
-+ |
3817 |
-+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) |
3818 |
-+ return; |
3819 |
-+ |
3820 |
-+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; |
3821 |
-+ bfqd->max_rq_in_driver = 0; |
3822 |
-+ bfqd->hw_tag_samples = 0; |
3823 |
-+} |
3824 |
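The sampling idea behind bfq_update_hw_tag() above, reduced to a hedged standalone sketch: the threshold and sample-count constants and the load pattern below are assumptions chosen for illustration, not values taken from the original patch.

#include <stdio.h>

#define HW_QUEUE_THRESHOLD 4    /* assumed stand-in for BFQ_HW_QUEUE_THRESHOLD */
#define HW_QUEUE_SAMPLES  32    /* assumed stand-in for BFQ_HW_QUEUE_SAMPLES */

int main(void)
{
	int max_rq_in_driver = 0, hw_tag_samples = 0, hw_tag = -1;

	for (int i = 0; i < 200; i++) {
		int rq_in_driver = (i % 8) + 1;    /* hypothetical outstanding requests */

		if (rq_in_driver > max_rq_in_driver)
			max_rq_in_driver = rq_in_driver;
		if (rq_in_driver < HW_QUEUE_THRESHOLD)
			continue;                  /* not enough load: sample not valid */
		if (hw_tag_samples++ < HW_QUEUE_SAMPLES)
			continue;                  /* keep observing before deciding */
		hw_tag = max_rq_in_driver > HW_QUEUE_THRESHOLD;
		break;
	}
	printf("hw_tag = %d\n", hw_tag);   /* 1 means the device queues internally (NCQ-like) */
	return 0;
}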
-+ |
3825 |
-+static void bfq_completed_request(struct request_queue *q, struct request *rq) |
3826 |
-+{ |
3827 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
3828 |
-+ struct bfq_data *bfqd = bfqq->bfqd; |
3829 |
-+ const int sync = rq_is_sync(rq); |
3830 |
-+ |
3831 |
-+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", |
3832 |
-+ blk_rq_sectors(rq), sync); |
3833 |
-+ |
3834 |
-+ bfq_update_hw_tag(bfqd); |
3835 |
-+ |
3836 |
-+ WARN_ON(!bfqd->rq_in_driver); |
3837 |
-+ WARN_ON(!bfqq->dispatched); |
3838 |
-+ bfqd->rq_in_driver--; |
3839 |
-+ bfqq->dispatched--; |
3840 |
-+ |
3841 |
-+ if (bfq_bfqq_sync(bfqq)) |
3842 |
-+ bfqd->sync_flight--; |
3843 |
-+ |
3844 |
-+ if (sync) |
3845 |
-+ RQ_BIC(rq)->ttime.last_end_request = jiffies; |
3846 |
-+ |
3847 |
-+ /* |
3848 |
-+ * The computation of softrt_next_start was scheduled for the next |
3849 |
-+ * request completion: it is now time to compute it. |
3850 |
-+ */ |
3851 |
-+ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)) |
3852 |
-+ bfqq->soft_rt_next_start = |
3853 |
-+ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
3854 |
-+ |
3855 |
-+ /* |
3856 |
-+ * If this is the in-service queue, check if it needs to be expired, |
3857 |
-+ * or if we want to idle in case it has no pending requests. |
3858 |
-+ */ |
3859 |
-+ if (bfqd->in_service_queue == bfqq) { |
3860 |
-+ if (bfq_bfqq_budget_new(bfqq)) |
3861 |
-+ bfq_set_budget_timeout(bfqd); |
3862 |
-+ |
3863 |
-+ if (bfq_bfqq_must_idle(bfqq)) { |
3864 |
-+ bfq_arm_slice_timer(bfqd); |
3865 |
-+ goto out; |
3866 |
-+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) |
3867 |
-+ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); |
3868 |
-+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && |
3869 |
-+ (bfqq->dispatched == 0 || |
3870 |
-+ !bfq_bfqq_must_not_expire(bfqq))) |
3871 |
-+ bfq_bfqq_expire(bfqd, bfqq, 0, |
3872 |
-+ BFQ_BFQQ_NO_MORE_REQUESTS); |
3873 |
-+ } |
3874 |
-+ |
3875 |
-+ if (!bfqd->rq_in_driver) |
3876 |
-+ bfq_schedule_dispatch(bfqd); |
3877 |
-+ |
3878 |
-+out: |
3879 |
-+ return; |
3880 |
-+} |
3881 |
-+ |
3882 |
-+static inline int __bfq_may_queue(struct bfq_queue *bfqq) |
3883 |
-+{ |
3884 |
-+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { |
3885 |
-+ bfq_clear_bfqq_must_alloc(bfqq); |
3886 |
-+ return ELV_MQUEUE_MUST; |
3887 |
-+ } |
3888 |
-+ |
3889 |
-+ return ELV_MQUEUE_MAY; |
3890 |
-+} |
3891 |
-+ |
3892 |
-+static int bfq_may_queue(struct request_queue *q, int rw) |
3893 |
-+{ |
3894 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
3895 |
-+ struct task_struct *tsk = current; |
3896 |
-+ struct bfq_io_cq *bic; |
3897 |
-+ struct bfq_queue *bfqq; |
3898 |
-+ |
3899 |
-+ /* |
3900 |
-+ * Don't force setup of a queue from here, as a call to may_queue |
3901 |
-+ * does not necessarily imply that a request actually will be queued. |
3902 |
-+ * So just lookup a possibly existing queue, or return 'may queue' |
3903 |
-+ * if that fails. |
3904 |
-+ */ |
3905 |
-+ bic = bfq_bic_lookup(bfqd, tsk->io_context); |
3906 |
-+ if (bic == NULL) |
3907 |
-+ return ELV_MQUEUE_MAY; |
3908 |
-+ |
3909 |
-+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); |
3910 |
-+ if (bfqq != NULL) { |
3911 |
-+ bfq_init_prio_data(bfqq, bic); |
3912 |
-+ |
3913 |
-+ return __bfq_may_queue(bfqq); |
3914 |
-+ } |
3915 |
-+ |
3916 |
-+ return ELV_MQUEUE_MAY; |
3917 |
-+} |
3918 |
-+ |
3919 |
-+/* |
3920 |
-+ * Queue lock held here. |
3921 |
-+ */ |
3922 |
-+static void bfq_put_request(struct request *rq) |
3923 |
-+{ |
3924 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
3925 |
-+ |
3926 |
-+ if (bfqq != NULL) { |
3927 |
-+ const int rw = rq_data_dir(rq); |
3928 |
-+ |
3929 |
-+ BUG_ON(!bfqq->allocated[rw]); |
3930 |
-+ bfqq->allocated[rw]--; |
3931 |
-+ |
3932 |
-+ rq->elv.priv[0] = NULL; |
3933 |
-+ rq->elv.priv[1] = NULL; |
3934 |
-+ |
3935 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", |
3936 |
-+ bfqq, atomic_read(&bfqq->ref)); |
3937 |
-+ bfq_put_queue(bfqq); |
3938 |
-+ } |
3939 |
-+} |
3940 |
-+ |
3941 |
-+static struct bfq_queue * |
3942 |
-+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
3943 |
-+ struct bfq_queue *bfqq) |
3944 |
-+{ |
3945 |
-+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", |
3946 |
-+ (long unsigned)bfqq->new_bfqq->pid); |
3947 |
-+ bic_set_bfqq(bic, bfqq->new_bfqq, 1); |
3948 |
-+ bfq_mark_bfqq_coop(bfqq->new_bfqq); |
3949 |
-+ bfq_put_queue(bfqq); |
3950 |
-+ return bic_to_bfqq(bic, 1); |
3951 |
-+} |
3952 |
-+ |
3953 |
-+/* |
3954 |
-+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this |
3955 |
-+ * was the last process referring to said bfqq. |
3956 |
-+ */ |
3957 |
-+static struct bfq_queue * |
3958 |
-+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) |
3959 |
-+{ |
3960 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); |
3961 |
-+ if (bfqq_process_refs(bfqq) == 1) { |
3962 |
-+ bfqq->pid = current->pid; |
3963 |
-+ bfq_clear_bfqq_coop(bfqq); |
3964 |
-+ bfq_clear_bfqq_split_coop(bfqq); |
3965 |
-+ return bfqq; |
3966 |
-+ } |
3967 |
-+ |
3968 |
-+ bic_set_bfqq(bic, NULL, 1); |
3969 |
-+ |
3970 |
-+ bfq_put_cooperator(bfqq); |
3971 |
-+ |
3972 |
-+ bfq_put_queue(bfqq); |
3973 |
-+ return NULL; |
3974 |
-+} |
3975 |
-+ |
3976 |
-+/* |
3977 |
-+ * Allocate bfq data structures associated with this request. |
3978 |
-+ */ |
3979 |
-+static int bfq_set_request(struct request_queue *q, struct request *rq, |
3980 |
-+ struct bio *bio, gfp_t gfp_mask) |
3981 |
-+{ |
3982 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
3983 |
-+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); |
3984 |
-+ const int rw = rq_data_dir(rq); |
3985 |
-+ const int is_sync = rq_is_sync(rq); |
3986 |
-+ struct bfq_queue *bfqq; |
3987 |
-+ struct bfq_group *bfqg; |
3988 |
-+ unsigned long flags; |
3989 |
-+ |
3990 |
-+ might_sleep_if(gfp_mask & __GFP_WAIT); |
3991 |
-+ |
3992 |
-+ bfq_changed_ioprio(bic); |
3993 |
-+ |
3994 |
-+ spin_lock_irqsave(q->queue_lock, flags); |
3995 |
-+ |
3996 |
-+ if (bic == NULL) |
3997 |
-+ goto queue_fail; |
3998 |
-+ |
3999 |
-+ bfqg = bfq_bic_update_cgroup(bic); |
4000 |
-+ |
4001 |
-+new_queue: |
4002 |
-+ bfqq = bic_to_bfqq(bic, is_sync); |
4003 |
-+ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { |
4004 |
-+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); |
4005 |
-+ bic_set_bfqq(bic, bfqq, is_sync); |
4006 |
-+ } else { |
4007 |
-+ /* |
4008 |
-+ * If the queue was seeky for too long, break it apart. |
4009 |
-+ */ |
4010 |
-+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { |
4011 |
-+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); |
4012 |
-+ bfqq = bfq_split_bfqq(bic, bfqq); |
4013 |
-+ if (!bfqq) |
4014 |
-+ goto new_queue; |
4015 |
-+ } |
4016 |
-+ |
4017 |
-+ /* |
4018 |
-+ * Check to see if this queue is scheduled to merge with |
4019 |
-+ * another closely cooperating queue. The merging of queues |
4020 |
-+ * happens here as it must be done in process context. |
4021 |
-+ * The reference on new_bfqq was taken in merge_bfqqs. |
4022 |
-+ */ |
4023 |
-+ if (bfqq->new_bfqq != NULL) |
4024 |
-+ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq); |
4025 |
-+ } |
4026 |
-+ |
4027 |
-+ bfqq->allocated[rw]++; |
4028 |
-+ atomic_inc(&bfqq->ref); |
4029 |
-+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, |
4030 |
-+ atomic_read(&bfqq->ref)); |
4031 |
-+ |
4032 |
-+ rq->elv.priv[0] = bic; |
4033 |
-+ rq->elv.priv[1] = bfqq; |
4034 |
-+ |
4035 |
-+ spin_unlock_irqrestore(q->queue_lock, flags); |
4036 |
-+ |
4037 |
-+ return 0; |
4038 |
-+ |
4039 |
-+queue_fail: |
4040 |
-+ bfq_schedule_dispatch(bfqd); |
4041 |
-+ spin_unlock_irqrestore(q->queue_lock, flags); |
4042 |
-+ |
4043 |
-+ return 1; |
4044 |
-+} |
4045 |
-+ |
4046 |
-+static void bfq_kick_queue(struct work_struct *work) |
4047 |
-+{ |
4048 |
-+ struct bfq_data *bfqd = |
4049 |
-+ container_of(work, struct bfq_data, unplug_work); |
4050 |
-+ struct request_queue *q = bfqd->queue; |
4051 |
-+ |
4052 |
-+ spin_lock_irq(q->queue_lock); |
4053 |
-+ __blk_run_queue(q); |
4054 |
-+ spin_unlock_irq(q->queue_lock); |
4055 |
-+} |
4056 |
-+ |
4057 |
-+/* |
4058 |
-+ * Handler of the expiration of the timer running if the in-service queue |
4059 |
-+ * is idling inside its time slice. |
4060 |
-+ */ |
4061 |
-+static void bfq_idle_slice_timer(unsigned long data) |
4062 |
-+{ |
4063 |
-+ struct bfq_data *bfqd = (struct bfq_data *)data; |
4064 |
-+ struct bfq_queue *bfqq; |
4065 |
-+ unsigned long flags; |
4066 |
-+ enum bfqq_expiration reason; |
4067 |
-+ |
4068 |
-+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); |
4069 |
-+ |
4070 |
-+ bfqq = bfqd->in_service_queue; |
4071 |
-+ /* |
4072 |
-+ * Theoretical race here: the in-service queue can be NULL or different |
4073 |
-+ * from the queue that was idling if the timer handler spins on |
4074 |
-+ * the queue_lock and a new request arrives for the current |
4075 |
-+ * queue and there is a full dispatch cycle that changes the |
4076 |
-+ * in-service queue. This can hardly happen, but in the worst case |
4077 |
-+ * we just expire a queue too early. |
4078 |
-+ */ |
4079 |
-+ if (bfqq != NULL) { |
4080 |
-+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); |
4081 |
-+ if (bfq_bfqq_budget_timeout(bfqq)) |
4082 |
-+ /* |
4083 |
-+ * Also here the queue can be safely expired |
4084 |
-+ * for budget timeout without wasting |
4085 |
-+ * guarantees |
4086 |
-+ */ |
4087 |
-+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
4088 |
-+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) |
4089 |
-+ /* |
4090 |
-+ * The queue may not be empty upon timer expiration, |
4091 |
-+ * because we may not disable the timer when the first |
4092 |
-+ * request of the in-service queue arrives during |
4093 |
-+ * disk idling |
4094 |
-+ */ |
4095 |
-+ reason = BFQ_BFQQ_TOO_IDLE; |
4096 |
-+ else |
4097 |
-+ goto schedule_dispatch; |
4098 |
-+ |
4099 |
-+ bfq_bfqq_expire(bfqd, bfqq, 1, reason); |
4100 |
-+ } |
4101 |
-+ |
4102 |
-+schedule_dispatch: |
4103 |
-+ bfq_schedule_dispatch(bfqd); |
4104 |
-+ |
4105 |
-+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); |
4106 |
-+} |
4107 |
-+ |
4108 |
-+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) |
4109 |
-+{ |
4110 |
-+ del_timer_sync(&bfqd->idle_slice_timer); |
4111 |
-+ cancel_work_sync(&bfqd->unplug_work); |
4112 |
-+} |
4113 |
-+ |
4114 |
-+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, |
4115 |
-+ struct bfq_queue **bfqq_ptr) |
4116 |
-+{ |
4117 |
-+ struct bfq_group *root_group = bfqd->root_group; |
4118 |
-+ struct bfq_queue *bfqq = *bfqq_ptr; |
4119 |
-+ |
4120 |
-+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); |
4121 |
-+ if (bfqq != NULL) { |
4122 |
-+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); |
4123 |
-+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", |
4124 |
-+ bfqq, atomic_read(&bfqq->ref)); |
4125 |
-+ bfq_put_queue(bfqq); |
4126 |
-+ *bfqq_ptr = NULL; |
4127 |
-+ } |
4128 |
-+} |
4129 |
-+ |
4130 |
-+/* |
4131 |
-+ * Release all the bfqg references to its async queues. If we are |
4132 |
-+ * deallocating the group these queues may still contain requests, so |
4133 |
-+ * we reparent them to the root cgroup (i.e., the only one that will |
4134 |
-+ * exist for sure until all the requests on a device are gone). |
4135 |
-+ */ |
4136 |
-+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) |
4137 |
-+{ |
4138 |
-+ int i, j; |
4139 |
-+ |
4140 |
-+ for (i = 0; i < 2; i++) |
4141 |
-+ for (j = 0; j < IOPRIO_BE_NR; j++) |
4142 |
-+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); |
4143 |
-+ |
4144 |
-+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); |
4145 |
-+} |
4146 |
-+ |
4147 |
-+static void bfq_exit_queue(struct elevator_queue *e) |
4148 |
-+{ |
4149 |
-+ struct bfq_data *bfqd = e->elevator_data; |
4150 |
-+ struct request_queue *q = bfqd->queue; |
4151 |
-+ struct bfq_queue *bfqq, *n; |
4152 |
-+ |
4153 |
-+ bfq_shutdown_timer_wq(bfqd); |
4154 |
-+ |
4155 |
-+ spin_lock_irq(q->queue_lock); |
4156 |
-+ |
4157 |
-+ BUG_ON(bfqd->in_service_queue != NULL); |
4158 |
-+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) |
4159 |
-+ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
4160 |
-+ |
4161 |
-+ bfq_disconnect_groups(bfqd); |
4162 |
-+ spin_unlock_irq(q->queue_lock); |
4163 |
-+ |
4164 |
-+ bfq_shutdown_timer_wq(bfqd); |
4165 |
-+ |
4166 |
-+ synchronize_rcu(); |
4167 |
-+ |
4168 |
-+ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
4169 |
-+ |
4170 |
-+ bfq_free_root_group(bfqd); |
4171 |
-+ kfree(bfqd); |
4172 |
-+} |
4173 |
-+ |
4174 |
-+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
4175 |
-+{ |
4176 |
-+ struct bfq_group *bfqg; |
4177 |
-+ struct bfq_data *bfqd; |
4178 |
-+ struct elevator_queue *eq; |
4179 |
-+ |
4180 |
-+ eq = elevator_alloc(q, e); |
4181 |
-+ if (eq == NULL) |
4182 |
-+ return -ENOMEM; |
4183 |
-+ |
4184 |
-+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); |
4185 |
-+ if (bfqd == NULL) { |
4186 |
-+ kobject_put(&eq->kobj); |
4187 |
-+ return -ENOMEM; |
4188 |
-+ } |
4189 |
-+ eq->elevator_data = bfqd; |
4190 |
-+ |
4191 |
-+ /* |
4192 |
-+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. |
4193 |
-+ * Grab a permanent reference to it, so that the normal code flow |
4194 |
-+ * will not attempt to free it. |
4195 |
-+ */ |
4196 |
-+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); |
4197 |
-+ atomic_inc(&bfqd->oom_bfqq.ref); |
4198 |
-+ |
4199 |
-+ bfqd->queue = q; |
4200 |
-+ |
4201 |
-+ spin_lock_irq(q->queue_lock); |
4202 |
-+ q->elevator = eq; |
4203 |
-+ spin_unlock_irq(q->queue_lock); |
4204 |
-+ |
4205 |
-+ bfqg = bfq_alloc_root_group(bfqd, q->node); |
4206 |
-+ if (bfqg == NULL) { |
4207 |
-+ kfree(bfqd); |
4208 |
-+ kobject_put(&eq->kobj); |
4209 |
-+ return -ENOMEM; |
4210 |
-+ } |
4211 |
-+ |
4212 |
-+ bfqd->root_group = bfqg; |
4213 |
-+ |
4214 |
-+ init_timer(&bfqd->idle_slice_timer); |
4215 |
-+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; |
4216 |
-+ bfqd->idle_slice_timer.data = (unsigned long)bfqd; |
4217 |
-+ |
4218 |
-+ bfqd->rq_pos_tree = RB_ROOT; |
4219 |
-+ |
4220 |
-+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); |
4221 |
-+ |
4222 |
-+ INIT_LIST_HEAD(&bfqd->active_list); |
4223 |
-+ INIT_LIST_HEAD(&bfqd->idle_list); |
4224 |
-+ |
4225 |
-+ bfqd->hw_tag = -1; |
4226 |
-+ |
4227 |
-+ bfqd->bfq_max_budget = bfq_default_max_budget; |
4228 |
-+ |
4229 |
-+ bfqd->bfq_quantum = bfq_quantum; |
4230 |
-+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; |
4231 |
-+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; |
4232 |
-+ bfqd->bfq_back_max = bfq_back_max; |
4233 |
-+ bfqd->bfq_back_penalty = bfq_back_penalty; |
4234 |
-+ bfqd->bfq_slice_idle = bfq_slice_idle; |
4235 |
-+ bfqd->bfq_class_idle_last_service = 0; |
4236 |
-+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; |
4237 |
-+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; |
4238 |
-+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; |
4239 |
-+ |
4240 |
-+ bfqd->low_latency = true; |
4241 |
-+ |
4242 |
-+ bfqd->bfq_raising_coeff = 20; |
4243 |
-+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); |
4244 |
-+ bfqd->bfq_raising_max_time = 0; |
4245 |
-+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); |
4246 |
-+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); |
4247 |
-+ bfqd->bfq_raising_max_softrt_rate = 7000; /* |
4248 |
-+ * Approximate rate required |
4249 |
-+ * to playback or record a |
4250 |
-+ * high-definition compressed |
4251 |
-+ * video. |
4252 |
-+ */ |
4253 |
-+ bfqd->raised_busy_queues = 0; |
4254 |
-+ |
4255 |
-+ /* Initially estimate the device's peak rate as the reference rate */ |
4256 |
-+ if (blk_queue_nonrot(bfqd->queue)) { |
4257 |
-+ bfqd->RT_prod = R_nonrot * T_nonrot; |
4258 |
-+ bfqd->peak_rate = R_nonrot; |
4259 |
-+ } else { |
4260 |
-+ bfqd->RT_prod = R_rot * T_rot; |
4261 |
-+ bfqd->peak_rate = R_rot; |
4262 |
-+ } |
4263 |
-+ |
4264 |
-+ return 0; |
4265 |
-+} |
4266 |
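For a sense of scale for the bfq_raising_max_softrt_rate default of 7000 set a few lines above: assuming 512-byte sectors (an assumption, not stated in the patch), that rate corresponds to roughly 3.5 MB/s, which is indeed on the order of a compressed high-definition video stream.

#include <stdio.h>

int main(void)
{
	unsigned long sectors_per_sec = 7000;
	unsigned long sector_bytes = 512;          /* assumed sector size */
	unsigned long bytes_per_sec = sectors_per_sec * sector_bytes;

	printf("%lu sectors/s = %.1f MB/s = %.1f Mbit/s\n",
	       sectors_per_sec,
	       bytes_per_sec / 1e6,
	       bytes_per_sec * 8 / 1e6);
	return 0;
}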
-+ |
4267 |
-+static void bfq_slab_kill(void) |
4268 |
-+{ |
4269 |
-+ if (bfq_pool != NULL) |
4270 |
-+ kmem_cache_destroy(bfq_pool); |
4271 |
-+} |
4272 |
-+ |
4273 |
-+static int __init bfq_slab_setup(void) |
4274 |
-+{ |
4275 |
-+ bfq_pool = KMEM_CACHE(bfq_queue, 0); |
4276 |
-+ if (bfq_pool == NULL) |
4277 |
-+ return -ENOMEM; |
4278 |
-+ return 0; |
4279 |
-+} |
4280 |
-+ |
4281 |
-+static ssize_t bfq_var_show(unsigned int var, char *page) |
4282 |
-+{ |
4283 |
-+ return sprintf(page, "%d\n", var); |
4284 |
-+} |
4285 |
-+ |
4286 |
-+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) |
4287 |
-+{ |
4288 |
-+ unsigned long new_val; |
4289 |
-+ int ret = kstrtoul(page, 10, &new_val); |
4290 |
-+ |
4291 |
-+ if (ret == 0) |
4292 |
-+ *var = new_val; |
4293 |
-+ |
4294 |
-+ return count; |
4295 |
-+} |
4296 |
-+ |
4297 |
-+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page) |
4298 |
-+{ |
4299 |
-+ struct bfq_data *bfqd = e->elevator_data; |
4300 |
-+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? |
4301 |
-+ jiffies_to_msecs(bfqd->bfq_raising_max_time) : |
4302 |
-+ jiffies_to_msecs(bfq_wrais_duration(bfqd))); |
4303 |
-+} |
4304 |
-+ |
4305 |
-+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) |
4306 |
-+{ |
4307 |
-+ struct bfq_queue *bfqq; |
4308 |
-+ struct bfq_data *bfqd = e->elevator_data; |
4309 |
-+ ssize_t num_char = 0; |
4310 |
-+ |
4311 |
-+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", |
4312 |
-+ bfqd->queued); |
4313 |
-+ |
4314 |
-+ spin_lock_irq(bfqd->queue->queue_lock); |
4315 |
-+ |
4316 |
-+ num_char += sprintf(page + num_char, "Active:\n"); |
4317 |
-+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { |
4318 |
-+ num_char += sprintf(page + num_char, |
4319 |
-+ "pid%d: weight %hu, nr_queued %d %d," |
4320 |
-+ " dur %d/%u\n", |
4321 |
-+ bfqq->pid, |
4322 |
-+ bfqq->entity.weight, |
4323 |
-+ bfqq->queued[0], |
4324 |
-+ bfqq->queued[1], |
4325 |
-+ jiffies_to_msecs(jiffies - |
4326 |
-+ bfqq->last_rais_start_finish), |
4327 |
-+ jiffies_to_msecs(bfqq->raising_cur_max_time)); |
4328 |
-+ } |
4329 |
-+ |
4330 |
-+ num_char += sprintf(page + num_char, "Idle:\n"); |
4331 |
-+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { |
4332 |
-+ num_char += sprintf(page + num_char, |
4333 |
-+ "pid%d: weight %hu, dur %d/%u\n", |
4334 |
-+ bfqq->pid, |
4335 |
-+ bfqq->entity.weight, |
4336 |
-+ jiffies_to_msecs(jiffies - |
4337 |
-+ bfqq->last_rais_start_finish), |
4338 |
-+ jiffies_to_msecs(bfqq->raising_cur_max_time)); |
4339 |
-+ } |
4340 |
-+ |
4341 |
-+ spin_unlock_irq(bfqd->queue->queue_lock); |
4342 |
-+ |
4343 |
-+ return num_char; |
4344 |
-+} |
4345 |
-+ |
4346 |
-+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ |
4347 |
-+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ |
4348 |
-+{ \ |
4349 |
-+ struct bfq_data *bfqd = e->elevator_data; \ |
4350 |
-+ unsigned int __data = __VAR; \ |
4351 |
-+ if (__CONV) \ |
4352 |
-+ __data = jiffies_to_msecs(__data); \ |
4353 |
-+ return bfq_var_show(__data, (page)); \ |
4354 |
-+} |
4355 |
-+SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); |
4356 |
-+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); |
4357 |
-+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); |
4358 |
-+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); |
4359 |
-+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); |
4360 |
-+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); |
4361 |
-+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); |
4362 |
-+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); |
4363 |
-+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); |
4364 |
-+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); |
4365 |
-+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); |
4366 |
-+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); |
4367 |
-+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); |
4368 |
-+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, |
4369 |
-+ 1); |
4370 |
-+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, |
4371 |
-+ bfqd->bfq_raising_min_inter_arr_async, |
4372 |
-+ 1); |
4373 |
-+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, |
4374 |
-+ bfqd->bfq_raising_max_softrt_rate, 0); |
4375 |
-+#undef SHOW_FUNCTION |
4376 |
-+ |
4377 |
-+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
4378 |
-+static ssize_t \ |
4379 |
-+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ |
4380 |
-+{ \ |
4381 |
-+ struct bfq_data *bfqd = e->elevator_data; \ |
4382 |
-+ unsigned long uninitialized_var(__data); \ |
4383 |
-+ int ret = bfq_var_store(&__data, (page), count); \ |
4384 |
-+ if (__data < (MIN)) \ |
4385 |
-+ __data = (MIN); \ |
4386 |
-+ else if (__data > (MAX)) \ |
4387 |
-+ __data = (MAX); \ |
4388 |
-+ if (__CONV) \ |
4389 |
-+ *(__PTR) = msecs_to_jiffies(__data); \ |
4390 |
-+ else \ |
4391 |
-+ *(__PTR) = __data; \ |
4392 |
-+ return ret; \ |
4393 |
-+} |
4394 |
-+STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); |
4395 |
-+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, |
4396 |
-+ INT_MAX, 1); |
4397 |
-+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, |
4398 |
-+ INT_MAX, 1); |
4399 |
-+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); |
4400 |
-+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, |
4401 |
-+ INT_MAX, 0); |
4402 |
-+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); |
4403 |
-+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, |
4404 |
-+ 1, INT_MAX, 0); |
4405 |
-+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, |
4406 |
-+ INT_MAX, 1); |
4407 |
-+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, |
4408 |
-+ INT_MAX, 0); |
4409 |
-+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, |
4410 |
-+ INT_MAX, 1); |
4411 |
-+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, |
4412 |
-+ INT_MAX, 1); |
4413 |
-+STORE_FUNCTION(bfq_raising_min_idle_time_store, |
4414 |
-+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); |
4415 |
-+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, |
4416 |
-+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); |
4417 |
-+STORE_FUNCTION(bfq_raising_max_softrt_rate_store, |
4418 |
-+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); |
4419 |
-+#undef STORE_FUNCTION |
4420 |
-+ |
4421 |
-+/* do nothing for the moment */ |
4422 |
-+static ssize_t bfq_weights_store(struct elevator_queue *e, |
4423 |
-+ const char *page, size_t count) |
4424 |
-+{ |
4425 |
-+ return count; |
4426 |
-+} |
4427 |
-+ |
4428 |
-+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) |
4429 |
-+{ |
4430 |
-+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
4431 |
-+ |
4432 |
-+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) |
4433 |
-+ return bfq_calc_max_budget(bfqd->peak_rate, timeout); |
4434 |
-+ else |
4435 |
-+ return bfq_default_max_budget; |
4436 |
-+} |
4437 |
-+ |
4438 |
-+static ssize_t bfq_max_budget_store(struct elevator_queue *e, |
4439 |
-+ const char *page, size_t count) |
4440 |
-+{ |
4441 |
-+ struct bfq_data *bfqd = e->elevator_data; |
4442 |
-+ unsigned long uninitialized_var(__data); |
4443 |
-+ int ret = bfq_var_store(&__data, (page), count); |
4444 |
-+ |
4445 |
-+ if (__data == 0) |
4446 |
-+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
4447 |
-+ else { |
4448 |
-+ if (__data > INT_MAX) |
4449 |
-+ __data = INT_MAX; |
4450 |
-+ bfqd->bfq_max_budget = __data; |
4451 |
-+ } |
4452 |
-+ |
4453 |
-+ bfqd->bfq_user_max_budget = __data; |
4454 |
-+ |
4455 |
-+ return ret; |
4456 |
-+} |
4457 |
-+ |
4458 |
-+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, |
4459 |
-+ const char *page, size_t count) |
4460 |
-+{ |
4461 |
-+ struct bfq_data *bfqd = e->elevator_data; |
4462 |
-+ unsigned long uninitialized_var(__data); |
4463 |
-+ int ret = bfq_var_store(&__data, (page), count); |
4464 |
-+ |
4465 |
-+ if (__data < 1) |
4466 |
-+ __data = 1; |
4467 |
-+ else if (__data > INT_MAX) |
4468 |
-+ __data = INT_MAX; |
4469 |
-+ |
4470 |
-+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); |
4471 |
-+ if (bfqd->bfq_user_max_budget == 0) |
4472 |
-+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
4473 |
-+ |
4474 |
-+ return ret; |
4475 |
-+} |
4476 |
-+ |
4477 |
-+static ssize_t bfq_low_latency_store(struct elevator_queue *e, |
4478 |
-+ const char *page, size_t count) |
4479 |
-+{ |
4480 |
-+ struct bfq_data *bfqd = e->elevator_data; |
4481 |
-+ unsigned long uninitialized_var(__data); |
4482 |
-+ int ret = bfq_var_store(&__data, (page), count); |
4483 |
-+ |
4484 |
-+ if (__data > 1) |
4485 |
-+ __data = 1; |
4486 |
-+ if (__data == 0 && bfqd->low_latency != 0) |
4487 |
-+ bfq_end_raising(bfqd); |
4488 |
-+ bfqd->low_latency = __data; |
4489 |
-+ |
4490 |
-+ return ret; |
4491 |
-+} |
4492 |
-+ |
4493 |
-+#define BFQ_ATTR(name) \ |
4494 |
-+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) |
4495 |
-+ |
4496 |
-+static struct elv_fs_entry bfq_attrs[] = { |
4497 |
-+ BFQ_ATTR(quantum), |
4498 |
-+ BFQ_ATTR(fifo_expire_sync), |
4499 |
-+ BFQ_ATTR(fifo_expire_async), |
4500 |
-+ BFQ_ATTR(back_seek_max), |
4501 |
-+ BFQ_ATTR(back_seek_penalty), |
4502 |
-+ BFQ_ATTR(slice_idle), |
4503 |
-+ BFQ_ATTR(max_budget), |
4504 |
-+ BFQ_ATTR(max_budget_async_rq), |
4505 |
-+ BFQ_ATTR(timeout_sync), |
4506 |
-+ BFQ_ATTR(timeout_async), |
4507 |
-+ BFQ_ATTR(low_latency), |
4508 |
-+ BFQ_ATTR(raising_coeff), |
4509 |
-+ BFQ_ATTR(raising_max_time), |
4510 |
-+ BFQ_ATTR(raising_rt_max_time), |
4511 |
-+ BFQ_ATTR(raising_min_idle_time), |
4512 |
-+ BFQ_ATTR(raising_min_inter_arr_async), |
4513 |
-+ BFQ_ATTR(raising_max_softrt_rate), |
4514 |
-+ BFQ_ATTR(weights), |
4515 |
-+ __ATTR_NULL |
4516 |
-+}; |
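(Illustrative aside, not part of the patch: the bfq_attrs table above is what exposes the scheduler's tunables to user space; each BFQ_ATTR() entry becomes a file served by the SHOW_FUNCTION/STORE_FUNCTION pairs earlier in this file. The sketch below shows how such a tunable might be read and rewritten; it assumes the conventional /sys/block/<device>/queue/iosched/ location used by legacy elevators and an example device name "sda".)

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* "sda" is only an example device name; the path layout is the
	 * conventional one for legacy (request_fn) elevators. */
	const char *path = "/sys/block/sda/queue/iosched/low_latency";
	char buf[32];
	FILE *f;

	f = fopen(path, "r");			/* read the current value */
	if (f == NULL) {
		perror(path);
		return EXIT_FAILURE;
	}
	if (fgets(buf, sizeof(buf), f) != NULL)
		printf("low_latency is currently %s", buf);
	fclose(f);

	f = fopen(path, "w");			/* write a new value (root only) */
	if (f == NULL) {
		perror(path);
		return EXIT_FAILURE;
	}
	fputs("1\n", f);			/* handled by bfq_low_latency_store() */
	fclose(f);
	return EXIT_SUCCESS;
}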
4517 |
-+ |
4518 |
-+static struct elevator_type iosched_bfq = { |
4519 |
-+ .ops = { |
4520 |
-+ .elevator_merge_fn = bfq_merge, |
4521 |
-+ .elevator_merged_fn = bfq_merged_request, |
4522 |
-+ .elevator_merge_req_fn = bfq_merged_requests, |
4523 |
-+ .elevator_allow_merge_fn = bfq_allow_merge, |
4524 |
-+ .elevator_dispatch_fn = bfq_dispatch_requests, |
4525 |
-+ .elevator_add_req_fn = bfq_insert_request, |
4526 |
-+ .elevator_activate_req_fn = bfq_activate_request, |
4527 |
-+ .elevator_deactivate_req_fn = bfq_deactivate_request, |
4528 |
-+ .elevator_completed_req_fn = bfq_completed_request, |
4529 |
-+ .elevator_former_req_fn = elv_rb_former_request, |
4530 |
-+ .elevator_latter_req_fn = elv_rb_latter_request, |
4531 |
-+ .elevator_init_icq_fn = bfq_init_icq, |
4532 |
-+ .elevator_exit_icq_fn = bfq_exit_icq, |
4533 |
-+ .elevator_set_req_fn = bfq_set_request, |
4534 |
-+ .elevator_put_req_fn = bfq_put_request, |
4535 |
-+ .elevator_may_queue_fn = bfq_may_queue, |
4536 |
-+ .elevator_init_fn = bfq_init_queue, |
4537 |
-+ .elevator_exit_fn = bfq_exit_queue, |
4538 |
-+ }, |
4539 |
-+ .icq_size = sizeof(struct bfq_io_cq), |
4540 |
-+ .icq_align = __alignof__(struct bfq_io_cq), |
4541 |
-+ .elevator_attrs = bfq_attrs, |
4542 |
-+ .elevator_name = "bfq", |
4543 |
-+ .elevator_owner = THIS_MODULE, |
4544 |
-+}; |
4545 |
-+ |
4546 |
-+static int __init bfq_init(void) |
4547 |
-+{ |
4548 |
-+ /* |
4549 |
-+ * Can be 0 on HZ < 1000 setups. |
4550 |
-+ */ |
4551 |
-+ if (bfq_slice_idle == 0) |
4552 |
-+ bfq_slice_idle = 1; |
4553 |
-+ |
4554 |
-+ if (bfq_timeout_async == 0) |
4555 |
-+ bfq_timeout_async = 1; |
4556 |
-+ |
4557 |
-+ if (bfq_slab_setup()) |
4558 |
-+ return -ENOMEM; |
4559 |
-+ |
4560 |
-+ elv_register(&iosched_bfq); |
4561 |
-+ printk(KERN_INFO "BFQ I/O-scheduler version: v7"); |
4562 |
-+ |
4563 |
-+ return 0; |
4564 |
-+} |
4565 |
-+ |
4566 |
-+static void __exit bfq_exit(void) |
4567 |
-+{ |
4568 |
-+ elv_unregister(&iosched_bfq); |
4569 |
-+ bfq_slab_kill(); |
4570 |
-+} |
4571 |
-+ |
4572 |
-+module_init(bfq_init); |
4573 |
-+module_exit(bfq_exit); |
4574 |
-+ |
4575 |
-+MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); |
4576 |
-+MODULE_LICENSE("GPL"); |
4577 |
-+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); |
4578 |
-diff --git a/block/bfq-sched.c b/block/bfq-sched.c |
4579 |
-new file mode 100644 |
4580 |
-index 0000000..30df81c |
4581 |
---- /dev/null |
4582 |
-+++ b/block/bfq-sched.c |
4583 |
-@@ -0,0 +1,1077 @@ |
4584 |
-+/* |
4585 |
-+ * BFQ: Hierarchical B-WF2Q+ scheduler. |
4586 |
-+ * |
4587 |
-+ * Based on ideas and code from CFQ: |
4588 |
-+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
4589 |
-+ * |
4590 |
-+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
4591 |
-+ * Paolo Valente <paolo.valente@×××××××.it> |
4592 |
-+ * |
4593 |
-+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
4594 |
-+ */ |
4595 |
-+ |
4596 |
-+#ifdef CONFIG_CGROUP_BFQIO |
4597 |
-+#define for_each_entity(entity) \ |
4598 |
-+ for (; entity != NULL; entity = entity->parent) |
4599 |
-+ |
4600 |
-+#define for_each_entity_safe(entity, parent) \ |
4601 |
-+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) |
4602 |
-+ |
4603 |
-+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
4604 |
-+ int extract, |
4605 |
-+ struct bfq_data *bfqd); |
4606 |
-+ |
4607 |
-+static inline void bfq_update_budget(struct bfq_entity *next_active) |
4608 |
-+{ |
4609 |
-+ struct bfq_entity *bfqg_entity; |
4610 |
-+ struct bfq_group *bfqg; |
4611 |
-+ struct bfq_sched_data *group_sd; |
4612 |
-+ |
4613 |
-+ BUG_ON(next_active == NULL); |
4614 |
-+ |
4615 |
-+ group_sd = next_active->sched_data; |
4616 |
-+ |
4617 |
-+ bfqg = container_of(group_sd, struct bfq_group, sched_data); |
4618 |
-+ /* |
4619 |
-+ * bfq_group's my_entity field is not NULL only if the group |
4620 |
-+ * is not the root group. We must not touch the root entity |
4621 |
-+ * as it must never become an active entity. |
4622 |
-+ */ |
4623 |
-+ bfqg_entity = bfqg->my_entity; |
4624 |
-+ if (bfqg_entity != NULL) |
4625 |
-+ bfqg_entity->budget = next_active->budget; |
4626 |
-+} |
4627 |
-+ |
4628 |
-+static int bfq_update_next_active(struct bfq_sched_data *sd) |
4629 |
-+{ |
4630 |
-+ struct bfq_entity *next_active; |
4631 |
-+ |
4632 |
-+ if (sd->active_entity != NULL) |
4633 |
-+ /* will update/requeue at the end of service */ |
4634 |
-+ return 0; |
4635 |
-+ |
4636 |
-+ /* |
4637 |
-+ * NOTE: this can be improved in many ways, such as returning |
4638 |
-+ * 1 (and thus propagating upwards the update) only when the |
4639 |
-+ * budget changes, or caching the bfqq that will be scheduled |
4640 |
-+ * next from this subtree. For now we worry more about |
4641 |
-+ * correctness than about performance... |
4642 |
-+ */ |
4643 |
-+ next_active = bfq_lookup_next_entity(sd, 0, NULL); |
4644 |
-+ sd->next_active = next_active; |
4645 |
-+ |
4646 |
-+ if (next_active != NULL) |
4647 |
-+ bfq_update_budget(next_active); |
4648 |
-+ |
4649 |
-+ return 1; |
4650 |
-+} |
4651 |
-+ |
4652 |
-+static inline void bfq_check_next_active(struct bfq_sched_data *sd, |
4653 |
-+ struct bfq_entity *entity) |
4654 |
-+{ |
4655 |
-+ BUG_ON(sd->next_active != entity); |
4656 |
-+} |
4657 |
-+#else |
4658 |
-+#define for_each_entity(entity) \ |
4659 |
-+ for (; entity != NULL; entity = NULL) |
4660 |
-+ |
4661 |
-+#define for_each_entity_safe(entity, parent) \ |
4662 |
-+ for (parent = NULL; entity != NULL; entity = parent) |
4663 |
-+ |
4664 |
-+static inline int bfq_update_next_active(struct bfq_sched_data *sd) |
4665 |
-+{ |
4666 |
-+ return 0; |
4667 |
-+} |
4668 |
-+ |
4669 |
-+static inline void bfq_check_next_active(struct bfq_sched_data *sd, |
4670 |
-+ struct bfq_entity *entity) |
4671 |
-+{ |
4672 |
-+} |
4673 |
-+ |
4674 |
-+static inline void bfq_update_budget(struct bfq_entity *next_active) |
4675 |
-+{ |
4676 |
-+} |
4677 |
-+#endif |
4678 |
-+ |
4679 |
-+/* |
4680 |
-+ * Shift for timestamp calculations. This actually limits the maximum |
4681 |
-+ * service allowed in one timestamp delta (small shift values increase it), |
4682 |
-+ * the maximum total weight that can be used for the queues in the system |
4683 |
-+ * (big shift values increase it), and the period of virtual time wraparounds. |
4684 |
-+ */ |
4685 |
-+#define WFQ_SERVICE_SHIFT 22 |
4686 |
-+ |
4687 |
-+/** |
4688 |
-+ * bfq_gt - compare two timestamps. |
4689 |
-+ * @a: first ts. |
4690 |
-+ * @b: second ts. |
4691 |
-+ * |
4692 |
-+ * Return @a > @b, dealing with wrapping correctly. |
4693 |
-+ */ |
4694 |
-+static inline int bfq_gt(u64 a, u64 b) |
4695 |
-+{ |
4696 |
-+ return (s64)(a - b) > 0; |
4697 |
-+} |
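(Illustrative aside, not part of the patch: bfq_gt() compares two timestamps via the difference cast to a signed type, so the ordering stays correct even after the 64-bit virtual clock wraps, as long as the two values are within 2^63 of each other. A minimal user-space sketch of the same trick, using <stdint.h> types instead of the kernel's u64/s64:)

#include <stdint.h>
#include <stdio.h>

/* Same idea as bfq_gt(): compare on the signed difference, which stays
 * correct across a wrap as long as the two stamps are within 2^63. */
static int ts_after(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

int main(void)
{
	uint64_t old_ts = UINT64_MAX - 2;	/* just before the clock wraps */
	uint64_t new_ts = 5;			/* shortly after the wrap */

	printf("naive  new > old: %d\n", new_ts > old_ts);		/* 0: wrong */
	printf("signed new > old: %d\n", ts_after(new_ts, old_ts));	/* 1: right */
	return 0;
}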
4698 |
-+ |
4699 |
-+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) |
4700 |
-+{ |
4701 |
-+ struct bfq_queue *bfqq = NULL; |
4702 |
-+ |
4703 |
-+ BUG_ON(entity == NULL); |
4704 |
-+ |
4705 |
-+ if (entity->my_sched_data == NULL) |
4706 |
-+ bfqq = container_of(entity, struct bfq_queue, entity); |
4707 |
-+ |
4708 |
-+ return bfqq; |
4709 |
-+} |
4710 |
-+ |
4711 |
-+ |
4712 |
-+/** |
4713 |
-+ * bfq_delta - map service into the virtual time domain. |
4714 |
-+ * @service: amount of service. |
4715 |
-+ * @weight: scale factor (weight of an entity or weight sum). |
4716 |
-+ */ |
4717 |
-+static inline u64 bfq_delta(unsigned long service, |
4718 |
-+ unsigned long weight) |
4719 |
-+{ |
4720 |
-+ u64 d = (u64)service << WFQ_SERVICE_SHIFT; |
4721 |
-+ |
4722 |
-+ do_div(d, weight); |
4723 |
-+ return d; |
4724 |
-+} |
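(Illustrative aside, not part of the patch: bfq_delta() is the fixed-point core of the B-WF2Q+ timestamps; the service is scaled up by 2^WFQ_SERVICE_SHIFT and divided by the weight, so the same amount of service advances a heavier entity's clock proportionally less. A small sketch with made-up numbers and plain division in place of do_div():)

#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22	/* same shift as in the patch */

/* What bfq_delta() computes, with plain division instead of do_div(). */
static uint64_t vtime_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	unsigned long service = 1024;	/* hypothetical amount of service */

	/* The same service moves a weight-8 entity's timestamps forward
	 * 8 times less than a weight-1 entity's: that ratio is its share. */
	printf("weight 1: %llu\n", (unsigned long long)vtime_delta(service, 1));
	printf("weight 8: %llu\n", (unsigned long long)vtime_delta(service, 8));
	return 0;
}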
4725 |
-+ |
4726 |
-+/** |
4727 |
-+ * bfq_calc_finish - assign the finish time to an entity. |
4728 |
-+ * @entity: the entity to act upon. |
4729 |
-+ * @service: the service to be charged to the entity. |
4730 |
-+ */ |
4731 |
-+static inline void bfq_calc_finish(struct bfq_entity *entity, |
4732 |
-+ unsigned long service) |
4733 |
-+{ |
4734 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
4735 |
-+ |
4736 |
-+ BUG_ON(entity->weight == 0); |
4737 |
-+ |
4738 |
-+ entity->finish = entity->start + |
4739 |
-+ bfq_delta(service, entity->weight); |
4740 |
-+ |
4741 |
-+ if (bfqq != NULL) { |
4742 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, |
4743 |
-+ "calc_finish: serv %lu, w %d", |
4744 |
-+ service, entity->weight); |
4745 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, |
4746 |
-+ "calc_finish: start %llu, finish %llu, delta %llu", |
4747 |
-+ entity->start, entity->finish, |
4748 |
-+ bfq_delta(service, entity->weight)); |
4749 |
-+ } |
4750 |
-+} |
4751 |
-+ |
4752 |
-+/** |
4753 |
-+ * bfq_entity_of - get an entity from a node. |
4754 |
-+ * @node: the node field of the entity. |
4755 |
-+ * |
4756 |
-+ * Convert a node pointer to the relative entity. This is used only |
4757 |
-+ * to simplify the logic of some functions and not as the generic |
4758 |
-+ * conversion mechanism because, e.g., in the tree walking functions, |
4759 |
-+ * the check for a %NULL value would be redundant. |
4760 |
-+ */ |
4761 |
-+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) |
4762 |
-+{ |
4763 |
-+ struct bfq_entity *entity = NULL; |
4764 |
-+ |
4765 |
-+ if (node != NULL) |
4766 |
-+ entity = rb_entry(node, struct bfq_entity, rb_node); |
4767 |
-+ |
4768 |
-+ return entity; |
4769 |
-+} |
4770 |
-+ |
4771 |
-+/** |
4772 |
-+ * bfq_extract - remove an entity from a tree. |
4773 |
-+ * @root: the tree root. |
4774 |
-+ * @entity: the entity to remove. |
4775 |
-+ */ |
4776 |
-+static inline void bfq_extract(struct rb_root *root, |
4777 |
-+ struct bfq_entity *entity) |
4778 |
-+{ |
4779 |
-+ BUG_ON(entity->tree != root); |
4780 |
-+ |
4781 |
-+ entity->tree = NULL; |
4782 |
-+ rb_erase(&entity->rb_node, root); |
4783 |
-+} |
4784 |
-+ |
4785 |
-+/** |
4786 |
-+ * bfq_idle_extract - extract an entity from the idle tree. |
4787 |
-+ * @st: the service tree of the owning @entity. |
4788 |
-+ * @entity: the entity being removed. |
4789 |
-+ */ |
4790 |
-+static void bfq_idle_extract(struct bfq_service_tree *st, |
4791 |
-+ struct bfq_entity *entity) |
4792 |
-+{ |
4793 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
4794 |
-+ struct rb_node *next; |
4795 |
-+ |
4796 |
-+ BUG_ON(entity->tree != &st->idle); |
4797 |
-+ |
4798 |
-+ if (entity == st->first_idle) { |
4799 |
-+ next = rb_next(&entity->rb_node); |
4800 |
-+ st->first_idle = bfq_entity_of(next); |
4801 |
-+ } |
4802 |
-+ |
4803 |
-+ if (entity == st->last_idle) { |
4804 |
-+ next = rb_prev(&entity->rb_node); |
4805 |
-+ st->last_idle = bfq_entity_of(next); |
4806 |
-+ } |
4807 |
-+ |
4808 |
-+ bfq_extract(&st->idle, entity); |
4809 |
-+ |
4810 |
-+ if (bfqq != NULL) |
4811 |
-+ list_del(&bfqq->bfqq_list); |
4812 |
-+} |
4813 |
-+ |
4814 |
-+/** |
4815 |
-+ * bfq_insert - generic tree insertion. |
4816 |
-+ * @root: tree root. |
4817 |
-+ * @entity: entity to insert. |
4818 |
-+ * |
4819 |
-+ * This is used for the idle and the active tree, since they are both |
4820 |
-+ * ordered by finish time. |
4821 |
-+ */ |
4822 |
-+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) |
4823 |
-+{ |
4824 |
-+ struct bfq_entity *entry; |
4825 |
-+ struct rb_node **node = &root->rb_node; |
4826 |
-+ struct rb_node *parent = NULL; |
4827 |
-+ |
4828 |
-+ BUG_ON(entity->tree != NULL); |
4829 |
-+ |
4830 |
-+ while (*node != NULL) { |
4831 |
-+ parent = *node; |
4832 |
-+ entry = rb_entry(parent, struct bfq_entity, rb_node); |
4833 |
-+ |
4834 |
-+ if (bfq_gt(entry->finish, entity->finish)) |
4835 |
-+ node = &parent->rb_left; |
4836 |
-+ else |
4837 |
-+ node = &parent->rb_right; |
4838 |
-+ } |
4839 |
-+ |
4840 |
-+ rb_link_node(&entity->rb_node, parent, node); |
4841 |
-+ rb_insert_color(&entity->rb_node, root); |
4842 |
-+ |
4843 |
-+ entity->tree = root; |
4844 |
-+} |
4845 |
-+ |
4846 |
-+/** |
4847 |
-+ * bfq_update_min - update the min_start field of a entity. |
4848 |
-+ * @entity: the entity to update. |
4849 |
-+ * @node: one of its children. |
4850 |
-+ * |
4851 |
-+ * This function is called when @entity may store an invalid value for |
4852 |
-+ * min_start due to updates to the active tree. The function assumes |
4853 |
-+ * that the subtree rooted at @node (which may be its left or its right |
4854 |
-+ * child) has a valid min_start value. |
4855 |
-+ */ |
4856 |
-+static inline void bfq_update_min(struct bfq_entity *entity, |
4857 |
-+ struct rb_node *node) |
4858 |
-+{ |
4859 |
-+ struct bfq_entity *child; |
4860 |
-+ |
4861 |
-+ if (node != NULL) { |
4862 |
-+ child = rb_entry(node, struct bfq_entity, rb_node); |
4863 |
-+ if (bfq_gt(entity->min_start, child->min_start)) |
4864 |
-+ entity->min_start = child->min_start; |
4865 |
-+ } |
4866 |
-+} |
4867 |
-+ |
4868 |
-+/** |
4869 |
-+ * bfq_update_active_node - recalculate min_start. |
4870 |
-+ * @node: the node to update. |
4871 |
-+ * |
4872 |
-+ * @node may have changed position or one of its children may have moved, |
4873 |
-+ * this function updates its min_start value. The left and right subtrees |
4874 |
-+ * are assumed to hold a correct min_start value. |
4875 |
-+ */ |
4876 |
-+static inline void bfq_update_active_node(struct rb_node *node) |
4877 |
-+{ |
4878 |
-+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); |
4879 |
-+ |
4880 |
-+ entity->min_start = entity->start; |
4881 |
-+ bfq_update_min(entity, node->rb_right); |
4882 |
-+ bfq_update_min(entity, node->rb_left); |
4883 |
-+} |
4884 |
-+ |
4885 |
-+/** |
4886 |
-+ * bfq_update_active_tree - update min_start for the whole active tree. |
4887 |
-+ * @node: the starting node. |
4888 |
-+ * |
4889 |
-+ * @node must be the deepest modified node after an update. This function |
4890 |
-+ * updates its min_start using the values held by its children, assuming |
4891 |
-+ * that they did not change, and then updates all the nodes that may have |
4892 |
-+ * changed in the path to the root. The only nodes that may have changed |
4893 |
-+ * are the ones in the path or their siblings. |
4894 |
-+ */ |
4895 |
-+static void bfq_update_active_tree(struct rb_node *node) |
4896 |
-+{ |
4897 |
-+ struct rb_node *parent; |
4898 |
-+ |
4899 |
-+up: |
4900 |
-+ bfq_update_active_node(node); |
4901 |
-+ |
4902 |
-+ parent = rb_parent(node); |
4903 |
-+ if (parent == NULL) |
4904 |
-+ return; |
4905 |
-+ |
4906 |
-+ if (node == parent->rb_left && parent->rb_right != NULL) |
4907 |
-+ bfq_update_active_node(parent->rb_right); |
4908 |
-+ else if (parent->rb_left != NULL) |
4909 |
-+ bfq_update_active_node(parent->rb_left); |
4910 |
-+ |
4911 |
-+ node = parent; |
4912 |
-+ goto up; |
4913 |
-+} |
4914 |
-+ |
4915 |
-+/** |
4916 |
-+ * bfq_active_insert - insert an entity in the active tree of its group/device. |
4917 |
-+ * @st: the service tree of the entity. |
4918 |
-+ * @entity: the entity being inserted. |
4919 |
-+ * |
4920 |
-+ * The active tree is ordered by finish time, but an extra key is kept |
4921 |
-+ * per each node, containing the minimum value for the start times of |
4922 |
-+ * its children (and the node itself), so it's possible to search for |
4923 |
-+ * the eligible node with the lowest finish time in logarithmic time. |
4924 |
-+ */ |
4925 |
-+static void bfq_active_insert(struct bfq_service_tree *st, |
4926 |
-+ struct bfq_entity *entity) |
4927 |
-+{ |
4928 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
4929 |
-+ struct rb_node *node = &entity->rb_node; |
4930 |
-+ |
4931 |
-+ bfq_insert(&st->active, entity); |
4932 |
-+ |
4933 |
-+ if (node->rb_left != NULL) |
4934 |
-+ node = node->rb_left; |
4935 |
-+ else if (node->rb_right != NULL) |
4936 |
-+ node = node->rb_right; |
4937 |
-+ |
4938 |
-+ bfq_update_active_tree(node); |
4939 |
-+ |
4940 |
-+ if (bfqq != NULL) |
4941 |
-+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); |
4942 |
-+} |
4943 |
-+ |
4944 |
-+/** |
4945 |
-+ * bfq_ioprio_to_weight - calc a weight from an ioprio. |
4946 |
-+ * @ioprio: the ioprio value to convert. |
4947 |
-+ */ |
4948 |
-+static unsigned short bfq_ioprio_to_weight(int ioprio) |
4949 |
-+{ |
4950 |
-+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); |
4951 |
-+ return IOPRIO_BE_NR - ioprio; |
4952 |
-+} |
4953 |
-+ |
4954 |
-+/** |
4955 |
-+ * bfq_weight_to_ioprio - calc an ioprio from a weight. |
4956 |
-+ * @weight: the weight value to convert. |
4957 |
-+ * |
4958 |
-+ * To preserve as much as possible the old only-ioprio user interface, |
4959 |
-+ * 0 is used as an escape ioprio value for weights (numerically) equal to |
4960 |
-+ * or larger than IOPRIO_BE_NR. |
4961 |
-+ */ |
4962 |
-+static unsigned short bfq_weight_to_ioprio(int weight) |
4963 |
-+{ |
4964 |
-+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); |
4965 |
-+ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; |
4966 |
-+} |
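(Illustrative aside, not part of the patch: together these two helpers mirror the legacy ioprio scale onto BFQ weights, so ioprio 0, the highest best-effort priority, maps to weight IOPRIO_BE_NR and ioprio IOPRIO_BE_NR-1 to weight 1, while weights at or above IOPRIO_BE_NR collapse to the escape ioprio 0 described above. A user-space sketch of the round trip, assuming IOPRIO_BE_NR is 8 as in the 3.13 headers:)

#include <stdio.h>

#define IOPRIO_BE_NR 8	/* value in the 3.13 headers; assumption of this sketch */

static unsigned short ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;		/* ioprio 0 (highest) -> weight 8 */
}

static unsigned short weight_to_ioprio(int weight)
{
	/* weights >= IOPRIO_BE_NR collapse to the escape ioprio 0 */
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	int ioprio;

	for (ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
		printf("ioprio %d -> weight %u -> ioprio %u\n", ioprio,
		       ioprio_to_weight(ioprio),
		       weight_to_ioprio(ioprio_to_weight(ioprio)));

	printf("weight 100 -> ioprio %u (escape value)\n", weight_to_ioprio(100));
	return 0;
}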
4967 |
-+ |
4968 |
-+static inline void bfq_get_entity(struct bfq_entity *entity) |
4969 |
-+{ |
4970 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
4971 |
-+ struct bfq_sched_data *sd; |
4972 |
-+ |
4973 |
-+ if (bfqq != NULL) { |
4974 |
-+ sd = entity->sched_data; |
4975 |
-+ atomic_inc(&bfqq->ref); |
4976 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", |
4977 |
-+ bfqq, atomic_read(&bfqq->ref)); |
4978 |
-+ } |
4979 |
-+} |
4980 |
-+ |
4981 |
-+/** |
4982 |
-+ * bfq_find_deepest - find the deepest node that an extraction can modify. |
4983 |
-+ * @node: the node being removed. |
4984 |
-+ * |
4985 |
-+ * Do the first step of an extraction in an rb tree, looking for the |
4986 |
-+ * node that will replace @node, and returning the deepest node that |
4987 |
-+ * the following modifications to the tree can touch. If @node is the |
4988 |
-+ * last node in the tree return %NULL. |
4989 |
-+ */ |
4990 |
-+static struct rb_node *bfq_find_deepest(struct rb_node *node) |
4991 |
-+{ |
4992 |
-+ struct rb_node *deepest; |
4993 |
-+ |
4994 |
-+ if (node->rb_right == NULL && node->rb_left == NULL) |
4995 |
-+ deepest = rb_parent(node); |
4996 |
-+ else if (node->rb_right == NULL) |
4997 |
-+ deepest = node->rb_left; |
4998 |
-+ else if (node->rb_left == NULL) |
4999 |
-+ deepest = node->rb_right; |
5000 |
-+ else { |
5001 |
-+ deepest = rb_next(node); |
5002 |
-+ if (deepest->rb_right != NULL) |
5003 |
-+ deepest = deepest->rb_right; |
5004 |
-+ else if (rb_parent(deepest) != node) |
5005 |
-+ deepest = rb_parent(deepest); |
5006 |
-+ } |
5007 |
-+ |
5008 |
-+ return deepest; |
5009 |
-+} |
5010 |
-+ |
5011 |
-+/** |
5012 |
-+ * bfq_active_extract - remove an entity from the active tree. |
5013 |
-+ * @st: the service_tree containing the tree. |
5014 |
-+ * @entity: the entity being removed. |
5015 |
-+ */ |
5016 |
-+static void bfq_active_extract(struct bfq_service_tree *st, |
5017 |
-+ struct bfq_entity *entity) |
5018 |
-+{ |
5019 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5020 |
-+ struct rb_node *node; |
5021 |
-+ |
5022 |
-+ node = bfq_find_deepest(&entity->rb_node); |
5023 |
-+ bfq_extract(&st->active, entity); |
5024 |
-+ |
5025 |
-+ if (node != NULL) |
5026 |
-+ bfq_update_active_tree(node); |
5027 |
-+ |
5028 |
-+ if (bfqq != NULL) |
5029 |
-+ list_del(&bfqq->bfqq_list); |
5030 |
-+} |
5031 |
-+ |
5032 |
-+/** |
5033 |
-+ * bfq_idle_insert - insert an entity into the idle tree. |
5034 |
-+ * @st: the service tree containing the tree. |
5035 |
-+ * @entity: the entity to insert. |
5036 |
-+ */ |
5037 |
-+static void bfq_idle_insert(struct bfq_service_tree *st, |
5038 |
-+ struct bfq_entity *entity) |
5039 |
-+{ |
5040 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5041 |
-+ struct bfq_entity *first_idle = st->first_idle; |
5042 |
-+ struct bfq_entity *last_idle = st->last_idle; |
5043 |
-+ |
5044 |
-+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) |
5045 |
-+ st->first_idle = entity; |
5046 |
-+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) |
5047 |
-+ st->last_idle = entity; |
5048 |
-+ |
5049 |
-+ bfq_insert(&st->idle, entity); |
5050 |
-+ |
5051 |
-+ if (bfqq != NULL) |
5052 |
-+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); |
5053 |
-+} |
5054 |
-+ |
5055 |
-+/** |
5056 |
-+ * bfq_forget_entity - remove an entity from the wfq trees. |
5057 |
-+ * @st: the service tree. |
5058 |
-+ * @entity: the entity being removed. |
5059 |
-+ * |
5060 |
-+ * Update the device status and forget everything about @entity, putting |
5061 |
-+ * the device reference to it, if it is a queue. Entities belonging to |
5062 |
-+ * groups are not refcounted. |
5063 |
-+ */ |
5064 |
-+static void bfq_forget_entity(struct bfq_service_tree *st, |
5065 |
-+ struct bfq_entity *entity) |
5066 |
-+{ |
5067 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5068 |
-+ struct bfq_sched_data *sd; |
5069 |
-+ |
5070 |
-+ BUG_ON(!entity->on_st); |
5071 |
-+ |
5072 |
-+ entity->on_st = 0; |
5073 |
-+ st->wsum -= entity->weight; |
5074 |
-+ if (bfqq != NULL) { |
5075 |
-+ sd = entity->sched_data; |
5076 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", |
5077 |
-+ bfqq, atomic_read(&bfqq->ref)); |
5078 |
-+ bfq_put_queue(bfqq); |
5079 |
-+ } |
5080 |
-+} |
5081 |
-+ |
5082 |
-+/** |
5083 |
-+ * bfq_put_idle_entity - release the idle tree ref of an entity. |
5084 |
-+ * @st: service tree for the entity. |
5085 |
-+ * @entity: the entity being released. |
5086 |
-+ */ |
5087 |
-+static void bfq_put_idle_entity(struct bfq_service_tree *st, |
5088 |
-+ struct bfq_entity *entity) |
5089 |
-+{ |
5090 |
-+ bfq_idle_extract(st, entity); |
5091 |
-+ bfq_forget_entity(st, entity); |
5092 |
-+} |
5093 |
-+ |
5094 |
-+/** |
5095 |
-+ * bfq_forget_idle - update the idle tree if necessary. |
5096 |
-+ * @st: the service tree to act upon. |
5097 |
-+ * |
5098 |
-+ * To preserve the global O(log N) complexity we only remove one entry here; |
5099 |
-+ * as the idle tree will not grow indefinitely this can be done safely. |
5100 |
-+ */ |
5101 |
-+static void bfq_forget_idle(struct bfq_service_tree *st) |
5102 |
-+{ |
5103 |
-+ struct bfq_entity *first_idle = st->first_idle; |
5104 |
-+ struct bfq_entity *last_idle = st->last_idle; |
5105 |
-+ |
5106 |
-+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && |
5107 |
-+ !bfq_gt(last_idle->finish, st->vtime)) { |
5108 |
-+ /* |
5109 |
-+ * Forget the whole idle tree, increasing the vtime past |
5110 |
-+ * the last finish time of idle entities. |
5111 |
-+ */ |
5112 |
-+ st->vtime = last_idle->finish; |
5113 |
-+ } |
5114 |
-+ |
5115 |
-+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) |
5116 |
-+ bfq_put_idle_entity(st, first_idle); |
5117 |
-+} |
5118 |
-+ |
5119 |
-+static struct bfq_service_tree * |
5120 |
-+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, |
5121 |
-+ struct bfq_entity *entity) |
5122 |
-+{ |
5123 |
-+ struct bfq_service_tree *new_st = old_st; |
5124 |
-+ |
5125 |
-+ if (entity->ioprio_changed) { |
5126 |
-+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
5127 |
-+ |
5128 |
-+ BUG_ON(old_st->wsum < entity->weight); |
5129 |
-+ old_st->wsum -= entity->weight; |
5130 |
-+ |
5131 |
-+ if (entity->new_weight != entity->orig_weight) { |
5132 |
-+ entity->orig_weight = entity->new_weight; |
5133 |
-+ entity->ioprio = |
5134 |
-+ bfq_weight_to_ioprio(entity->orig_weight); |
5135 |
-+ } else if (entity->new_ioprio != entity->ioprio) { |
5136 |
-+ entity->ioprio = entity->new_ioprio; |
5137 |
-+ entity->orig_weight = |
5138 |
-+ bfq_ioprio_to_weight(entity->ioprio); |
5139 |
-+ } else |
5140 |
-+ entity->new_weight = entity->orig_weight = |
5141 |
-+ bfq_ioprio_to_weight(entity->ioprio); |
5142 |
-+ |
5143 |
-+ entity->ioprio_class = entity->new_ioprio_class; |
5144 |
-+ entity->ioprio_changed = 0; |
5145 |
-+ |
5146 |
-+ /* |
5147 |
-+ * NOTE: here we may be changing the weight too early, |
5148 |
-+ * this will cause unfairness. The correct approach |
5149 |
-+ * would have required additional complexity to defer |
5150 |
-+ * weight changes to the proper time instants (i.e., |
5151 |
-+ * when entity->finish <= old_st->vtime). |
5152 |
-+ */ |
5153 |
-+ new_st = bfq_entity_service_tree(entity); |
5154 |
-+ entity->weight = entity->orig_weight * |
5155 |
-+ (bfqq != NULL ? bfqq->raising_coeff : 1); |
5156 |
-+ new_st->wsum += entity->weight; |
5157 |
-+ |
5158 |
-+ if (new_st != old_st) |
5159 |
-+ entity->start = new_st->vtime; |
5160 |
-+ } |
5161 |
-+ |
5162 |
-+ return new_st; |
5163 |
-+} |
5164 |
-+ |
5165 |
-+/** |
5166 |
-+ * bfq_bfqq_served - update the scheduler status after selection for service. |
5167 |
-+ * @bfqq: the queue being served. |
5168 |
-+ * @served: bytes to transfer. |
5169 |
-+ * |
5170 |
-+ * NOTE: this can be optimized, as the timestamps of upper level entities |
5171 |
-+ * are synchronized every time a new bfqq is selected for service. By now, |
5172 |
-+ * we keep it to better check consistency. |
5173 |
-+ */ |
5174 |
-+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) |
5175 |
-+{ |
5176 |
-+ struct bfq_entity *entity = &bfqq->entity; |
5177 |
-+ struct bfq_service_tree *st; |
5178 |
-+ |
5179 |
-+ for_each_entity(entity) { |
5180 |
-+ st = bfq_entity_service_tree(entity); |
5181 |
-+ |
5182 |
-+ entity->service += served; |
5183 |
-+ BUG_ON(entity->service > entity->budget); |
5184 |
-+ BUG_ON(st->wsum == 0); |
5185 |
-+ |
5186 |
-+ st->vtime += bfq_delta(served, st->wsum); |
5187 |
-+ bfq_forget_idle(st); |
5188 |
-+ } |
5189 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); |
5190 |
-+} |
5191 |
-+ |
5192 |
-+/** |
5193 |
-+ * bfq_bfqq_charge_full_budget - set the service to the entity budget. |
5194 |
-+ * @bfqq: the queue that needs a service update. |
5195 |
-+ * |
5196 |
-+ * When it's not possible to be fair in the service domain, because |
5197 |
-+ * a queue is not consuming its budget fast enough (the meaning of |
5198 |
-+ * fast depends on the timeout parameter), we charge it a full |
5199 |
-+ * budget. In this way we should obtain a sort of time-domain |
5200 |
-+ * fairness among all the seeky/slow queues. |
5201 |
-+ */ |
5202 |
-+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) |
5203 |
-+{ |
5204 |
-+ struct bfq_entity *entity = &bfqq->entity; |
5205 |
-+ |
5206 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); |
5207 |
-+ |
5208 |
-+ bfq_bfqq_served(bfqq, entity->budget - entity->service); |
5209 |
-+} |
5210 |
-+ |
5211 |
-+/** |
5212 |
-+ * __bfq_activate_entity - activate an entity. |
5213 |
-+ * @entity: the entity being activated. |
5214 |
-+ * |
5215 |
-+ * Called whenever an entity is activated, i.e., it is not active and one |
5216 |
-+ * of its children receives a new request, or has to be reactivated due to |
5217 |
-+ * budget exhaustion. It uses the current budget of the entity (and the |
5218 |
-+ * service received if @entity is active) of the queue to calculate its |
5219 |
-+ * timestamps. |
5220 |
-+ */ |
5221 |
-+static void __bfq_activate_entity(struct bfq_entity *entity) |
5222 |
-+{ |
5223 |
-+ struct bfq_sched_data *sd = entity->sched_data; |
5224 |
-+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
5225 |
-+ |
5226 |
-+ if (entity == sd->active_entity) { |
5227 |
-+ BUG_ON(entity->tree != NULL); |
5228 |
-+ /* |
5229 |
-+ * If we are requeueing the current entity we have |
5230 |
-+ * to take care of not charging to it service it has |
5231 |
-+ * not received. |
5232 |
-+ */ |
5233 |
-+ bfq_calc_finish(entity, entity->service); |
5234 |
-+ entity->start = entity->finish; |
5235 |
-+ sd->active_entity = NULL; |
5236 |
-+ } else if (entity->tree == &st->active) { |
5237 |
-+ /* |
5238 |
-+ * Requeueing an entity due to a change of some |
5239 |
-+ * next_active entity below it. We reuse the old |
5240 |
-+ * start time. |
5241 |
-+ */ |
5242 |
-+ bfq_active_extract(st, entity); |
5243 |
-+ } else if (entity->tree == &st->idle) { |
5244 |
-+ /* |
5245 |
-+ * Must be on the idle tree, bfq_idle_extract() will |
5246 |
-+ * check for that. |
5247 |
-+ */ |
5248 |
-+ bfq_idle_extract(st, entity); |
5249 |
-+ entity->start = bfq_gt(st->vtime, entity->finish) ? |
5250 |
-+ st->vtime : entity->finish; |
5251 |
-+ } else { |
5252 |
-+ /* |
5253 |
-+ * The finish time of the entity may be invalid, and |
5254 |
-+ * it is in the past for sure, otherwise the queue |
5255 |
-+ * would have been on the idle tree. |
5256 |
-+ */ |
5257 |
-+ entity->start = st->vtime; |
5258 |
-+ st->wsum += entity->weight; |
5259 |
-+ bfq_get_entity(entity); |
5260 |
-+ |
5261 |
-+ BUG_ON(entity->on_st); |
5262 |
-+ entity->on_st = 1; |
5263 |
-+ } |
5264 |
-+ |
5265 |
-+ st = __bfq_entity_update_weight_prio(st, entity); |
5266 |
-+ bfq_calc_finish(entity, entity->budget); |
5267 |
-+ bfq_active_insert(st, entity); |
5268 |
-+} |
5269 |
-+ |
5270 |
-+/** |
5271 |
-+ * bfq_activate_entity - activate an entity and its ancestors if necessary. |
5272 |
-+ * @entity: the entity to activate. |
5273 |
-+ * |
5274 |
-+ * Activate @entity and all the entities on the path from it to the root. |
5275 |
-+ */ |
5276 |
-+static void bfq_activate_entity(struct bfq_entity *entity) |
5277 |
-+{ |
5278 |
-+ struct bfq_sched_data *sd; |
5279 |
-+ |
5280 |
-+ for_each_entity(entity) { |
5281 |
-+ __bfq_activate_entity(entity); |
5282 |
-+ |
5283 |
-+ sd = entity->sched_data; |
5284 |
-+ if (!bfq_update_next_active(sd)) |
5285 |
-+ /* |
5286 |
-+ * No need to propagate the activation to the |
5287 |
-+ * upper entities, as they will be updated when |
5288 |
-+ * the active entity is rescheduled. |
5289 |
-+ */ |
5290 |
-+ break; |
5291 |
-+ } |
5292 |
-+} |
5293 |
-+ |
5294 |
-+/** |
5295 |
-+ * __bfq_deactivate_entity - deactivate an entity from its service tree. |
5296 |
-+ * @entity: the entity to deactivate. |
5297 |
-+ * @requeue: if false, the entity will not be put into the idle tree. |
5298 |
-+ * |
5299 |
-+ * Deactivate an entity, independently from its previous state. If the |
5300 |
-+ * entity was not on a service tree just return, otherwise if it is on |
5301 |
-+ * any scheduler tree, extract it from that tree, and if necessary |
5302 |
-+ * and if the caller did not specify @requeue, put it on the idle tree. |
5303 |
-+ * |
5304 |
-+ * Return %1 if the caller should update the entity hierarchy, i.e., |
5305 |
-+ * if the entity was under service or if it was the next_active for |
5306 |
-+ * its sched_data; return %0 otherwise. |
5307 |
-+ */ |
5308 |
-+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
5309 |
-+{ |
5310 |
-+ struct bfq_sched_data *sd = entity->sched_data; |
5311 |
-+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
5312 |
-+ int was_active = entity == sd->active_entity; |
5313 |
-+ int ret = 0; |
5314 |
-+ |
5315 |
-+ if (!entity->on_st) |
5316 |
-+ return 0; |
5317 |
-+ |
5318 |
-+ BUG_ON(was_active && entity->tree != NULL); |
5319 |
-+ |
5320 |
-+ if (was_active) { |
5321 |
-+ bfq_calc_finish(entity, entity->service); |
5322 |
-+ sd->active_entity = NULL; |
5323 |
-+ } else if (entity->tree == &st->active) |
5324 |
-+ bfq_active_extract(st, entity); |
5325 |
-+ else if (entity->tree == &st->idle) |
5326 |
-+ bfq_idle_extract(st, entity); |
5327 |
-+ else if (entity->tree != NULL) |
5328 |
-+ BUG(); |
5329 |
-+ |
5330 |
-+ if (was_active || sd->next_active == entity) |
5331 |
-+ ret = bfq_update_next_active(sd); |
5332 |
-+ |
5333 |
-+ if (!requeue || !bfq_gt(entity->finish, st->vtime)) |
5334 |
-+ bfq_forget_entity(st, entity); |
5335 |
-+ else |
5336 |
-+ bfq_idle_insert(st, entity); |
5337 |
-+ |
5338 |
-+ BUG_ON(sd->active_entity == entity); |
5339 |
-+ BUG_ON(sd->next_active == entity); |
5340 |
-+ |
5341 |
-+ return ret; |
5342 |
-+} |
5343 |
-+ |
5344 |
-+/** |
5345 |
-+ * bfq_deactivate_entity - deactivate an entity. |
5346 |
-+ * @entity: the entity to deactivate. |
5347 |
-+ * @requeue: true if the entity can be put on the idle tree |
5348 |
-+ */ |
5349 |
-+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
5350 |
-+{ |
5351 |
-+ struct bfq_sched_data *sd; |
5352 |
-+ struct bfq_entity *parent; |
5353 |
-+ |
5354 |
-+ for_each_entity_safe(entity, parent) { |
5355 |
-+ sd = entity->sched_data; |
5356 |
-+ |
5357 |
-+ if (!__bfq_deactivate_entity(entity, requeue)) |
5358 |
-+ /* |
5359 |
-+ * The parent entity is still backlogged, and |
5360 |
-+ * we don't need to update it as it is still |
5361 |
-+ * under service. |
5362 |
-+ */ |
5363 |
-+ break; |
5364 |
-+ |
5365 |
-+ if (sd->next_active != NULL) |
5366 |
-+ /* |
5367 |
-+ * The parent entity is still backlogged and |
5368 |
-+ * the budgets on the path towards the root |
5369 |
-+ * need to be updated. |
5370 |
-+ */ |
5371 |
-+ goto update; |
5372 |
-+ |
5373 |
-+ /* |
5374 |
-+ * If we get here, the parent is no longer backlogged and |
5375 |
-+ * we want to propagate the dequeue upwards. |
5376 |
-+ */ |
5377 |
-+ requeue = 1; |
5378 |
-+ } |
5379 |
-+ |
5380 |
-+ return; |
5381 |
-+ |
5382 |
-+update: |
5383 |
-+ entity = parent; |
5384 |
-+ for_each_entity(entity) { |
5385 |
-+ __bfq_activate_entity(entity); |
5386 |
-+ |
5387 |
-+ sd = entity->sched_data; |
5388 |
-+ if (!bfq_update_next_active(sd)) |
5389 |
-+ break; |
5390 |
-+ } |
5391 |
-+} |
5392 |
-+ |
5393 |
-+/** |
5394 |
-+ * bfq_update_vtime - update vtime if necessary. |
5395 |
-+ * @st: the service tree to act upon. |
5396 |
-+ * |
5397 |
-+ * If necessary update the service tree vtime to have at least one |
5398 |
-+ * eligible entity, skipping to its start time. Assumes that the |
5399 |
-+ * active tree of the device is not empty. |
5400 |
-+ * |
5401 |
-+ * NOTE: this hierarchical implementation updates vtimes quite often; |
5402 |
-+ * we may end up with reactivated tasks getting timestamps after a |
5403 |
-+ * vtime skip done because we needed a ->first_active entity on some |
5404 |
-+ * intermediate node. |
5405 |
-+ */ |
5406 |
-+static void bfq_update_vtime(struct bfq_service_tree *st) |
5407 |
-+{ |
5408 |
-+ struct bfq_entity *entry; |
5409 |
-+ struct rb_node *node = st->active.rb_node; |
5410 |
-+ |
5411 |
-+ entry = rb_entry(node, struct bfq_entity, rb_node); |
5412 |
-+ if (bfq_gt(entry->min_start, st->vtime)) { |
5413 |
-+ st->vtime = entry->min_start; |
5414 |
-+ bfq_forget_idle(st); |
5415 |
-+ } |
5416 |
-+} |
5417 |
-+ |
5418 |
-+/** |
5419 |
-+ * bfq_first_active_entity - find the eligible entity with the smallest finish time. |
5420 |
-+ * @st: the service tree to select from. |
5421 |
-+ * |
5422 |
-+ * This function searches the first schedulable entity, starting from the |
5423 |
-+ * root of the tree and going on the left every time on this side there is |
5424 |
-+ * a subtree with at least one eligible (start >= vtime) entity. The path |
5425 |
-+ * on the right is followed only if a) the left subtree contains no eligible |
5426 |
-+ * entities and b) no eligible entity has been found yet. |
5427 |
-+ */ |
5428 |
-+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) |
5429 |
-+{ |
5430 |
-+ struct bfq_entity *entry, *first = NULL; |
5431 |
-+ struct rb_node *node = st->active.rb_node; |
5432 |
-+ |
5433 |
-+ while (node != NULL) { |
5434 |
-+ entry = rb_entry(node, struct bfq_entity, rb_node); |
5435 |
-+left: |
5436 |
-+ if (!bfq_gt(entry->start, st->vtime)) |
5437 |
-+ first = entry; |
5438 |
-+ |
5439 |
-+ BUG_ON(bfq_gt(entry->min_start, st->vtime)); |
5440 |
-+ |
5441 |
-+ if (node->rb_left != NULL) { |
5442 |
-+ entry = rb_entry(node->rb_left, |
5443 |
-+ struct bfq_entity, rb_node); |
5444 |
-+ if (!bfq_gt(entry->min_start, st->vtime)) { |
5445 |
-+ node = node->rb_left; |
5446 |
-+ goto left; |
5447 |
-+ } |
5448 |
-+ } |
5449 |
-+ if (first != NULL) |
5450 |
-+ break; |
5451 |
-+ node = node->rb_right; |
5452 |
-+ } |
5453 |
-+ |
5454 |
-+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); |
5455 |
-+ return first; |
5456 |
-+} |
5457 |
-+ |
5458 |
-+/** |
5459 |
-+ * __bfq_lookup_next_entity - return the first eligible entity in @st. |
5460 |
-+ * @st: the service tree. |
5461 |
-+ * |
5462 |
-+ * Update the virtual time in @st and return the first eligible entity |
5463 |
-+ * it contains. |
5464 |
-+ */ |
5465 |
-+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, |
5466 |
-+ bool force) |
5467 |
-+{ |
5468 |
-+ struct bfq_entity *entity, *new_next_active = NULL; |
5469 |
-+ |
5470 |
-+ if (RB_EMPTY_ROOT(&st->active)) |
5471 |
-+ return NULL; |
5472 |
-+ |
5473 |
-+ bfq_update_vtime(st); |
5474 |
-+ entity = bfq_first_active_entity(st); |
5475 |
-+ BUG_ON(bfq_gt(entity->start, st->vtime)); |
5476 |
-+ |
5477 |
-+ /* |
5478 |
-+ * If the chosen entity does not match with the sched_data's |
5479 |
-+ * next_active and we are forcedly serving the IDLE priority |
5480 |
-+ * class tree, bubble up budget update. |
5481 |
-+ */ |
5482 |
-+ if (unlikely(force && entity != entity->sched_data->next_active)) { |
5483 |
-+ new_next_active = entity; |
5484 |
-+ for_each_entity(new_next_active) |
5485 |
-+ bfq_update_budget(new_next_active); |
5486 |
-+ } |
5487 |
-+ |
5488 |
-+ return entity; |
5489 |
-+} |
5490 |
-+ |
5491 |
-+/** |
5492 |
-+ * bfq_lookup_next_entity - return the first eligible entity in @sd. |
5493 |
-+ * @sd: the sched_data. |
5494 |
-+ * @extract: if true the returned entity will be also extracted from @sd. |
5495 |
-+ * |
5496 |
-+ * NOTE: since we cache the next_active entity at each level of the |
5497 |
-+ * hierarchy, the complexity of the lookup can be decreased with |
5498 |
-+ * absolutely no effort just returning the cached next_active value; |
5499 |
-+ * we prefer to do full lookups to test the consistency of the data |
5500 |
-+ * structures. |
5501 |
-+ */ |
5502 |
-+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
5503 |
-+ int extract, |
5504 |
-+ struct bfq_data *bfqd) |
5505 |
-+{ |
5506 |
-+ struct bfq_service_tree *st = sd->service_tree; |
5507 |
-+ struct bfq_entity *entity; |
5508 |
-+ int i = 0; |
5509 |
-+ |
5510 |
-+ BUG_ON(sd->active_entity != NULL); |
5511 |
-+ |
5512 |
-+ if (bfqd != NULL && |
5513 |
-+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { |
5514 |
-+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, |
5515 |
-+ true); |
5516 |
-+ if (entity != NULL) { |
5517 |
-+ i = BFQ_IOPRIO_CLASSES - 1; |
5518 |
-+ bfqd->bfq_class_idle_last_service = jiffies; |
5519 |
-+ sd->next_active = entity; |
5520 |
-+ } |
5521 |
-+ } |
5522 |
-+ for (; i < BFQ_IOPRIO_CLASSES; i++) { |
5523 |
-+ entity = __bfq_lookup_next_entity(st + i, false); |
5524 |
-+ if (entity != NULL) { |
5525 |
-+ if (extract) { |
5526 |
-+ bfq_check_next_active(sd, entity); |
5527 |
-+ bfq_active_extract(st + i, entity); |
5528 |
-+ sd->active_entity = entity; |
5529 |
-+ sd->next_active = NULL; |
5530 |
-+ } |
5531 |
-+ break; |
5532 |
-+ } |
5533 |
-+ } |
5534 |
-+ |
5535 |
-+ return entity; |
5536 |
-+} |
5537 |
-+ |
5538 |
-+/* |
5539 |
-+ * Get next queue for service. |
5540 |
-+ */ |
5541 |
-+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) |
5542 |
-+{ |
5543 |
-+ struct bfq_entity *entity = NULL; |
5544 |
-+ struct bfq_sched_data *sd; |
5545 |
-+ struct bfq_queue *bfqq; |
5546 |
-+ |
5547 |
-+ BUG_ON(bfqd->in_service_queue != NULL); |
5548 |
-+ |
5549 |
-+ if (bfqd->busy_queues == 0) |
5550 |
-+ return NULL; |
5551 |
-+ |
5552 |
-+ sd = &bfqd->root_group->sched_data; |
5553 |
-+ for (; sd != NULL; sd = entity->my_sched_data) { |
5554 |
-+ entity = bfq_lookup_next_entity(sd, 1, bfqd); |
5555 |
-+ BUG_ON(entity == NULL); |
5556 |
-+ entity->service = 0; |
5557 |
-+ } |
5558 |
-+ |
5559 |
-+ bfqq = bfq_entity_to_bfqq(entity); |
5560 |
-+ BUG_ON(bfqq == NULL); |
5561 |
-+ |
5562 |
-+ return bfqq; |
5563 |
-+} |
5564 |
-+ |
5565 |
-+/* |
5566 |
-+ * Forced extraction of the given queue. |
5567 |
-+ */ |
5568 |
-+static void bfq_get_next_queue_forced(struct bfq_data *bfqd, |
5569 |
-+ struct bfq_queue *bfqq) |
5570 |
-+{ |
5571 |
-+ struct bfq_entity *entity; |
5572 |
-+ struct bfq_sched_data *sd; |
5573 |
-+ |
5574 |
-+ BUG_ON(bfqd->in_service_queue != NULL); |
5575 |
-+ |
5576 |
-+ entity = &bfqq->entity; |
5577 |
-+ /* |
5578 |
-+ * Bubble up extraction/update from the leaf to the root. |
5579 |
-+ */ |
5580 |
-+ for_each_entity(entity) { |
5581 |
-+ sd = entity->sched_data; |
5582 |
-+ bfq_update_budget(entity); |
5583 |
-+ bfq_update_vtime(bfq_entity_service_tree(entity)); |
5584 |
-+ bfq_active_extract(bfq_entity_service_tree(entity), entity); |
5585 |
-+ sd->active_entity = entity; |
5586 |
-+ sd->next_active = NULL; |
5587 |
-+ entity->service = 0; |
5588 |
-+ } |
5589 |
-+ |
5590 |
-+ return; |
5591 |
-+} |
5592 |
-+ |
5593 |
-+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) |
5594 |
-+{ |
5595 |
-+ if (bfqd->in_service_bic != NULL) { |
5596 |
-+ put_io_context(bfqd->in_service_bic->icq.ioc); |
5597 |
-+ bfqd->in_service_bic = NULL; |
5598 |
-+ } |
5599 |
-+ |
5600 |
-+ bfqd->in_service_queue = NULL; |
5601 |
-+ del_timer(&bfqd->idle_slice_timer); |
5602 |
-+} |
5603 |
-+ |
5604 |
-+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
5605 |
-+ int requeue) |
5606 |
-+{ |
5607 |
-+ struct bfq_entity *entity = &bfqq->entity; |
5608 |
-+ |
5609 |
-+ if (bfqq == bfqd->in_service_queue) |
5610 |
-+ __bfq_bfqd_reset_in_service(bfqd); |
5611 |
-+ |
5612 |
-+ bfq_deactivate_entity(entity, requeue); |
5613 |
-+} |
5614 |
-+ |
5615 |
-+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
5616 |
-+{ |
5617 |
-+ struct bfq_entity *entity = &bfqq->entity; |
5618 |
-+ |
5619 |
-+ bfq_activate_entity(entity); |
5620 |
-+} |
5621 |
-+ |
5622 |
-+/* |
5623 |
-+ * Called when the bfqq no longer has requests pending, remove it from |
5624 |
-+ * the service tree. |
5625 |
-+ */ |
5626 |
-+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
5627 |
-+ int requeue) |
5628 |
-+{ |
5629 |
-+ BUG_ON(!bfq_bfqq_busy(bfqq)); |
5630 |
-+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
5631 |
-+ |
5632 |
-+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); |
5633 |
-+ |
5634 |
-+ bfq_clear_bfqq_busy(bfqq); |
5635 |
-+ |
5636 |
-+ BUG_ON(bfqd->busy_queues == 0); |
5637 |
-+ bfqd->busy_queues--; |
5638 |
-+ if (bfqq->raising_coeff > 1) |
5639 |
-+ bfqd->raised_busy_queues--; |
5640 |
-+ |
5641 |
-+ bfq_deactivate_bfqq(bfqd, bfqq, requeue); |
5642 |
-+} |
5643 |
-+ |
5644 |
-+/* |
5645 |
-+ * Called when an inactive queue receives a new request. |
5646 |
-+ */ |
5647 |
-+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
5648 |
-+{ |
5649 |
-+ BUG_ON(bfq_bfqq_busy(bfqq)); |
5650 |
-+ BUG_ON(bfqq == bfqd->in_service_queue); |
5651 |
-+ |
5652 |
-+ bfq_log_bfqq(bfqd, bfqq, "add to busy"); |
5653 |
-+ |
5654 |
-+ bfq_activate_bfqq(bfqd, bfqq); |
5655 |
-+ |
5656 |
-+ bfq_mark_bfqq_busy(bfqq); |
5657 |
-+ bfqd->busy_queues++; |
5658 |
-+ if (bfqq->raising_coeff > 1) |
5659 |
-+ bfqd->raised_busy_queues++; |
5660 |
-+} |
5661 |
-diff --git a/block/bfq.h b/block/bfq.h |
5662 |
-new file mode 100644 |
5663 |
-index 0000000..68b28e3 |
5664 |
---- /dev/null |
5665 |
-+++ b/block/bfq.h |
5666 |
-@@ -0,0 +1,614 @@ |
5667 |
-+/* |
5668 |
-+ * BFQ-v7 for 3.13.0: data structures and common function prototypes. |
5669 |
-+ * |
5670 |
-+ * Based on ideas and code from CFQ: |
5671 |
-+ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
5672 |
-+ * |
5673 |
-+ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
5674 |
-+ * Paolo Valente <paolo.valente@×××××××.it> |
5675 |
-+ * |
5676 |
-+ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
5677 |
-+ */ |
5678 |
-+ |
5679 |
-+#ifndef _BFQ_H |
5680 |
-+#define _BFQ_H |
5681 |
-+ |
5682 |
-+#include <linux/blktrace_api.h> |
5683 |
-+#include <linux/hrtimer.h> |
5684 |
-+#include <linux/ioprio.h> |
5685 |
-+#include <linux/rbtree.h> |
5686 |
-+ |
5687 |
-+#define BFQ_IOPRIO_CLASSES 3 |
5688 |
-+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) |
5689 |
-+ |
5690 |
-+#define BFQ_MIN_WEIGHT 1 |
5691 |
-+#define BFQ_MAX_WEIGHT 1000 |
5692 |
-+ |
5693 |
-+#define BFQ_DEFAULT_GRP_WEIGHT 10 |
5694 |
-+#define BFQ_DEFAULT_GRP_IOPRIO 0 |
5695 |
-+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE |
5696 |
-+ |
5697 |
-+struct bfq_entity; |
5698 |
-+ |
5699 |
-+/** |
5700 |
-+ * struct bfq_service_tree - per ioprio_class service tree. |
5701 |
-+ * @active: tree for active entities (i.e., those backlogged). |
5702 |
-+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). |
5703 |
-+ * @first_idle: idle entity with minimum F_i. |
5704 |
-+ * @last_idle: idle entity with maximum F_i. |
5705 |
-+ * @vtime: scheduler virtual time. |
5706 |
-+ * @wsum: scheduler weight sum; active and idle entities contribute to it. |
5707 |
-+ * |
5708 |
-+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each |
5709 |
-+ * ioprio_class has its own independent scheduler, and so its own |
5710 |
-+ * bfq_service_tree. All the fields are protected by the queue lock |
5711 |
-+ * of the containing bfqd. |
5712 |
-+ */ |
5713 |
-+struct bfq_service_tree { |
5714 |
-+ struct rb_root active; |
5715 |
-+ struct rb_root idle; |
5716 |
-+ |
5717 |
-+ struct bfq_entity *first_idle; |
5718 |
-+ struct bfq_entity *last_idle; |
5719 |
-+ |
5720 |
-+ u64 vtime; |
5721 |
-+ unsigned long wsum; |
5722 |
-+}; |
5723 |
-+ |
5724 |
-+/** |
5725 |
-+ * struct bfq_sched_data - multi-class scheduler. |
5726 |
-+ * @active_entity: entity under service. |
5727 |
-+ * @next_active: head-of-the-line entity in the scheduler. |
5728 |
-+ * @service_tree: array of service trees, one per ioprio_class. |
5729 |
-+ * |
5730 |
-+ * bfq_sched_data is the basic scheduler queue. It supports three |
5731 |
-+ * ioprio_classes, and can be used either as a toplevel queue or as |
5732 |
-+ * an intermediate queue on a hierarchical setup. |
5733 |
-+ * @next_active points to the active entity of the sched_data service |
5734 |
-+ * trees that will be scheduled next. |
5735 |
-+ * |
5736 |
-+ * The supported ioprio_classes are the same as in CFQ, in descending |
5737 |
-+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. |
5738 |
-+ * Requests from higher priority queues are served before all the |
5739 |
-+ * requests from lower priority queues; among requests of the same |
5740 |
-+ * queue requests are served according to B-WF2Q+. |
5741 |
-+ * All the fields are protected by the queue lock of the containing bfqd. |
5742 |
-+ */ |
5743 |
-+struct bfq_sched_data { |
5744 |
-+ struct bfq_entity *active_entity; |
5745 |
-+ struct bfq_entity *next_active; |
5746 |
-+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; |
5747 |
-+}; |
5748 |
-+ |
5749 |
-+/** |
5750 |
-+ * struct bfq_entity - schedulable entity. |
5751 |
-+ * @rb_node: service_tree member. |
5752 |
-+ * @on_st: flag, true if the entity is on a tree (either the active or |
5753 |
-+ * the idle one of its service_tree). |
5754 |
-+ * @finish: B-WF2Q+ finish timestamp (aka F_i). |
5755 |
-+ * @start: B-WF2Q+ start timestamp (aka S_i). |
5756 |
-+ * @tree: tree the entity is enqueued into; %NULL if not on a tree. |
5757 |
-+ * @min_start: minimum start time of the (active) subtree rooted at |
5758 |
-+ * this entity; used for O(log N) lookups into active trees. |
5759 |
-+ * @service: service received during the last round of service. |
5760 |
-+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. |
5761 |
-+ * @weight: weight of the queue |
5762 |
-+ * @parent: parent entity, for hierarchical scheduling. |
5763 |
-+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the |
5764 |
-+ * associated scheduler queue, %NULL on leaf nodes. |
5765 |
-+ * @sched_data: the scheduler queue this entity belongs to. |
5766 |
-+ * @ioprio: the ioprio in use. |
5767 |
-+ * @new_weight: when a weight change is requested, the new weight value. |
5768 |
-+ * @orig_weight: original weight, used to implement weight boosting |
5769 |
-+ * @new_ioprio: when an ioprio change is requested, the new ioprio value. |
5770 |
-+ * @ioprio_class: the ioprio_class in use. |
5771 |
-+ * @new_ioprio_class: when an ioprio_class change is requested, the new |
5772 |
-+ * ioprio_class value. |
5773 |
-+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or |
5774 |
-+ * ioprio_class change. |
5775 |
-+ * |
5776 |
-+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the |
5777 |
-+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each |
5778 |
-+ * entity belongs to the sched_data of the parent group in the cgroup |
5779 |
-+ * hierarchy. Non-leaf entities have also their own sched_data, stored |
5780 |
-+ * in @my_sched_data. |
5781 |
-+ * |
5782 |
-+ * Each entity stores independently its priority values; this would |
5783 |
-+ * allow different weights on different devices, but this |
5784 |
-+ * functionality is not exported to userspace for now. Priorities and |
5785 |
-+ * weights are updated lazily, first storing the new values into the |
5786 |
-+ * new_* fields, then setting the @ioprio_changed flag. As soon as |
5787 |
-+ * there is a transition in the entity state that allows the priority |
5788 |
-+ * update to take place the effective and the requested priority |
5789 |
-+ * values are synchronized. |
5790 |
-+ * |
5791 |
-+ * Unless cgroups are used, the weight value is calculated from the |
5792 |
-+ * ioprio to export the same interface as CFQ. When dealing with |
5793 |
-+ * ``well-behaved'' queues (i.e., queues that do not spend too much |
5794 |
-+ * time to consume their budget and have true sequential behavior, and |
5795 |
-+ * when there are no external factors breaking anticipation) the |
5796 |
-+ * relative weights at each level of the cgroups hierarchy should be |
5797 |
-+ * guaranteed. All the fields are protected by the queue lock of the |
5798 |
-+ * containing bfqd. |
5799 |
-+ */ |
5800 |
-+struct bfq_entity { |
5801 |
-+ struct rb_node rb_node; |
5802 |
-+ |
5803 |
-+ int on_st; |
5804 |
-+ |
5805 |
-+ u64 finish; |
5806 |
-+ u64 start; |
5807 |
-+ |
5808 |
-+ struct rb_root *tree; |
5809 |
-+ |
5810 |
-+ u64 min_start; |
5811 |
-+ |
5812 |
-+ unsigned long service, budget; |
5813 |
-+ unsigned short weight, new_weight; |
5814 |
-+ unsigned short orig_weight; |
5815 |
-+ |
5816 |
-+ struct bfq_entity *parent; |
5817 |
-+ |
5818 |
-+ struct bfq_sched_data *my_sched_data; |
5819 |
-+ struct bfq_sched_data *sched_data; |
5820 |
-+ |
5821 |
-+ unsigned short ioprio, new_ioprio; |
5822 |
-+ unsigned short ioprio_class, new_ioprio_class; |
5823 |
-+ |
5824 |
-+ int ioprio_changed; |
5825 |
-+}; |
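
The comment block above describes a lazy update protocol: new priority and weight values are parked in the new_* fields, @ioprio_changed is raised, and the switch happens at the next entity state transition that allows it. A minimal sketch of that synchronization step, assuming only the fields defined in the struct above (the helper name is illustrative, not the patch's actual function):

    /* Sketch only: fold pending new_* values into the live fields. */
    static void entity_apply_pending_changes(struct bfq_entity *entity)
    {
            if (!entity->ioprio_changed)
                    return;

            entity->orig_weight  = entity->new_weight;
            entity->weight       = entity->new_weight;
            entity->ioprio       = entity->new_ioprio;
            entity->ioprio_class = entity->new_ioprio_class;
            entity->ioprio_changed = 0;
    }

In the scheduler itself this synchronization is tied to the entity's (re)activation, so the B-WF2Q+ timestamps can be recomputed against the new weight.
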
5826 |
-+ |
5827 |
-+struct bfq_group; |
5828 |
-+ |
5829 |
-+/** |
5830 |
-+ * struct bfq_queue - leaf schedulable entity. |
5831 |
-+ * @ref: reference counter. |
5832 |
-+ * @bfqd: parent bfq_data. |
5833 |
-+ * @new_bfqq: shared bfq_queue if queue is cooperating with |
5834 |
-+ * one or more other queues. |
5835 |
-+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). |
5836 |
-+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). |
5837 |
-+ * @sort_list: sorted list of pending requests. |
5838 |
-+ * @next_rq: if fifo isn't expired, next request to serve. |
5839 |
-+ * @queued: nr of requests queued in @sort_list. |
5840 |
-+ * @allocated: currently allocated requests. |
5841 |
-+ * @meta_pending: pending metadata requests. |
5842 |
-+ * @fifo: fifo list of requests in sort_list. |
5843 |
-+ * @entity: entity representing this queue in the scheduler. |
5844 |
-+ * @max_budget: maximum budget allowed from the feedback mechanism. |
5845 |
-+ * @budget_timeout: budget expiration (in jiffies). |
5846 |
-+ * @dispatched: number of requests on the dispatch list or inside driver. |
5847 |
-+ * @org_ioprio: saved ioprio during boosted periods. |
5848 |
-+ * @flags: status flags. |
5849 |
-+ * @bfqq_list: node for active/idle bfqq list inside our bfqd. |
5850 |
-+ * @seek_samples: number of seeks sampled |
5851 |
-+ * @seek_total: sum of the distances of the seeks sampled |
5852 |
-+ * @seek_mean: mean seek distance |
5853 |
-+ * @last_request_pos: position of the last request enqueued |
5854 |
-+ * @pid: pid of the process owning the queue, used for logging purposes. |
5855 |
-+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt |
5856 |
-+ * @raising_cur_max_time: current max raising time for this queue |
5857 |
-+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from |
5858 |
-+ * idle to backlogged |
5859 |
-+ * @service_from_backlogged: cumulative service received from the @bfq_queue |
5860 |
-+ * since the last transition from idle to backlogged |
5861 |
-+ * |
5862 |
-+ * A bfq_queue is a leaf request queue; it can be associated to an io_context |
5863 |
-+ * or more (if it is an async one). @cgroup holds a reference to the |
5864 |
-+ * cgroup, to be sure that it does not disappear while a bfqq still |
5865 |
-+ * references it (mostly to avoid races between request issuing and task |
5866 |
-+ * migration followed by cgroup destruction). |
5867 |
-+ * All the fields are protected by the queue lock of the containing bfqd. |
5868 |
-+ */ |
5869 |
-+struct bfq_queue { |
5870 |
-+ atomic_t ref; |
5871 |
-+ struct bfq_data *bfqd; |
5872 |
-+ |
5873 |
-+ /* fields for cooperating queues handling */ |
5874 |
-+ struct bfq_queue *new_bfqq; |
5875 |
-+ struct rb_node pos_node; |
5876 |
-+ struct rb_root *pos_root; |
5877 |
-+ |
5878 |
-+ struct rb_root sort_list; |
5879 |
-+ struct request *next_rq; |
5880 |
-+ int queued[2]; |
5881 |
-+ int allocated[2]; |
5882 |
-+ int meta_pending; |
5883 |
-+ struct list_head fifo; |
5884 |
-+ |
5885 |
-+ struct bfq_entity entity; |
5886 |
-+ |
5887 |
-+ unsigned long max_budget; |
5888 |
-+ unsigned long budget_timeout; |
5889 |
-+ |
5890 |
-+ int dispatched; |
5891 |
-+ |
5892 |
-+ unsigned short org_ioprio; |
5893 |
-+ |
5894 |
-+ unsigned int flags; |
5895 |
-+ |
5896 |
-+ struct list_head bfqq_list; |
5897 |
-+ |
5898 |
-+ unsigned int seek_samples; |
5899 |
-+ u64 seek_total; |
5900 |
-+ sector_t seek_mean; |
5901 |
-+ sector_t last_request_pos; |
5902 |
-+ |
5903 |
-+ pid_t pid; |
5904 |
-+ |
5905 |
-+ /* weight-raising fields */ |
5906 |
-+ unsigned int raising_cur_max_time; |
5907 |
-+ unsigned long soft_rt_next_start; |
5908 |
-+ u64 last_rais_start_finish; |
5909 |
-+ unsigned int raising_coeff; |
5910 |
-+ u64 last_idle_bklogged; |
5911 |
-+ unsigned long service_from_backlogged; |
5912 |
-+}; |
5913 |
-+ |
5914 |
-+/** |
5915 |
-+ * struct bfq_ttime - per process thinktime stats. |
5916 |
-+ * @ttime_total: total process thinktime |
5917 |
-+ * @ttime_samples: number of thinktime samples |
5918 |
-+ * @ttime_mean: average process thinktime |
5919 |
-+ */ |
5920 |
-+struct bfq_ttime { |
5921 |
-+ unsigned long last_end_request; |
5922 |
-+ |
5923 |
-+ unsigned long ttime_total; |
5924 |
-+ unsigned long ttime_samples; |
5925 |
-+ unsigned long ttime_mean; |
5926 |
-+}; |
5927 |
-+ |
5928 |
-+/** |
5929 |
-+ * struct bfq_io_cq - per (request_queue, io_context) structure. |
5930 |
-+ * @icq: associated io_cq structure |
5931 |
-+ * @bfqq: array of two process queues, the sync and the async |
5932 |
-+ * @ttime: associated @bfq_ttime struct |
5933 |
-+ */ |
5934 |
-+struct bfq_io_cq { |
5935 |
-+ struct io_cq icq; /* must be the first member */ |
5936 |
-+ struct bfq_queue *bfqq[2]; |
5937 |
-+ struct bfq_ttime ttime; |
5938 |
-+ int ioprio; |
5939 |
-+}; |
5940 |
-+ |
5941 |
-+/** |
5942 |
-+ * struct bfq_data - per device data structure. |
5943 |
-+ * @queue: request queue for the managed device. |
5944 |
-+ * @root_group: root bfq_group for the device. |
5945 |
-+ * @rq_pos_tree: rbtree sorted by next_request position, |
5946 |
-+ * used when determining if two or more queues |
5947 |
-+ * have interleaving requests (see bfq_close_cooperator). |
5948 |
-+ * @busy_queues: number of bfq_queues containing requests (including the |
5949 |
-+ * queue under service, even if it is idling). |
5950 |
-+ * @raised_busy_queues: number of weight-raised busy bfq_queues. |
5951 |
-+ * @queued: number of queued requests. |
5952 |
-+ * @rq_in_driver: number of requests dispatched and waiting for completion. |
5953 |
-+ * @sync_flight: number of sync requests in the driver. |
5954 |
-+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples |
5955 |
-+ * completed requests. |
5956 |
-+ * @hw_tag_samples: nr of samples used to calculate hw_tag. |
5957 |
-+ * @hw_tag: flag set to one if the driver is showing a queueing behavior. |
5958 |
-+ * @budgets_assigned: number of budgets assigned. |
5959 |
-+ * @idle_slice_timer: timer set when idling for the next sequential request |
5960 |
-+ * from the queue under service. |
5961 |
-+ * @unplug_work: delayed work to restart dispatching on the request queue. |
5962 |
-+ * @in_service_queue: bfq_queue under service. |
5963 |
-+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. |
5964 |
-+ * @last_position: on-disk position of the last served request. |
5965 |
-+ * @last_budget_start: beginning of the last budget. |
5966 |
-+ * @last_idling_start: beginning of the last idle slice. |
5967 |
-+ * @peak_rate: peak transfer rate observed for a budget. |
5968 |
-+ * @peak_rate_samples: number of samples used to calculate @peak_rate. |
5969 |
-+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. |
5970 |
-+ * @group_list: list of all the bfq_groups active on the device. |
5971 |
-+ * @active_list: list of all the bfq_queues active on the device. |
5972 |
-+ * @idle_list: list of all the bfq_queues idle on the device. |
5973 |
-+ * @bfq_quantum: max number of requests dispatched per dispatch round. |
5974 |
-+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires |
5975 |
-+ * requests are served in fifo order. |
5976 |
-+ * @bfq_back_penalty: weight of backward seeks wrt forward ones. |
5977 |
-+ * @bfq_back_max: maximum allowed backward seek. |
5978 |
-+ * @bfq_slice_idle: maximum idling time. |
5979 |
-+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). |
5980 |
-+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to |
5981 |
-+ * async queues. |
5982 |
-+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to |
5983 |
-+ * prevent seeky queues from imposing long latencies on well |
5984 |
-+ * behaved ones (this also implies that seeky queues cannot |
5985 |
-+ * receive guarantees in the service domain; after a timeout |
5986 |
-+ * they are charged for the whole allocated budget, to try |
5987 |
-+ * to preserve a behavior reasonably fair among them, but |
5988 |
-+ * without service-domain guarantees). |
5989 |
-+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted |
5990 |
-+ * queue is multiplied |
5991 |
-+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) |
5992 |
-+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes |
5993 |
-+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising |
5994 |
-+ * may be reactivated for a queue (in jiffies) |
5995 |
-+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals |
5996 |
-+ * after which weight-raising may be |
5997 |
-+ * reactivated for an already busy queue |
5998 |
-+ * (in jiffies) |
5999 |
-+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, |
6000 |
-+ * sectors per second |
6001 |
-+ * @RT_prod: cached value of the product R*T used for computing the maximum |
6002 |
-+ * duration of the weight raising automatically |
6003 |
-+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions |
6004 |
-+ * |
6005 |
-+ * All the fields are protected by the @queue lock. |
6006 |
-+ */ |
6007 |
-+struct bfq_data { |
6008 |
-+ struct request_queue *queue; |
6009 |
-+ |
6010 |
-+ struct bfq_group *root_group; |
6011 |
-+ |
6012 |
-+ struct rb_root rq_pos_tree; |
6013 |
-+ |
6014 |
-+ int busy_queues; |
6015 |
-+ int raised_busy_queues; |
6016 |
-+ int queued; |
6017 |
-+ int rq_in_driver; |
6018 |
-+ int sync_flight; |
6019 |
-+ |
6020 |
-+ int max_rq_in_driver; |
6021 |
-+ int hw_tag_samples; |
6022 |
-+ int hw_tag; |
6023 |
-+ |
6024 |
-+ int budgets_assigned; |
6025 |
-+ |
6026 |
-+ struct timer_list idle_slice_timer; |
6027 |
-+ struct work_struct unplug_work; |
6028 |
-+ |
6029 |
-+ struct bfq_queue *in_service_queue; |
6030 |
-+ struct bfq_io_cq *in_service_bic; |
6031 |
-+ |
6032 |
-+ sector_t last_position; |
6033 |
-+ |
6034 |
-+ ktime_t last_budget_start; |
6035 |
-+ ktime_t last_idling_start; |
6036 |
-+ int peak_rate_samples; |
6037 |
-+ u64 peak_rate; |
6038 |
-+ unsigned long bfq_max_budget; |
6039 |
-+ |
6040 |
-+ struct hlist_head group_list; |
6041 |
-+ struct list_head active_list; |
6042 |
-+ struct list_head idle_list; |
6043 |
-+ |
6044 |
-+ unsigned int bfq_quantum; |
6045 |
-+ unsigned int bfq_fifo_expire[2]; |
6046 |
-+ unsigned int bfq_back_penalty; |
6047 |
-+ unsigned int bfq_back_max; |
6048 |
-+ unsigned int bfq_slice_idle; |
6049 |
-+ u64 bfq_class_idle_last_service; |
6050 |
-+ |
6051 |
-+ unsigned int bfq_user_max_budget; |
6052 |
-+ unsigned int bfq_max_budget_async_rq; |
6053 |
-+ unsigned int bfq_timeout[2]; |
6054 |
-+ |
6055 |
-+ bool low_latency; |
6056 |
-+ |
6057 |
-+ /* parameters of the low_latency heuristics */ |
6058 |
-+ unsigned int bfq_raising_coeff; |
6059 |
-+ unsigned int bfq_raising_max_time; |
6060 |
-+ unsigned int bfq_raising_rt_max_time; |
6061 |
-+ unsigned int bfq_raising_min_idle_time; |
6062 |
-+ unsigned long bfq_raising_min_inter_arr_async; |
6063 |
-+ unsigned int bfq_raising_max_softrt_rate; |
6064 |
-+ u64 RT_prod; |
6065 |
-+ |
6066 |
-+ struct bfq_queue oom_bfqq; |
6067 |
-+}; |
6068 |
-+ |
6069 |
-+enum bfqq_state_flags { |
6070 |
-+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ |
6071 |
-+ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ |
6072 |
-+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ |
6073 |
-+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ |
6074 |
-+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ |
6075 |
-+ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ |
6076 |
-+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ |
6077 |
-+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ |
6078 |
-+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ |
6079 |
-+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ |
6080 |
-+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */ |
6081 |
-+}; |
6082 |
-+ |
6083 |
-+#define BFQ_BFQQ_FNS(name) \ |
6084 |
-+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ |
6085 |
-+{ \ |
6086 |
-+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ |
6087 |
-+} \ |
6088 |
-+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ |
6089 |
-+{ \ |
6090 |
-+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ |
6091 |
-+} \ |
6092 |
-+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ |
6093 |
-+{ \ |
6094 |
-+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ |
6095 |
-+} |
6096 |
-+ |
6097 |
-+BFQ_BFQQ_FNS(busy); |
6098 |
-+BFQ_BFQQ_FNS(wait_request); |
6099 |
-+BFQ_BFQQ_FNS(must_alloc); |
6100 |
-+BFQ_BFQQ_FNS(fifo_expire); |
6101 |
-+BFQ_BFQQ_FNS(idle_window); |
6102 |
-+BFQ_BFQQ_FNS(prio_changed); |
6103 |
-+BFQ_BFQQ_FNS(sync); |
6104 |
-+BFQ_BFQQ_FNS(budget_new); |
6105 |
-+BFQ_BFQQ_FNS(coop); |
6106 |
-+BFQ_BFQQ_FNS(split_coop); |
6107 |
-+BFQ_BFQQ_FNS(softrt_update); |
6108 |
-+#undef BFQ_BFQQ_FNS |
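
For reference, each BFQ_BFQQ_FNS() instantiation above generates a mark/clear/test triple for one flag; written out by hand for the busy flag, the expansion is simply:

    static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
    {
            bfqq->flags |= (1 << BFQ_BFQQ_FLAG_busy);
    }
    static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
    {
            bfqq->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
    }
    static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
    {
            return (bfqq->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
    }

so code elsewhere in the scheduler tests a queue with bfq_bfqq_busy(bfqq) and flips the bit with bfq_mark_bfqq_busy()/bfq_clear_bfqq_busy().
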
6109 |
-+ |
6110 |
-+/* Logging facilities. */ |
6111 |
-+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ |
6112 |
-+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) |
6113 |
-+ |
6114 |
-+#define bfq_log(bfqd, fmt, args...) \ |
6115 |
-+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) |
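
These logging macros route their messages through blk_add_trace_msg(), so they show up in blktrace/blkparse output rather than in the kernel log. A usage sketch, with the message text invented for illustration:

    /* Per-queue message; the expansion prefixes it with the owning pid. */
    bfq_log_bfqq(bfqd, bfqq, "budget exhausted, dispatched %d", bfqq->dispatched);

    /* Device-wide message with no associated queue. */
    bfq_log(bfqd, "schedule dispatch");
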
6116 |
-+ |
6117 |
-+/* Expiration reasons. */ |
6118 |
-+enum bfqq_expiration { |
6119 |
-+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ |
6120 |
-+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ |
6121 |
-+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ |
6122 |
-+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ |
6123 |
-+}; |
6124 |
-+ |
6125 |
-+#ifdef CONFIG_CGROUP_BFQIO |
6126 |
-+/** |
6127 |
-+ * struct bfq_group - per (device, cgroup) data structure. |
6128 |
-+ * @entity: schedulable entity to insert into the parent group sched_data. |
6129 |
-+ * @sched_data: own sched_data, to contain child entities (they may be |
6130 |
-+ * both bfq_queues and bfq_groups). |
6131 |
-+ * @group_node: node to be inserted into the bfqio_cgroup->group_data |
6132 |
-+ * list of the containing cgroup's bfqio_cgroup. |
6133 |
-+ * @bfqd_node: node to be inserted into the @bfqd->group_list list |
6134 |
-+ * of the groups active on the same device; used for cleanup. |
6135 |
-+ * @bfqd: the bfq_data for the device this group acts upon. |
6136 |
-+ * @async_bfqq: array of async queues for all the tasks belonging to |
6137 |
-+ * the group, one queue per ioprio value per ioprio_class, |
6138 |
-+ * except for the idle class that has only one queue. |
6139 |
-+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). |
6140 |
-+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used |
6141 |
-+ * to avoid too many special cases during group creation/migration. |
6142 |
-+ * |
6143 |
-+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup |
6144 |
-+ * there is a set of bfq_groups, each one collecting the lower-level |
6145 |
-+ * entities belonging to the group that are acting on the same device. |
6146 |
-+ * |
6147 |
-+ * Locking works as follows: |
6148 |
-+ * o @group_node is protected by the bfqio_cgroup lock, and is accessed |
6149 |
-+ * via RCU from its readers. |
6150 |
-+ * o @bfqd is protected by the queue lock, RCU is used to access it |
6151 |
-+ * from the readers. |
6152 |
-+ * o All the other fields are protected by the @bfqd queue lock. |
6153 |
-+ */ |
6154 |
-+struct bfq_group { |
6155 |
-+ struct bfq_entity entity; |
6156 |
-+ struct bfq_sched_data sched_data; |
6157 |
-+ |
6158 |
-+ struct hlist_node group_node; |
6159 |
-+ struct hlist_node bfqd_node; |
6160 |
-+ |
6161 |
-+ void *bfqd; |
6162 |
-+ |
6163 |
-+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
6164 |
-+ struct bfq_queue *async_idle_bfqq; |
6165 |
-+ |
6166 |
-+ struct bfq_entity *my_entity; |
6167 |
-+}; |
6168 |
-+ |
6169 |
-+/** |
6170 |
-+ * struct bfqio_cgroup - bfq cgroup data structure. |
6171 |
-+ * @css: subsystem state for bfq in the containing cgroup. |
6172 |
-+ * @online: flag marked when the subsystem is inserted. |
6173 |
-+ * @weight: cgroup weight. |
6174 |
-+ * @ioprio: cgroup ioprio. |
6175 |
-+ * @ioprio_class: cgroup ioprio_class. |
6176 |
-+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. |
6177 |
-+ * @group_data: list containing the bfq_group belonging to this cgroup. |
6178 |
-+ * |
6179 |
-+ * @group_data is accessed using RCU, with @lock protecting the updates, |
6180 |
-+ * @ioprio and @ioprio_class are protected by @lock. |
6181 |
-+ */ |
6182 |
-+struct bfqio_cgroup { |
6183 |
-+ struct cgroup_subsys_state css; |
6184 |
-+ bool online; |
6185 |
-+ |
6186 |
-+ unsigned short weight, ioprio, ioprio_class; |
6187 |
-+ |
6188 |
-+ spinlock_t lock; |
6189 |
-+ struct hlist_head group_data; |
6190 |
-+}; |
6191 |
-+#else |
6192 |
-+struct bfq_group { |
6193 |
-+ struct bfq_sched_data sched_data; |
6194 |
-+ |
6195 |
-+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
6196 |
-+ struct bfq_queue *async_idle_bfqq; |
6197 |
-+}; |
6198 |
-+#endif |
6199 |
-+ |
6200 |
-+static inline struct bfq_service_tree * |
6201 |
-+bfq_entity_service_tree(struct bfq_entity *entity) |
6202 |
-+{ |
6203 |
-+ struct bfq_sched_data *sched_data = entity->sched_data; |
6204 |
-+ unsigned int idx = entity->ioprio_class - 1; |
6205 |
-+ |
6206 |
-+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); |
6207 |
-+ BUG_ON(sched_data == NULL); |
6208 |
-+ |
6209 |
-+ return sched_data->service_tree + idx; |
6210 |
-+} |
6211 |
-+ |
6212 |
-+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, |
6213 |
-+ int is_sync) |
6214 |
-+{ |
6215 |
-+ return bic->bfqq[!!is_sync]; |
6216 |
-+} |
6217 |
-+ |
6218 |
-+static inline void bic_set_bfqq(struct bfq_io_cq *bic, |
6219 |
-+ struct bfq_queue *bfqq, int is_sync) |
6220 |
-+{ |
6221 |
-+ bic->bfqq[!!is_sync] = bfqq; |
6222 |
-+} |
6223 |
-+ |
6224 |
-+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) |
6225 |
-+{ |
6226 |
-+ return bic->icq.q->elevator->elevator_data; |
6227 |
-+} |
6228 |
-+ |
6229 |
-+/** |
6230 |
-+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. |
6231 |
-+ * @ptr: a pointer to a bfqd. |
6232 |
-+ * @flags: storage for the flags to be saved. |
6233 |
-+ * |
6234 |
-+ * This function allows bfqg->bfqd to be protected by the |
6235 |
-+ * queue lock of the bfqd they reference; the pointer is dereferenced |
6236 |
-+ * under RCU, so the storage for bfqd is assured to be safe as long |
6237 |
-+ * as the RCU read side critical section does not end. After the |
6238 |
-+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be |
6239 |
-+ * sure that no other writer accessed it. If we raced with a writer, |
6240 |
-+ * the function returns NULL, with the queue unlocked, otherwise it |
6241 |
-+ * returns the dereferenced pointer, with the queue locked. |
6242 |
-+ */ |
6243 |
-+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, |
6244 |
-+ unsigned long *flags) |
6245 |
-+{ |
6246 |
-+ struct bfq_data *bfqd; |
6247 |
-+ |
6248 |
-+ rcu_read_lock(); |
6249 |
-+ bfqd = rcu_dereference(*(struct bfq_data **)ptr); |
6250 |
-+ |
6251 |
-+ if (bfqd != NULL) { |
6252 |
-+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); |
6253 |
-+ if (*ptr == bfqd) |
6254 |
-+ goto out; |
6255 |
-+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
6256 |
-+ } |
6257 |
-+ |
6258 |
-+ bfqd = NULL; |
6259 |
-+out: |
6260 |
-+ rcu_read_unlock(); |
6261 |
-+ return bfqd; |
6262 |
-+} |
6263 |
-+ |
6264 |
-+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, |
6265 |
-+ unsigned long *flags) |
6266 |
-+{ |
6267 |
-+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
6268 |
-+} |
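
The two helpers above are meant to be used as a pair: the caller receives the bfq_data with its queue lock held, or NULL if it raced with device teardown, and releases it with bfq_put_bfqd_unlock(). A minimal usage sketch, mirroring how the cgroup code added later in this patch set calls them:

    unsigned long flags;
    struct bfq_data *bfqd;

    bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
    if (bfqd != NULL) {
            /* bfqd->queue->queue_lock is held here; bfqg->bfqd is still valid. */
            bfq_put_async_queues(bfqd, bfqg);
            bfq_put_bfqd_unlock(bfqd, &flags);
    }
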
6269 |
-+ |
6270 |
-+static void bfq_changed_ioprio(struct bfq_io_cq *bic); |
6271 |
-+static void bfq_put_queue(struct bfq_queue *bfqq); |
6272 |
-+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); |
6273 |
-+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
6274 |
-+ struct bfq_group *bfqg, int is_sync, |
6275 |
-+ struct bfq_io_cq *bic, gfp_t gfp_mask); |
6276 |
-+static void bfq_end_raising_async_queues(struct bfq_data *bfqd, |
6277 |
-+ struct bfq_group *bfqg); |
6278 |
-+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); |
6279 |
-+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
6280 |
-+#endif |
6281 |
--- |
6282 |
-1.8.5.2 |
6283 |
- |
6284 |
|
6285 |
Added: genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 |
6286 |
=================================================================== |
6287 |
--- genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 (rev 0) |
6288 |
+++ genpatches-2.6/trunk/3.13/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666) |
6289 |
@@ -0,0 +1,6040 @@ |
6290 |
+From be5107dc591f7ae692ca7cceecbba72e4c174c37 Mon Sep 17 00:00:00 2001 |
6291 |
+From: Paolo Valente <paolo.valente@×××××××.it> |
6292 |
+Date: Thu, 9 May 2013 19:10:02 +0200 |
6293 |
+Subject: [PATCH 2/3] block: introduce the BFQ-v7r1 I/O sched for 3.13 |
6294 |
+ |
6295 |
+Add the BFQ-v7r1 I/O scheduler to 3.13. |
6296 |
+The general structure is borrowed from CFQ, as is much of the code for |
6297 |
+handling I/O contexts. Over time, several useful features have been |
6298 |
+ported from CFQ as well (details in the changelog in README.BFQ). A |
6299 |
+(bfq_)queue is associated to each task doing I/O on a device, and each |
6300 |
+time a scheduling decision has to be made a queue is selected and served |
6301 |
+until it expires. |
6302 |
+ |
6303 |
+ budgets, measured in number of sectors. Once granted the disk, a task |
6304 |
+ budgets, measured in number of sectors. Once got the disk, a task |
6305 |
+ must however consume its assigned budget within a configurable |
6306 |
+ maximum time (by default, the maximum possible value of the |
6307 |
+ budgets is automatically computed to comply with this timeout). |
6308 |
+ This allows the desired latency vs "throughput boosting" tradeoff |
6309 |
+ to be set. |
6310 |
+ |
6311 |
+ - Budgets are scheduled according to a variant of WF2Q+, implemented |
6312 |
+ using an augmented rb-tree to take eligibility into account while |
6313 |
+ preserving an O(log N) overall complexity. |
6314 |
+ |
6315 |
+ - A low-latency tunable is provided; if enabled, both interactive |
6316 |
+ and soft real-time applications are guaranteed a very low latency. |
6317 |
+ |
6318 |
+ - Latency guarantees are preserved also in the presence of NCQ. |
6319 |
+ |
6320 |
+ - Also with flash-based devices, a high throughput is achieved |
6321 |
+ while still preserving latency guarantees. |
6322 |
+ |
6323 |
+ - BFQ features Early Queue Merge (EQM), a sort of fusion of the |
6324 |
+ cooperating-queue-merging and the preemption mechanisms present |
6325 |
+ in CFQ. EQM is in fact a unified mechanism that tries to get a |
6326 |
+ sequential read pattern, and hence a high throughput, with any |
6327 |
+ set of processes performing interleaved I/O over a contiguous |
6328 |
+ sequence of sectors. |
6329 |
+ |
6330 |
+ - BFQ supports full hierarchical scheduling, exporting a cgroups |
6331 |
+ interface. Since each node has a full scheduler, each group can |
6332 |
+ be assigned its own weight. |
6333 |
+ |
6334 |
+ - If the cgroups interface is not used, only I/O priorities can be |
6335 |
+ assigned to processes, with ioprio values mapped to weights |
6336 |
+ with the relation weight = IOPRIO_BE_NR - ioprio. |
6337 |
+ |
6338 |
+ - ioprio classes are served in strict priority order, i.e., lower |
6339 |
+ priority queues are not served as long as there are higher |
6340 |
+ priority queues. Among queues in the same class the bandwidth is |
6341 |
+ distributed in proportion to the weight of each queue. A very |
6342 |
+ thin extra bandwidth is however guaranteed to the Idle class, to |
6343 |
+ prevent it from starving. |
6344 |
+ |
6345 |
+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
6346 |
+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
6347 |
+--- |
6348 |
+ block/bfq-cgroup.c | 911 ++++++++++++++ |
6349 |
+ block/bfq-ioc.c | 36 + |
6350 |
+ block/bfq-iosched.c | 3298 +++++++++++++++++++++++++++++++++++++++++++++++++++ |
6351 |
+ block/bfq-sched.c | 1078 +++++++++++++++++ |
6352 |
+ block/bfq.h | 614 ++++++++++ |
6353 |
+ 5 files changed, 5937 insertions(+) |
6354 |
+ create mode 100644 block/bfq-cgroup.c |
6355 |
+ create mode 100644 block/bfq-ioc.c |
6356 |
+ create mode 100644 block/bfq-iosched.c |
6357 |
+ create mode 100644 block/bfq-sched.c |
6358 |
+ create mode 100644 block/bfq.h |
6359 |
+ |
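
When cgroups are not used, the ioprio-to-weight relation quoted above (weight = IOPRIO_BE_NR - ioprio) is all there is to the weight computation. A sketch of the two directions, matching the bfq_ioprio_to_weight()/bfq_weight_to_ioprio() helpers that the cgroup code below relies on (the exact clamping in the real bfq.h may differ):

    /* ioprio 0 (highest) maps to weight IOPRIO_BE_NR, ioprio 7 to weight 1. */
    static inline unsigned short ioprio_to_weight(int ioprio)
    {
            return IOPRIO_BE_NR - ioprio;
    }

    /* Inverse mapping, clamped so an out-of-range weight yields ioprio 0. */
    static inline unsigned short weight_to_ioprio(int weight)
    {
            return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
    }
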
6360 |
+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c |
6361 |
+new file mode 100644 |
6362 |
+index 0000000..79a288a |
6363 |
+--- /dev/null |
6364 |
++++ b/block/bfq-cgroup.c |
6365 |
+@@ -0,0 +1,911 @@ |
6366 |
++/* |
6367 |
++ * BFQ: CGROUPS support. |
6368 |
++ * |
6369 |
++ * Based on ideas and code from CFQ: |
6370 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
6371 |
++ * |
6372 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
6373 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
6374 |
++ * |
6375 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
6376 |
++ * |
6377 |
++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. |
6378 |
++ */ |
6379 |
++ |
6380 |
++#ifdef CONFIG_CGROUP_BFQIO |
6381 |
++ |
6382 |
++static DEFINE_MUTEX(bfqio_mutex); |
6383 |
++ |
6384 |
++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp) |
6385 |
++{ |
6386 |
++ return bgrp ? !bgrp->online : false; |
6387 |
++} |
6388 |
++ |
6389 |
++static struct bfqio_cgroup bfqio_root_cgroup = { |
6390 |
++ .weight = BFQ_DEFAULT_GRP_WEIGHT, |
6391 |
++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO, |
6392 |
++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS, |
6393 |
++}; |
6394 |
++ |
6395 |
++static inline void bfq_init_entity(struct bfq_entity *entity, |
6396 |
++ struct bfq_group *bfqg) |
6397 |
++{ |
6398 |
++ entity->weight = entity->new_weight; |
6399 |
++ entity->orig_weight = entity->new_weight; |
6400 |
++ entity->ioprio = entity->new_ioprio; |
6401 |
++ entity->ioprio_class = entity->new_ioprio_class; |
6402 |
++ entity->parent = bfqg->my_entity; |
6403 |
++ entity->sched_data = &bfqg->sched_data; |
6404 |
++} |
6405 |
++ |
6406 |
++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css) |
6407 |
++{ |
6408 |
++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL; |
6409 |
++} |
6410 |
++ |
6411 |
++/* |
6412 |
++ * Search the bfq_group for bfqd in the hash table (for now only a list) |
6413 |
++ * of bgrp. Must be called under rcu_read_lock(). |
6414 |
++ */ |
6415 |
++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, |
6416 |
++ struct bfq_data *bfqd) |
6417 |
++{ |
6418 |
++ struct bfq_group *bfqg; |
6419 |
++ void *key; |
6420 |
++ |
6421 |
++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) { |
6422 |
++ key = rcu_dereference(bfqg->bfqd); |
6423 |
++ if (key == bfqd) |
6424 |
++ return bfqg; |
6425 |
++ } |
6426 |
++ |
6427 |
++ return NULL; |
6428 |
++} |
6429 |
++ |
6430 |
++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, |
6431 |
++ struct bfq_group *bfqg) |
6432 |
++{ |
6433 |
++ struct bfq_entity *entity = &bfqg->entity; |
6434 |
++ |
6435 |
++ /* |
6436 |
++ * If the weight of the entity has never been set via the sysfs |
6437 |
++ * interface, then bgrp->weight == 0. In this case we initialize |
6438 |
++ * the weight from the current ioprio value. Otherwise, the group |
6439 |
++ * weight, if set, has priority over the ioprio value. |
6440 |
++ */ |
6441 |
++ if (bgrp->weight == 0) { |
6442 |
++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); |
6443 |
++ entity->new_ioprio = bgrp->ioprio; |
6444 |
++ } else { |
6445 |
++ entity->new_weight = bgrp->weight; |
6446 |
++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); |
6447 |
++ } |
6448 |
++ entity->orig_weight = entity->weight = entity->new_weight; |
6449 |
++ entity->ioprio = entity->new_ioprio; |
6450 |
++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; |
6451 |
++ entity->my_sched_data = &bfqg->sched_data; |
6452 |
++} |
6453 |
++ |
6454 |
++static inline void bfq_group_set_parent(struct bfq_group *bfqg, |
6455 |
++ struct bfq_group *parent) |
6456 |
++{ |
6457 |
++ struct bfq_entity *entity; |
6458 |
++ |
6459 |
++ BUG_ON(parent == NULL); |
6460 |
++ BUG_ON(bfqg == NULL); |
6461 |
++ |
6462 |
++ entity = &bfqg->entity; |
6463 |
++ entity->parent = parent->my_entity; |
6464 |
++ entity->sched_data = &parent->sched_data; |
6465 |
++} |
6466 |
++ |
6467 |
++/** |
6468 |
++ * bfq_group_chain_alloc - allocate a chain of groups. |
6469 |
++ * @bfqd: queue descriptor. |
6470 |
++ * @css: the leaf cgroup_subsys_state this chain starts from. |
6471 |
++ * |
6472 |
++ * Allocate a chain of groups starting from the one belonging to |
6473 |
++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain |
6474 |
++ * to the root already has an allocated group on @bfqd. |
6475 |
++ */ |
6476 |
++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, |
6477 |
++ struct cgroup_subsys_state *css) |
6478 |
++{ |
6479 |
++ struct bfqio_cgroup *bgrp; |
6480 |
++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; |
6481 |
++ |
6482 |
++ for (; css != NULL; css = css->parent) { |
6483 |
++ bgrp = css_to_bfqio(css); |
6484 |
++ |
6485 |
++ bfqg = bfqio_lookup_group(bgrp, bfqd); |
6486 |
++ if (bfqg != NULL) { |
6487 |
++ /* |
6488 |
++ * All the cgroups in the path from there to the |
6489 |
++ * root must have a bfq_group for bfqd, so we don't |
6490 |
++ * need any more allocations. |
6491 |
++ */ |
6492 |
++ break; |
6493 |
++ } |
6494 |
++ |
6495 |
++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); |
6496 |
++ if (bfqg == NULL) |
6497 |
++ goto cleanup; |
6498 |
++ |
6499 |
++ bfq_group_init_entity(bgrp, bfqg); |
6500 |
++ bfqg->my_entity = &bfqg->entity; |
6501 |
++ |
6502 |
++ if (leaf == NULL) { |
6503 |
++ leaf = bfqg; |
6504 |
++ prev = leaf; |
6505 |
++ } else { |
6506 |
++ bfq_group_set_parent(prev, bfqg); |
6507 |
++ /* |
6508 |
++ * Build a list of allocated nodes using the bfqd |
6509 |
++ * field, which is still unused and will be initialized |
6510 |
++ * only after the node is connected. |
6511 |
++ */ |
6512 |
++ prev->bfqd = bfqg; |
6513 |
++ prev = bfqg; |
6514 |
++ } |
6515 |
++ } |
6516 |
++ |
6517 |
++ return leaf; |
6518 |
++ |
6519 |
++cleanup: |
6520 |
++ while (leaf != NULL) { |
6521 |
++ prev = leaf; |
6522 |
++ leaf = leaf->bfqd; |
6523 |
++ kfree(prev); |
6524 |
++ } |
6525 |
++ |
6526 |
++ return NULL; |
6527 |
++} |
6528 |
++ |
6529 |
++/** |
6530 |
++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy. |
6531 |
++ * @bfqd: the queue descriptor. |
6532 |
++ * @css: the leaf cgroup_subsys_state to start from. |
6533 |
++ * @leaf: the leaf group (to be associated to @cgroup). |
6534 |
++ * |
6535 |
++ * Try to link a chain of groups to a cgroup hierarchy, connecting the |
6536 |
++ * nodes bottom-up, so we can be sure that when we find a cgroup in the |
6537 |
++ * hierarchy that already as a group associated to @bfqd all the nodes |
6538 |
++ * in the path to the root cgroup have one too. |
6539 |
++ * |
6540 |
++ * On locking: the queue lock protects the hierarchy (there is a hierarchy |
6541 |
++ * per device) while the bfqio_cgroup lock protects the list of groups |
6542 |
++ * belonging to the same cgroup. |
6543 |
++ */ |
6544 |
++static void bfq_group_chain_link(struct bfq_data *bfqd, |
6545 |
++ struct cgroup_subsys_state *css, |
6546 |
++ struct bfq_group *leaf) |
6547 |
++{ |
6548 |
++ struct bfqio_cgroup *bgrp; |
6549 |
++ struct bfq_group *bfqg, *next, *prev = NULL; |
6550 |
++ unsigned long flags; |
6551 |
++ |
6552 |
++ assert_spin_locked(bfqd->queue->queue_lock); |
6553 |
++ |
6554 |
++ for (; css != NULL && leaf != NULL; css = css->parent) { |
6555 |
++ bgrp = css_to_bfqio(css); |
6556 |
++ next = leaf->bfqd; |
6557 |
++ |
6558 |
++ bfqg = bfqio_lookup_group(bgrp, bfqd); |
6559 |
++ BUG_ON(bfqg != NULL); |
6560 |
++ |
6561 |
++ spin_lock_irqsave(&bgrp->lock, flags); |
6562 |
++ |
6563 |
++ rcu_assign_pointer(leaf->bfqd, bfqd); |
6564 |
++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); |
6565 |
++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); |
6566 |
++ |
6567 |
++ spin_unlock_irqrestore(&bgrp->lock, flags); |
6568 |
++ |
6569 |
++ prev = leaf; |
6570 |
++ leaf = next; |
6571 |
++ } |
6572 |
++ |
6573 |
++ BUG_ON(css == NULL && leaf != NULL); |
6574 |
++ if (css != NULL && prev != NULL) { |
6575 |
++ bgrp = css_to_bfqio(css); |
6576 |
++ bfqg = bfqio_lookup_group(bgrp, bfqd); |
6577 |
++ bfq_group_set_parent(prev, bfqg); |
6578 |
++ } |
6579 |
++} |
6580 |
++ |
6581 |
++/** |
6582 |
++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. |
6583 |
++ * @bfqd: queue descriptor. |
6584 |
++ * @cgroup: cgroup being searched for. |
6585 |
++ * |
6586 |
++ * Return a group associated to @bfqd in @cgroup, allocating one if |
6587 |
++ * necessary. When a group is returned all the cgroups in the path |
6588 |
++ * to the root have a group associated to @bfqd. |
6589 |
++ * |
6590 |
++ * If the allocation fails, return the root group: this breaks guarantees |
6591 |
++ * but is a safe fallback. If this loss becomes a problem it can be |
6592 |
++ * mitigated using the equivalent weight (given by the product of the |
6593 |
++ * weights of the groups in the path from @group to the root) in the |
6594 |
++ * root scheduler. |
6595 |
++ * |
6596 |
++ * We allocate all the missing nodes in the path from the leaf cgroup |
6597 |
++ * to the root and we connect the nodes only after all the allocations |
6598 |
++ * have been successful. |
6599 |
++ */ |
6600 |
++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
6601 |
++ struct cgroup_subsys_state *css) |
6602 |
++{ |
6603 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
6604 |
++ struct bfq_group *bfqg; |
6605 |
++ |
6606 |
++ bfqg = bfqio_lookup_group(bgrp, bfqd); |
6607 |
++ if (bfqg != NULL) |
6608 |
++ return bfqg; |
6609 |
++ |
6610 |
++ bfqg = bfq_group_chain_alloc(bfqd, css); |
6611 |
++ if (bfqg != NULL) |
6612 |
++ bfq_group_chain_link(bfqd, css, bfqg); |
6613 |
++ else |
6614 |
++ bfqg = bfqd->root_group; |
6615 |
++ |
6616 |
++ return bfqg; |
6617 |
++} |
6618 |
++ |
6619 |
++/** |
6620 |
++ * bfq_bfqq_move - migrate @bfqq to @bfqg. |
6621 |
++ * @bfqd: queue descriptor. |
6622 |
++ * @bfqq: the queue to move. |
6623 |
++ * @entity: @bfqq's entity. |
6624 |
++ * @bfqg: the group to move to. |
6625 |
++ * |
6626 |
++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating |
6627 |
++ * it on the new one. Avoid putting the entity on the old group idle tree. |
6628 |
++ * |
6629 |
++ * Must be called under the queue lock; the cgroup owning @bfqg must |
6630 |
++ * not disappear (by now this just means that we are called under |
6631 |
++ * rcu_read_lock()). |
6632 |
++ */ |
6633 |
++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
6634 |
++ struct bfq_entity *entity, struct bfq_group *bfqg) |
6635 |
++{ |
6636 |
++ int busy, resume; |
6637 |
++ |
6638 |
++ busy = bfq_bfqq_busy(bfqq); |
6639 |
++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); |
6640 |
++ |
6641 |
++ BUG_ON(resume && !entity->on_st); |
6642 |
++ BUG_ON(busy && !resume && entity->on_st && |
6643 |
++ bfqq != bfqd->in_service_queue); |
6644 |
++ |
6645 |
++ if (busy) { |
6646 |
++ BUG_ON(atomic_read(&bfqq->ref) < 2); |
6647 |
++ |
6648 |
++ if (!resume) |
6649 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 0); |
6650 |
++ else |
6651 |
++ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
6652 |
++ } else if (entity->on_st) |
6653 |
++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); |
6654 |
++ |
6655 |
++ /* |
6656 |
++ * Here we use a reference to bfqg. We don't need a refcounter |
6657 |
++ * as the cgroup reference will not be dropped, so that its |
6658 |
++ * destroy() callback will not be invoked. |
6659 |
++ */ |
6660 |
++ entity->parent = bfqg->my_entity; |
6661 |
++ entity->sched_data = &bfqg->sched_data; |
6662 |
++ |
6663 |
++ if (busy && resume) |
6664 |
++ bfq_activate_bfqq(bfqd, bfqq); |
6665 |
++ |
6666 |
++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver) |
6667 |
++ bfq_schedule_dispatch(bfqd); |
6668 |
++} |
6669 |
++ |
6670 |
++/** |
6671 |
++ * __bfq_bic_change_cgroup - move @bic to @cgroup. |
6672 |
++ * @bfqd: the queue descriptor. |
6673 |
++ * @bic: the bic to move. |
6674 |
++ * @cgroup: the cgroup to move to. |
6675 |
++ * |
6676 |
++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller |
6677 |
++ * has to make sure that the reference to cgroup is valid across the call. |
6678 |
++ * |
6679 |
++ * NOTE: an alternative approach might have been to store the current |
6680 |
++ * cgroup in bfqq and get a reference to it, reducing the lookup |
6681 |
++ * time here, at the price of slightly more complex code. |
6682 |
++ */ |
6683 |
++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, |
6684 |
++ struct bfq_io_cq *bic, |
6685 |
++ struct cgroup_subsys_state *css) |
6686 |
++{ |
6687 |
++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); |
6688 |
++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); |
6689 |
++ struct bfq_entity *entity; |
6690 |
++ struct bfq_group *bfqg; |
6691 |
++ struct bfqio_cgroup *bgrp; |
6692 |
++ |
6693 |
++ bgrp = css_to_bfqio(css); |
6694 |
++ |
6695 |
++ bfqg = bfq_find_alloc_group(bfqd, css); |
6696 |
++ if (async_bfqq != NULL) { |
6697 |
++ entity = &async_bfqq->entity; |
6698 |
++ |
6699 |
++ if (entity->sched_data != &bfqg->sched_data) { |
6700 |
++ bic_set_bfqq(bic, NULL, 0); |
6701 |
++ bfq_log_bfqq(bfqd, async_bfqq, |
6702 |
++ "bic_change_group: %p %d", |
6703 |
++ async_bfqq, atomic_read(&async_bfqq->ref)); |
6704 |
++ bfq_put_queue(async_bfqq); |
6705 |
++ } |
6706 |
++ } |
6707 |
++ |
6708 |
++ if (sync_bfqq != NULL) { |
6709 |
++ entity = &sync_bfqq->entity; |
6710 |
++ if (entity->sched_data != &bfqg->sched_data) |
6711 |
++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); |
6712 |
++ } |
6713 |
++ |
6714 |
++ return bfqg; |
6715 |
++} |
6716 |
++ |
6717 |
++/** |
6718 |
++ * bfq_bic_change_cgroup - move @bic to @cgroup. |
6719 |
++ * @bic: the bic being migrated. |
6720 |
++ * @cgroup: the destination cgroup. |
6721 |
++ * |
6722 |
++ * When the task owning @bic is moved to @cgroup, @bic is immediately |
6723 |
++ * moved into its new parent group. |
6724 |
++ */ |
6725 |
++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, |
6726 |
++ struct cgroup_subsys_state *css) |
6727 |
++{ |
6728 |
++ struct bfq_data *bfqd; |
6729 |
++ unsigned long uninitialized_var(flags); |
6730 |
++ |
6731 |
++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), |
6732 |
++ &flags); |
6733 |
++ if (bfqd != NULL) { |
6734 |
++ __bfq_bic_change_cgroup(bfqd, bic, css); |
6735 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
6736 |
++ } |
6737 |
++} |
6738 |
++ |
6739 |
++/** |
6740 |
++ * bfq_bic_update_cgroup - update the cgroup of @bic. |
6741 |
++ * @bic: the @bic to update. |
6742 |
++ * |
6743 |
++ * Make sure that @bic is enqueued in the cgroup of the current task. |
6744 |
++ * We need this in addition to moving bics during the cgroup attach |
6745 |
++ * phase because the task owning @bic could be at its first disk |
6746 |
++ * access or we may end up in the root cgroup as the result of a |
6747 |
++ * memory allocation failure, and here we try to move to the right |
6748 |
++ * group. |
6749 |
++ * |
6750 |
++ * Must be called under the queue lock. It is safe to use the returned |
6751 |
++ * value even after the rcu_read_unlock() as the migration/destruction |
6752 |
++ * paths act under the queue lock too. IOW it is impossible to race with |
6753 |
++ * group migration/destruction and end up with an invalid group as: |
6754 |
++ * a) here cgroup has not yet been destroyed, nor its destroy callback |
6755 |
++ * has started execution, as current holds a reference to it, |
6756 |
++ * b) if it is destroyed after rcu_read_unlock() [after current is |
6757 |
++ * migrated to a different cgroup] its attach() callback will have |
6758 |
++ * taken care of removing all the references to the old cgroup data. |
6759 |
++ */ |
6760 |
++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) |
6761 |
++{ |
6762 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
6763 |
++ struct bfq_group *bfqg; |
6764 |
++ struct cgroup_subsys_state *css; |
6765 |
++ |
6766 |
++ BUG_ON(bfqd == NULL); |
6767 |
++ |
6768 |
++ rcu_read_lock(); |
6769 |
++ css = task_css(current, bfqio_subsys_id); |
6770 |
++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css); |
6771 |
++ rcu_read_unlock(); |
6772 |
++ |
6773 |
++ return bfqg; |
6774 |
++} |
6775 |
++ |
6776 |
++/** |
6777 |
++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. |
6778 |
++ * @st: the service tree being flushed. |
6779 |
++ */ |
6780 |
++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) |
6781 |
++{ |
6782 |
++ struct bfq_entity *entity = st->first_idle; |
6783 |
++ |
6784 |
++ for (; entity != NULL; entity = st->first_idle) |
6785 |
++ __bfq_deactivate_entity(entity, 0); |
6786 |
++} |
6787 |
++ |
6788 |
++/** |
6789 |
++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. |
6790 |
++ * @bfqd: the device data structure with the root group. |
6791 |
++ * @entity: the entity to move. |
6792 |
++ */ |
6793 |
++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, |
6794 |
++ struct bfq_entity *entity) |
6795 |
++{ |
6796 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
6797 |
++ |
6798 |
++ BUG_ON(bfqq == NULL); |
6799 |
++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); |
6800 |
++ return; |
6801 |
++} |
6802 |
++ |
6803 |
++/** |
6804 |
++ * bfq_reparent_active_entities - move to the root group all active entities. |
6805 |
++ * @bfqd: the device data structure with the root group. |
6806 |
++ * @bfqg: the group to move from. |
6807 |
++ * @st: the service tree with the entities. |
6808 |
++ * |
6809 |
++ * Needs queue_lock to be taken and reference to be valid over the call. |
6810 |
++ */ |
6811 |
++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, |
6812 |
++ struct bfq_group *bfqg, |
6813 |
++ struct bfq_service_tree *st) |
6814 |
++{ |
6815 |
++ struct rb_root *active = &st->active; |
6816 |
++ struct bfq_entity *entity = NULL; |
6817 |
++ |
6818 |
++ if (!RB_EMPTY_ROOT(&st->active)) |
6819 |
++ entity = bfq_entity_of(rb_first(active)); |
6820 |
++ |
6821 |
++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) |
6822 |
++ bfq_reparent_leaf_entity(bfqd, entity); |
6823 |
++ |
6824 |
++ if (bfqg->sched_data.in_service_entity != NULL) |
6825 |
++ bfq_reparent_leaf_entity(bfqd, |
6826 |
++ bfqg->sched_data.in_service_entity); |
6827 |
++ |
6828 |
++ return; |
6829 |
++} |
6830 |
++ |
6831 |
++/** |
6832 |
++ * bfq_destroy_group - destroy @bfqg. |
6833 |
++ * @bgrp: the bfqio_cgroup containing @bfqg. |
6834 |
++ * @bfqg: the group being destroyed. |
6835 |
++ * |
6836 |
++ * Destroy @bfqg, making sure that it is not referenced from its parent. |
6837 |
++ */ |
6838 |
++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) |
6839 |
++{ |
6840 |
++ struct bfq_data *bfqd; |
6841 |
++ struct bfq_service_tree *st; |
6842 |
++ struct bfq_entity *entity = bfqg->my_entity; |
6843 |
++ unsigned long uninitialized_var(flags); |
6844 |
++ int i; |
6845 |
++ |
6846 |
++ hlist_del(&bfqg->group_node); |
6847 |
++ |
6848 |
++ /* |
6849 |
++ * Empty all service_trees belonging to this group before deactivating |
6850 |
++ * the group itself. |
6851 |
++ */ |
6852 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { |
6853 |
++ st = bfqg->sched_data.service_tree + i; |
6854 |
++ |
6855 |
++ /* |
6856 |
++ * The idle tree may still contain bfq_queues belonging |
6857 |
++ * to exited tasks because they never migrated to a different |
6858 |
++ * cgroup from the one being destroyed now. No one else |
6859 |
++ * can access them so it's safe to act without any lock. |
6860 |
++ */ |
6861 |
++ bfq_flush_idle_tree(st); |
6862 |
++ |
6863 |
++ /* |
6864 |
++ * It may happen that some queues are still active |
6865 |
++ * (busy) upon group destruction (if the corresponding |
6866 |
++ * processes have been forced to terminate). We move |
6867 |
++ * all the leaf entities corresponding to these queues |
6868 |
++ * to the root_group. |
6869 |
++ * Also, it may happen that the group has an entity |
6870 |
++ * under service, which is disconnected from the active |
6871 |
++ * tree: it must be moved, too. |
6872 |
++ * There is no need to put the sync queues, as the |
6873 |
++ * scheduler has taken no reference. |
6874 |
++ */ |
6875 |
++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); |
6876 |
++ if (bfqd != NULL) { |
6877 |
++ bfq_reparent_active_entities(bfqd, bfqg, st); |
6878 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
6879 |
++ } |
6880 |
++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); |
6881 |
++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); |
6882 |
++ } |
6883 |
++ BUG_ON(bfqg->sched_data.next_in_service != NULL); |
6884 |
++ BUG_ON(bfqg->sched_data.in_service_entity != NULL); |
6885 |
++ |
6886 |
++ /* |
6887 |
++ * We may race with device destruction, take extra care when |
6888 |
++ * dereferencing bfqg->bfqd. |
6889 |
++ */ |
6890 |
++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); |
6891 |
++ if (bfqd != NULL) { |
6892 |
++ hlist_del(&bfqg->bfqd_node); |
6893 |
++ __bfq_deactivate_entity(entity, 0); |
6894 |
++ bfq_put_async_queues(bfqd, bfqg); |
6895 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
6896 |
++ } |
6897 |
++ BUG_ON(entity->tree != NULL); |
6898 |
++ |
6899 |
++ /* |
6900 |
++ * No need to defer the kfree() to the end of the RCU grace |
6901 |
++ * period: we are called from the destroy() callback of our |
6902 |
++ * cgroup, so we can be sure that no one is a) still using |
6903 |
++ * this cgroup or b) doing lookups in it. |
6904 |
++ */ |
6905 |
++ kfree(bfqg); |
6906 |
++} |
6907 |
++ |
6908 |
++static void bfq_end_raising_async(struct bfq_data *bfqd) |
6909 |
++{ |
6910 |
++ struct hlist_node *tmp; |
6911 |
++ struct bfq_group *bfqg; |
6912 |
++ |
6913 |
++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) |
6914 |
++ bfq_end_raising_async_queues(bfqd, bfqg); |
6915 |
++ bfq_end_raising_async_queues(bfqd, bfqd->root_group); |
6916 |
++} |
6917 |
++ |
6918 |
++/** |
6919 |
++ * bfq_disconnect_groups - disconnect @bfqd from all its groups. |
6920 |
++ * @bfqd: the device descriptor being exited. |
6921 |
++ * |
6922 |
++ * When the device exits we just make sure that no lookup can return |
6923 |
++ * the now unused group structures. They will be deallocated on cgroup |
6924 |
++ * destruction. |
6925 |
++ */ |
6926 |
++static void bfq_disconnect_groups(struct bfq_data *bfqd) |
6927 |
++{ |
6928 |
++ struct hlist_node *tmp; |
6929 |
++ struct bfq_group *bfqg; |
6930 |
++ |
6931 |
++ bfq_log(bfqd, "disconnect_groups beginning"); |
6932 |
++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) { |
6933 |
++ hlist_del(&bfqg->bfqd_node); |
6934 |
++ |
6935 |
++ __bfq_deactivate_entity(bfqg->my_entity, 0); |
6936 |
++ |
6937 |
++ /* |
6938 |
++ * Don't remove from the group hash, just set an |
6939 |
++ * invalid key. No lookups can race with the |
6940 |
++ * assignment as bfqd is being destroyed; this |
6941 |
++ * implies also that new elements cannot be added |
6942 |
++ * to the list. |
6943 |
++ */ |
6944 |
++ rcu_assign_pointer(bfqg->bfqd, NULL); |
6945 |
++ |
6946 |
++ bfq_log(bfqd, "disconnect_groups: put async for group %p", |
6947 |
++ bfqg); |
6948 |
++ bfq_put_async_queues(bfqd, bfqg); |
6949 |
++ } |
6950 |
++} |
6951 |
++ |
6952 |
++static inline void bfq_free_root_group(struct bfq_data *bfqd) |
6953 |
++{ |
6954 |
++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; |
6955 |
++ struct bfq_group *bfqg = bfqd->root_group; |
6956 |
++ |
6957 |
++ bfq_put_async_queues(bfqd, bfqg); |
6958 |
++ |
6959 |
++ spin_lock_irq(&bgrp->lock); |
6960 |
++ hlist_del_rcu(&bfqg->group_node); |
6961 |
++ spin_unlock_irq(&bgrp->lock); |
6962 |
++ |
6963 |
++ /* |
6964 |
++ * No need to synchronize_rcu() here: since the device is gone |
6965 |
++ * there cannot be any read-side access to its root_group. |
6966 |
++ */ |
6967 |
++ kfree(bfqg); |
6968 |
++} |
6969 |
++ |
6970 |
++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) |
6971 |
++{ |
6972 |
++ struct bfq_group *bfqg; |
6973 |
++ struct bfqio_cgroup *bgrp; |
6974 |
++ int i; |
6975 |
++ |
6976 |
++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); |
6977 |
++ if (bfqg == NULL) |
6978 |
++ return NULL; |
6979 |
++ |
6980 |
++ bfqg->entity.parent = NULL; |
6981 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
6982 |
++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
6983 |
++ |
6984 |
++ bgrp = &bfqio_root_cgroup; |
6985 |
++ spin_lock_irq(&bgrp->lock); |
6986 |
++ rcu_assign_pointer(bfqg->bfqd, bfqd); |
6987 |
++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); |
6988 |
++ spin_unlock_irq(&bgrp->lock); |
6989 |
++ |
6990 |
++ return bfqg; |
6991 |
++} |
6992 |
++ |
6993 |
++#define SHOW_FUNCTION(__VAR) \ |
6994 |
++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \ |
6995 |
++ struct cftype *cftype) \ |
6996 |
++{ \ |
6997 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ |
6998 |
++ u64 ret = -ENODEV; \ |
6999 |
++ \ |
7000 |
++ mutex_lock(&bfqio_mutex); \ |
7001 |
++ if (bfqio_is_removed(bgrp)) \ |
7002 |
++ goto out_unlock; \ |
7003 |
++ \ |
7004 |
++ spin_lock_irq(&bgrp->lock); \ |
7005 |
++ ret = bgrp->__VAR; \ |
7006 |
++ spin_unlock_irq(&bgrp->lock); \ |
7007 |
++ \ |
7008 |
++out_unlock: \ |
7009 |
++ mutex_unlock(&bfqio_mutex); \ |
7010 |
++ return ret; \ |
7011 |
++} |
7012 |
++ |
7013 |
++SHOW_FUNCTION(weight); |
7014 |
++SHOW_FUNCTION(ioprio); |
7015 |
++SHOW_FUNCTION(ioprio_class); |
7016 |
++#undef SHOW_FUNCTION |
7017 |
++ |
7018 |
++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ |
7019 |
++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\ |
7020 |
++ struct cftype *cftype, \ |
7021 |
++ u64 val) \ |
7022 |
++{ \ |
7023 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ |
7024 |
++ struct bfq_group *bfqg; \ |
7025 |
++ int ret = -EINVAL; \ |
7026 |
++ \ |
7027 |
++ if (val < (__MIN) || val > (__MAX)) \ |
7028 |
++ return ret; \ |
7029 |
++ \ |
7030 |
++ ret = -ENODEV; \ |
7031 |
++ mutex_lock(&bfqio_mutex); \ |
7032 |
++ if (bfqio_is_removed(bgrp)) \ |
7033 |
++ goto out_unlock; \ |
7034 |
++ ret = 0; \ |
7035 |
++ \ |
7036 |
++ spin_lock_irq(&bgrp->lock); \ |
7037 |
++ bgrp->__VAR = (unsigned short)val; \ |
7038 |
++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ |
7039 |
++ /* \ |
7040 |
++ * Setting the ioprio_changed flag of the entity \ |
7041 |
++ * to 1 with new_##__VAR == ##__VAR would re-set \ |
7042 |
++ * the value of the weight to its ioprio mapping. \ |
7043 |
++ * Set the flag only if necessary. \ |
7044 |
++ */ \ |
7045 |
++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ |
7046 |
++ bfqg->entity.new_##__VAR = (unsigned short)val; \ |
7047 |
++ smp_wmb(); \ |
7048 |
++ bfqg->entity.ioprio_changed = 1; \ |
7049 |
++ } \ |
7050 |
++ } \ |
7051 |
++ spin_unlock_irq(&bgrp->lock); \ |
7052 |
++ \ |
7053 |
++out_unlock: \ |
7054 |
++ mutex_unlock(&bfqio_mutex); \ |
7055 |
++ return ret; \ |
7056 |
++} |
7057 |
++ |
7058 |
++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); |
7059 |
++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); |
7060 |
++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); |
7061 |
++#undef STORE_FUNCTION |
7062 |
++ |
7063 |
++static struct cftype bfqio_files[] = { |
7064 |
++ { |
7065 |
++ .name = "weight", |
7066 |
++ .read_u64 = bfqio_cgroup_weight_read, |
7067 |
++ .write_u64 = bfqio_cgroup_weight_write, |
7068 |
++ }, |
7069 |
++ { |
7070 |
++ .name = "ioprio", |
7071 |
++ .read_u64 = bfqio_cgroup_ioprio_read, |
7072 |
++ .write_u64 = bfqio_cgroup_ioprio_write, |
7073 |
++ }, |
7074 |
++ { |
7075 |
++ .name = "ioprio_class", |
7076 |
++ .read_u64 = bfqio_cgroup_ioprio_class_read, |
7077 |
++ .write_u64 = bfqio_cgroup_ioprio_class_write, |
7078 |
++ }, |
7079 |
++ { }, /* terminate */ |
7080 |
++}; |
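
Once the bfqio controller is mounted, the three files declared above appear in every cgroup directory as bfqio.weight, bfqio.ioprio and bfqio.ioprio_class, and accept plain integers. A userspace sketch; the mount point and group name are assumptions for illustration, not anything mandated by the patch:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical path: controller mounted at /sys/fs/cgroup/bfqio,
             * group "media" already created by the administrator. */
            FILE *f = fopen("/sys/fs/cgroup/bfqio/media/bfqio.weight", "w");

            if (f == NULL)
                    return 1;
            /* The value must lie within BFQ_MIN_WEIGHT..BFQ_MAX_WEIGHT. */
            fprintf(f, "500\n");
            return fclose(f) != 0;
    }
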
7081 |
++ |
7082 |
++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state |
7083 |
++ *parent_css) |
7084 |
++{ |
7085 |
++ struct bfqio_cgroup *bgrp; |
7086 |
++ |
7087 |
++ if (parent_css != NULL) { |
7088 |
++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); |
7089 |
++ if (bgrp == NULL) |
7090 |
++ return ERR_PTR(-ENOMEM); |
7091 |
++ } else |
7092 |
++ bgrp = &bfqio_root_cgroup; |
7093 |
++ |
7094 |
++ spin_lock_init(&bgrp->lock); |
7095 |
++ INIT_HLIST_HEAD(&bgrp->group_data); |
7096 |
++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; |
7097 |
++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; |
7098 |
++ |
7099 |
++ return &bgrp->css; |
7100 |
++} |
7101 |
++ |
7102 |
++/* |
7103 |
++ * We cannot support shared io contexts, as we have no means to support |
7104 |
++ * two tasks with the same ioc in two different groups without major rework |
7105 |
++ * of the main bic/bfqq data structures. For now we allow a task to change |
7106 |
++ * its cgroup only if it's the only owner of its ioc; the drawback of this |
7107 |
++ * behavior is that a group containing a task that forked using CLONE_IO |
7108 |
++ * will not be destroyed until the tasks sharing the ioc die. |
7109 |
++ */ |
7110 |
++static int bfqio_can_attach(struct cgroup_subsys_state *css, |
7111 |
++ struct cgroup_taskset *tset) |
7112 |
++{ |
7113 |
++ struct task_struct *task; |
7114 |
++ struct io_context *ioc; |
7115 |
++ int ret = 0; |
7116 |
++ |
7117 |
++ cgroup_taskset_for_each(task, css, tset) { |
7118 |
++ /* |
7119 |
++ * task_lock() is needed to avoid races with |
7120 |
++ * exit_io_context() |
7121 |
++ */ |
7122 |
++ task_lock(task); |
7123 |
++ ioc = task->io_context; |
7124 |
++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) |
7125 |
++ /* |
7126 |
++ * ioc == NULL means that the task is either too young |
7127 |
++			 * or exiting: if it still has no ioc the ioc can't be |
7128 |
++ * shared, if the task is exiting the attach will fail |
7129 |
++ * anyway, no matter what we return here. |
7130 |
++ */ |
7131 |
++ ret = -EINVAL; |
7132 |
++ task_unlock(task); |
7133 |
++ if (ret) |
7134 |
++ break; |
7135 |
++ } |
7136 |
++ |
7137 |
++ return ret; |
7138 |
++} |
7139 |
++ |
7140 |
++static void bfqio_attach(struct cgroup_subsys_state *css, |
7141 |
++ struct cgroup_taskset *tset) |
7142 |
++{ |
7143 |
++ struct task_struct *task; |
7144 |
++ struct io_context *ioc; |
7145 |
++ struct io_cq *icq; |
7146 |
++ |
7147 |
++ /* |
7148 |
++ * IMPORTANT NOTE: The move of more than one process at a time to a |
7149 |
++ * new group has not yet been tested. |
7150 |
++ */ |
7151 |
++ cgroup_taskset_for_each(task, css, tset) { |
7152 |
++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); |
7153 |
++ if (ioc) { |
7154 |
++ /* |
7155 |
++ * Handle cgroup change here. |
7156 |
++ */ |
7157 |
++ rcu_read_lock(); |
7158 |
++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) |
7159 |
++ if (!strncmp( |
7160 |
++ icq->q->elevator->type->elevator_name, |
7161 |
++ "bfq", ELV_NAME_MAX)) |
7162 |
++ bfq_bic_change_cgroup(icq_to_bic(icq), |
7163 |
++ css); |
7164 |
++ rcu_read_unlock(); |
7165 |
++ put_io_context(ioc); |
7166 |
++ } |
7167 |
++ } |
7168 |
++} |
7169 |
++ |
7170 |
++static void bfqio_destroy(struct cgroup_subsys_state *css) |
7171 |
++{ |
7172 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
7173 |
++ struct hlist_node *tmp; |
7174 |
++ struct bfq_group *bfqg; |
7175 |
++ |
7176 |
++ /* |
7177 |
++ * Since we are destroying the cgroup, there are no more tasks |
7178 |
++ * referencing it, and all the RCU grace periods that may have |
7179 |
++ * referenced it are ended (as the destruction of the parent |
7180 |
++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by |
7181 |
++ * anything else and we don't need any synchronization. |
7182 |
++ */ |
7183 |
++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node) |
7184 |
++ bfq_destroy_group(bgrp, bfqg); |
7185 |
++ |
7186 |
++ BUG_ON(!hlist_empty(&bgrp->group_data)); |
7187 |
++ |
7188 |
++ kfree(bgrp); |
7189 |
++} |
7190 |
++ |
7191 |
++static int bfqio_css_online(struct cgroup_subsys_state *css) |
7192 |
++{ |
7193 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
7194 |
++ |
7195 |
++ mutex_lock(&bfqio_mutex); |
7196 |
++ bgrp->online = true; |
7197 |
++ mutex_unlock(&bfqio_mutex); |
7198 |
++ |
7199 |
++ return 0; |
7200 |
++} |
7201 |
++ |
7202 |
++static void bfqio_css_offline(struct cgroup_subsys_state *css) |
7203 |
++{ |
7204 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
7205 |
++ |
7206 |
++ mutex_lock(&bfqio_mutex); |
7207 |
++ bgrp->online = false; |
7208 |
++ mutex_unlock(&bfqio_mutex); |
7209 |
++} |
7210 |
++ |
7211 |
++struct cgroup_subsys bfqio_subsys = { |
7212 |
++ .name = "bfqio", |
7213 |
++ .css_alloc = bfqio_create, |
7214 |
++ .css_online = bfqio_css_online, |
7215 |
++ .css_offline = bfqio_css_offline, |
7216 |
++ .can_attach = bfqio_can_attach, |
7217 |
++ .attach = bfqio_attach, |
7218 |
++ .css_free = bfqio_destroy, |
7219 |
++ .subsys_id = bfqio_subsys_id, |
7220 |
++ .base_cftypes = bfqio_files, |
7221 |
++}; |
7222 |
++#else |
7223 |
++static inline void bfq_init_entity(struct bfq_entity *entity, |
7224 |
++ struct bfq_group *bfqg) |
7225 |
++{ |
7226 |
++ entity->weight = entity->new_weight; |
7227 |
++ entity->orig_weight = entity->new_weight; |
7228 |
++ entity->ioprio = entity->new_ioprio; |
7229 |
++ entity->ioprio_class = entity->new_ioprio_class; |
7230 |
++ entity->sched_data = &bfqg->sched_data; |
7231 |
++} |
7232 |
++ |
7233 |
++static inline struct bfq_group * |
7234 |
++bfq_bic_update_cgroup(struct bfq_io_cq *bic) |
7235 |
++{ |
7236 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
7237 |
++ return bfqd->root_group; |
7238 |
++} |
7239 |
++ |
7240 |
++static inline void bfq_bfqq_move(struct bfq_data *bfqd, |
7241 |
++ struct bfq_queue *bfqq, |
7242 |
++ struct bfq_entity *entity, |
7243 |
++ struct bfq_group *bfqg) |
7244 |
++{ |
7245 |
++} |
7246 |
++ |
7247 |
++static void bfq_end_raising_async(struct bfq_data *bfqd) |
7248 |
++{ |
7249 |
++ bfq_end_raising_async_queues(bfqd, bfqd->root_group); |
7250 |
++} |
7251 |
++ |
7252 |
++static inline void bfq_disconnect_groups(struct bfq_data *bfqd) |
7253 |
++{ |
7254 |
++ bfq_put_async_queues(bfqd, bfqd->root_group); |
7255 |
++} |
7256 |
++ |
7257 |
++static inline void bfq_free_root_group(struct bfq_data *bfqd) |
7258 |
++{ |
7259 |
++ kfree(bfqd->root_group); |
7260 |
++} |
7261 |
++ |
7262 |
++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) |
7263 |
++{ |
7264 |
++ struct bfq_group *bfqg; |
7265 |
++ int i; |
7266 |
++ |
7267 |
++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); |
7268 |
++ if (bfqg == NULL) |
7269 |
++ return NULL; |
7270 |
++ |
7271 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
7272 |
++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
7273 |
++ |
7274 |
++ return bfqg; |
7275 |
++} |
7276 |
++#endif |
7277 |
+diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c |
7278 |
+new file mode 100644 |
7279 |
+index 0000000..7f6b000 |
7280 |
+--- /dev/null |
7281 |
++++ b/block/bfq-ioc.c |
7282 |
+@@ -0,0 +1,36 @@ |
7283 |
++/* |
7284 |
++ * BFQ: I/O context handling. |
7285 |
++ * |
7286 |
++ * Based on ideas and code from CFQ: |
7287 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
7288 |
++ * |
7289 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
7290 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
7291 |
++ * |
7292 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
7293 |
++ */ |
7294 |
++ |
7295 |
++/** |
7296 |
++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. |
7297 |
++ * @icq: the iocontext queue. |
7298 |
++ */ |
7299 |
++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) |
7300 |
++{ |
7301 |
++ /* bic->icq is the first member, %NULL will convert to %NULL */ |
7302 |
++ return container_of(icq, struct bfq_io_cq, icq); |
7303 |
++} |
7304 |
++ |
7305 |
++/** |
7306 |
++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. |
7307 |
++ * @bfqd: the lookup key. |
7308 |
++ * @ioc: the io_context of the process doing I/O. |
7309 |
++ * |
7310 |
++ * Queue lock must be held. |
7311 |
++ */ |
7312 |
++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, |
7313 |
++ struct io_context *ioc) |
7314 |
++{ |
7315 |
++ if (ioc) |
7316 |
++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); |
7317 |
++ return NULL; |
7318 |
++} |
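The %NULL-propagation noted in icq_to_bic() works because the icq member sits at offset zero of struct bfq_io_cq, so container_of() subtracts nothing. A self-contained userspace sketch of the same first-member trick (toy struct names, not from the patch):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct icq { int dummy; };
struct bic { struct icq icq; int extra; };	/* icq is the first member */

int main(void)
{
	struct bic b = { { 0 }, 42 };
	struct icq *icq = &b.icq;

	/* offset is 0, so the conversion is a no-op and a NULL icq stays NULL */
	printf("offset = %zu, round trip ok = %d\n",
	       offsetof(struct bic, icq),
	       container_of(icq, struct bic, icq) == &b);
	return 0;
}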
7319 |
+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
7320 |
+new file mode 100644 |
7321 |
+index 0000000..eb760de |
7322 |
+--- /dev/null |
7323 |
++++ b/block/bfq-iosched.c |
7324 |
+@@ -0,0 +1,3298 @@ |
7325 |
++/* |
7326 |
++ * BFQ, or Budget Fair Queueing, disk scheduler. |
7327 |
++ * |
7328 |
++ * Based on ideas and code from CFQ: |
7329 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
7330 |
++ * |
7331 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
7332 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
7333 |
++ * |
7334 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
7335 |
++ * |
7336 |
++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. |
7337 |
++ * |
7338 |
++ * BFQ is a proportional share disk scheduling algorithm based on the |
7339 |
++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in |
7340 |
++ * number of sectors, to tasks instead of time slices. The disk is not granted |
7341 |
++ * to the in-service task for a given time slice, but until it has exhausted |
7342 |
++ * its assigned budget. This change from the time to the service domain allows |
7343 |
++ * BFQ to distribute the disk bandwidth among tasks as desired, without any |
7344 |
++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an |
7345 |
++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to |
7346 |
++ * their budgets (more precisely BFQ schedules queues associated to tasks). |
7347 |
++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to |
7348 |
++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low |
7349 |
++ * latencies to interactive and soft real-time applications. |
7350 |
++ * |
7351 |
++ * BFQ is described in [1], where also a reference to the initial, more |
7352 |
++ * theoretical paper on BFQ can be found. The interested reader can find in |
7353 |
++ * the latter paper full details on the main algorithm as well as formulas of |
7354 |
++ * the guarantees, plus formal proofs of all the properties. With respect to |
7355 |
++ * the version of BFQ presented in these papers, this implementation adds a |
7356 |
++ * few more heuristics, such as the one that guarantees a low latency to soft |
7357 |
++ * real-time applications, and a hierarchical extension based on H-WF2Q+. |
7358 |
++ * |
7359 |
++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with |
7360 |
++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) |
7361 |
++ * complexity derives from the one introduced with EEVDF in [3]. |
7362 |
++ * |
7363 |
++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness |
7364 |
++ * with the BFQ Disk I/O Scheduler'', |
7365 |
++ * Proceedings of the 5th Annual International Systems and Storage |
7366 |
++ * Conference (SYSTOR '12), June 2012. |
7367 |
++ * |
7368 |
++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf |
7369 |
++ * |
7370 |
++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing |
7371 |
++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, |
7372 |
++ * Oct 1997. |
7373 |
++ * |
7374 |
++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz |
7375 |
++ * |
7376 |
++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline |
7377 |
++ * First: A Flexible and Accurate Mechanism for Proportional Share |
7378 |
++ * Resource Allocation,'' technical report. |
7379 |
++ * |
7380 |
++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf |
7381 |
++ */ |
7382 |
++#include <linux/module.h> |
7383 |
++#include <linux/slab.h> |
7384 |
++#include <linux/blkdev.h> |
7385 |
++#include <linux/cgroup.h> |
7386 |
++#include <linux/elevator.h> |
7387 |
++#include <linux/jiffies.h> |
7388 |
++#include <linux/rbtree.h> |
7389 |
++#include <linux/ioprio.h> |
7390 |
++#include "bfq.h" |
7391 |
++#include "blk.h" |
7392 |
++ |
7393 |
++/* Max number of dispatches in one round of service. */ |
7394 |
++static const int bfq_quantum = 4; |
7395 |
++ |
7396 |
++/* Expiration time of sync (0) and async (1) requests, in jiffies. */ |
7397 |
++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; |
7398 |
++ |
7399 |
++/* Maximum backwards seek, in KiB. */ |
7400 |
++static const int bfq_back_max = 16 * 1024; |
7401 |
++ |
7402 |
++/* Penalty of a backwards seek, in number of sectors. */ |
7403 |
++static const int bfq_back_penalty = 2; |
7404 |
++ |
7405 |
++/* Idling period duration, in jiffies. */ |
7406 |
++static int bfq_slice_idle = HZ / 125; |
7407 |
++ |
7408 |
++/* Default maximum budget values, in sectors and number of requests. */ |
7409 |
++static const int bfq_default_max_budget = 16 * 1024; |
7410 |
++static const int bfq_max_budget_async_rq = 4; |
7411 |
++ |
7412 |
++/* |
7413 |
++ * Async to sync throughput distribution is controlled as follows: |
7414 |
++ * when an async request is served, the entity is charged the number |
7415 |
++ * of sectors of the request, multiplied by the factor below |
7416 |
++ */ |
7417 |
++static const int bfq_async_charge_factor = 10; |
7418 |
++ |
7419 |
++/* Default timeout values, in jiffies, approximating CFQ defaults. */ |
7420 |
++static const int bfq_timeout_sync = HZ / 8; |
7421 |
++static int bfq_timeout_async = HZ / 25; |
7422 |
++ |
7423 |
++struct kmem_cache *bfq_pool; |
7424 |
++ |
7425 |
++/* Below this threshold (in ms), we consider thinktime immediate. */ |
7426 |
++#define BFQ_MIN_TT 2 |
7427 |
++ |
7428 |
++/* hw_tag detection: parallel requests threshold and min samples needed. */ |
7429 |
++#define BFQ_HW_QUEUE_THRESHOLD 4 |
7430 |
++#define BFQ_HW_QUEUE_SAMPLES 32 |
7431 |
++ |
7432 |
++#define BFQQ_SEEK_THR (sector_t)(8 * 1024) |
7433 |
++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) |
7434 |
++ |
7435 |
++/* Min samples used for peak rate estimation (for autotuning). */ |
7436 |
++#define BFQ_PEAK_RATE_SAMPLES 32 |
7437 |
++ |
7438 |
++/* Shift used for peak rate fixed precision calculations. */ |
7439 |
++#define BFQ_RATE_SHIFT 16 |
7440 |
++ |
7441 |
++/* |
7442 |
++ * The duration of the weight raising for interactive applications is |
7443 |
++ * computed automatically (as default behaviour), using the following |
7444 |
++ * formula: duration = (R / r) * T, where r is the peak rate of the |
7445 |
++ * disk, and R and T are two reference parameters. In particular, R is |
7446 |
++ * the peak rate of a reference disk, and T is about the maximum time |
7447 |
++ * for starting popular large applications on that disk, under BFQ and |
7448 |
++ * while reading two files in parallel. Finally, BFQ uses two |
7449 |
++ * different pairs (R, T) depending on whether the disk is rotational |
7450 |
++ * or non-rotational. |
7451 |
++ */ |
7452 |
++#define T_rot (msecs_to_jiffies(5500)) |
7453 |
++#define T_nonrot (msecs_to_jiffies(2000)) |
7454 |
++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ |
7455 |
++#define R_rot 17415 |
7456 |
++#define R_nonrot 34791 |
7457 |
++ |
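To make the duration = (R / r) * T rule above concrete: the BFQ_RATE_SHIFT scaling cancels in the ratio, so if the estimated peak rate of a rotational disk equals the reference R_rot the raising duration is exactly T_rot (5500 ms), and a disk measured at twice that rate gets about 2750 ms. A minimal userspace sketch of the computation (the peak rate value is made up for illustration; in the scheduler it is what bfq_wrais_duration() divides the precomputed R*T product by):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t R_rot = 17415;	/* reference rate, sectors/usec << 16 */
	const uint64_t T_rot_ms = 5500;	/* reference duration in ms           */
	uint64_t peak_rate = 2 * R_rot;	/* hypothetical measured peak rate    */

	/* duration = (R / r) * T, computed as (R * T) / r */
	uint64_t duration_ms = R_rot * T_rot_ms / peak_rate;

	printf("weight-raising duration: %llu ms\n",
	       (unsigned long long)duration_ms);	/* 2750 */
	return 0;
}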
7458 |
++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ |
7459 |
++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) |
7460 |
++ |
7461 |
++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) |
7462 |
++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) |
7463 |
++ |
7464 |
++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); |
7465 |
++ |
7466 |
++#include "bfq-ioc.c" |
7467 |
++#include "bfq-sched.c" |
7468 |
++#include "bfq-cgroup.c" |
7469 |
++ |
7470 |
++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ |
7471 |
++ IOPRIO_CLASS_IDLE) |
7472 |
++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ |
7473 |
++ IOPRIO_CLASS_RT) |
7474 |
++ |
7475 |
++#define bfq_sample_valid(samples) ((samples) > 80) |
7476 |
++ |
7477 |
++/* |
7478 |
++ * We regard a request as SYNC, if either it's a read or has the SYNC bit |
7479 |
++ * set (in which case it could also be a direct WRITE). |
7480 |
++ */ |
7481 |
++static inline int bfq_bio_sync(struct bio *bio) |
7482 |
++{ |
7483 |
++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) |
7484 |
++ return 1; |
7485 |
++ |
7486 |
++ return 0; |
7487 |
++} |
7488 |
++ |
7489 |
++/* |
7490 |
++ * Scheduler run of queue, if there are requests pending and no one in the |
7491 |
++ * driver that will restart queueing. |
7492 |
++ */ |
7493 |
++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) |
7494 |
++{ |
7495 |
++ if (bfqd->queued != 0) { |
7496 |
++ bfq_log(bfqd, "schedule dispatch"); |
7497 |
++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); |
7498 |
++ } |
7499 |
++} |
7500 |
++ |
7501 |
++/* |
7502 |
++ * Lifted from AS - choose which of rq1 and rq2 is best served now. |
7503 |
++ * We choose the request that is closest to the head right now. Distance |
7504 |
++ * behind the head is penalized and only allowed to a certain extent. |
7505 |
++ */ |
7506 |
++static struct request *bfq_choose_req(struct bfq_data *bfqd, |
7507 |
++ struct request *rq1, |
7508 |
++ struct request *rq2, |
7509 |
++ sector_t last) |
7510 |
++{ |
7511 |
++ sector_t s1, s2, d1 = 0, d2 = 0; |
7512 |
++ unsigned long back_max; |
7513 |
++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ |
7514 |
++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ |
7515 |
++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ |
7516 |
++ |
7517 |
++ if (rq1 == NULL || rq1 == rq2) |
7518 |
++ return rq2; |
7519 |
++ if (rq2 == NULL) |
7520 |
++ return rq1; |
7521 |
++ |
7522 |
++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) |
7523 |
++ return rq1; |
7524 |
++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) |
7525 |
++ return rq2; |
7526 |
++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) |
7527 |
++ return rq1; |
7528 |
++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) |
7529 |
++ return rq2; |
7530 |
++ |
7531 |
++ s1 = blk_rq_pos(rq1); |
7532 |
++ s2 = blk_rq_pos(rq2); |
7533 |
++ |
7534 |
++ /* |
7535 |
++ * By definition, 1KiB is 2 sectors. |
7536 |
++ */ |
7537 |
++ back_max = bfqd->bfq_back_max * 2; |
7538 |
++ |
7539 |
++ /* |
7540 |
++ * Strict one way elevator _except_ in the case where we allow |
7541 |
++ * short backward seeks which are biased as twice the cost of a |
7542 |
++ * similar forward seek. |
7543 |
++ */ |
7544 |
++ if (s1 >= last) |
7545 |
++ d1 = s1 - last; |
7546 |
++ else if (s1 + back_max >= last) |
7547 |
++ d1 = (last - s1) * bfqd->bfq_back_penalty; |
7548 |
++ else |
7549 |
++ wrap |= BFQ_RQ1_WRAP; |
7550 |
++ |
7551 |
++ if (s2 >= last) |
7552 |
++ d2 = s2 - last; |
7553 |
++ else if (s2 + back_max >= last) |
7554 |
++ d2 = (last - s2) * bfqd->bfq_back_penalty; |
7555 |
++ else |
7556 |
++ wrap |= BFQ_RQ2_WRAP; |
7557 |
++ |
7558 |
++ /* Found required data */ |
7559 |
++ |
7560 |
++ /* |
7561 |
++ * By doing switch() on the bit mask "wrap" we avoid having to |
7562 |
++ * check two variables for all permutations: --> faster! |
7563 |
++ */ |
7564 |
++ switch (wrap) { |
7565 |
++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ |
7566 |
++ if (d1 < d2) |
7567 |
++ return rq1; |
7568 |
++ else if (d2 < d1) |
7569 |
++ return rq2; |
7570 |
++ else { |
7571 |
++ if (s1 >= s2) |
7572 |
++ return rq1; |
7573 |
++ else |
7574 |
++ return rq2; |
7575 |
++ } |
7576 |
++ |
7577 |
++ case BFQ_RQ2_WRAP: |
7578 |
++ return rq1; |
7579 |
++ case BFQ_RQ1_WRAP: |
7580 |
++ return rq2; |
7581 |
++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ |
7582 |
++ default: |
7583 |
++ /* |
7584 |
++ * Since both rqs are wrapped, |
7585 |
++ * start with the one that's further behind head |
7586 |
++ * (--> only *one* back seek required), |
7587 |
++ * since back seek takes more time than forward. |
7588 |
++ */ |
7589 |
++ if (s1 <= s2) |
7590 |
++ return rq1; |
7591 |
++ else |
7592 |
++ return rq2; |
7593 |
++ } |
7594 |
++} |
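A small numeric example of the back-seek penalty implemented above: with the defaults, back_max is 16 * 1024 KiB = 32768 sectors and the penalty factor is 2, so a request 100 sectors ahead of the head wins over one 100 sectors behind it, because the backward distance is doubled before the comparison. A standalone sketch with made-up sector numbers:

#include <stdio.h>

int main(void)
{
	unsigned long last = 10000;		/* current head position (sector)   */
	unsigned long s1 = 10100, s2 = 9900;	/* rq1 ahead of the head, rq2 behind */
	unsigned long back_max = 16 * 1024 * 2;	/* bfq_back_max in KiB -> sectors    */
	unsigned long back_penalty = 2;		/* bfq_back_penalty                  */
	unsigned long d1, d2;

	d1 = s1 - last;				/* forward seek: plain distance, 100 */
	if (s2 + back_max >= last)
		d2 = (last - s2) * back_penalty;	/* short back seek: 200      */
	else
		d2 = (unsigned long)-1;		/* too far behind: counts as wrapped */

	printf("d1=%lu d2=%lu -> %s served first\n",
	       d1, d2, d1 < d2 ? "rq1" : "rq2");	/* rq1 */
	return 0;
}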
7595 |
++ |
7596 |
++static struct bfq_queue * |
7597 |
++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, |
7598 |
++ sector_t sector, struct rb_node **ret_parent, |
7599 |
++ struct rb_node ***rb_link) |
7600 |
++{ |
7601 |
++ struct rb_node **p, *parent; |
7602 |
++ struct bfq_queue *bfqq = NULL; |
7603 |
++ |
7604 |
++ parent = NULL; |
7605 |
++ p = &root->rb_node; |
7606 |
++ while (*p) { |
7607 |
++ struct rb_node **n; |
7608 |
++ |
7609 |
++ parent = *p; |
7610 |
++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
7611 |
++ |
7612 |
++ /* |
7613 |
++ * Sort strictly based on sector. Smallest to the left, |
7614 |
++ * largest to the right. |
7615 |
++ */ |
7616 |
++ if (sector > blk_rq_pos(bfqq->next_rq)) |
7617 |
++ n = &(*p)->rb_right; |
7618 |
++ else if (sector < blk_rq_pos(bfqq->next_rq)) |
7619 |
++ n = &(*p)->rb_left; |
7620 |
++ else |
7621 |
++ break; |
7622 |
++ p = n; |
7623 |
++ bfqq = NULL; |
7624 |
++ } |
7625 |
++ |
7626 |
++ *ret_parent = parent; |
7627 |
++ if (rb_link) |
7628 |
++ *rb_link = p; |
7629 |
++ |
7630 |
++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", |
7631 |
++ (long long unsigned)sector, |
7632 |
++ bfqq != NULL ? bfqq->pid : 0); |
7633 |
++ |
7634 |
++ return bfqq; |
7635 |
++} |
7636 |
++ |
7637 |
++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
7638 |
++{ |
7639 |
++ struct rb_node **p, *parent; |
7640 |
++ struct bfq_queue *__bfqq; |
7641 |
++ |
7642 |
++ if (bfqq->pos_root != NULL) { |
7643 |
++ rb_erase(&bfqq->pos_node, bfqq->pos_root); |
7644 |
++ bfqq->pos_root = NULL; |
7645 |
++ } |
7646 |
++ |
7647 |
++ if (bfq_class_idle(bfqq)) |
7648 |
++ return; |
7649 |
++ if (!bfqq->next_rq) |
7650 |
++ return; |
7651 |
++ |
7652 |
++ bfqq->pos_root = &bfqd->rq_pos_tree; |
7653 |
++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, |
7654 |
++ blk_rq_pos(bfqq->next_rq), &parent, &p); |
7655 |
++ if (__bfqq == NULL) { |
7656 |
++ rb_link_node(&bfqq->pos_node, parent, p); |
7657 |
++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); |
7658 |
++ } else |
7659 |
++ bfqq->pos_root = NULL; |
7660 |
++} |
7661 |
++ |
7662 |
++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, |
7663 |
++ struct bfq_queue *bfqq, |
7664 |
++ struct request *last) |
7665 |
++{ |
7666 |
++ struct rb_node *rbnext = rb_next(&last->rb_node); |
7667 |
++ struct rb_node *rbprev = rb_prev(&last->rb_node); |
7668 |
++ struct request *next = NULL, *prev = NULL; |
7669 |
++ |
7670 |
++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); |
7671 |
++ |
7672 |
++ if (rbprev != NULL) |
7673 |
++ prev = rb_entry_rq(rbprev); |
7674 |
++ |
7675 |
++ if (rbnext != NULL) |
7676 |
++ next = rb_entry_rq(rbnext); |
7677 |
++ else { |
7678 |
++ rbnext = rb_first(&bfqq->sort_list); |
7679 |
++ if (rbnext && rbnext != &last->rb_node) |
7680 |
++ next = rb_entry_rq(rbnext); |
7681 |
++ } |
7682 |
++ |
7683 |
++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); |
7684 |
++} |
7685 |
++ |
7686 |
++static void bfq_del_rq_rb(struct request *rq) |
7687 |
++{ |
7688 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
7689 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
7690 |
++ const int sync = rq_is_sync(rq); |
7691 |
++ |
7692 |
++ BUG_ON(bfqq->queued[sync] == 0); |
7693 |
++ bfqq->queued[sync]--; |
7694 |
++ bfqd->queued--; |
7695 |
++ |
7696 |
++ elv_rb_del(&bfqq->sort_list, rq); |
7697 |
++ |
7698 |
++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
7699 |
++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) |
7700 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
7701 |
++ /* |
7702 |
++ * Remove queue from request-position tree as it is empty. |
7703 |
++ */ |
7704 |
++ if (bfqq->pos_root != NULL) { |
7705 |
++ rb_erase(&bfqq->pos_node, bfqq->pos_root); |
7706 |
++ bfqq->pos_root = NULL; |
7707 |
++ } |
7708 |
++ } |
7709 |
++} |
7710 |
++ |
7711 |
++/* see the definition of bfq_async_charge_factor for details */ |
7712 |
++static inline unsigned long bfq_serv_to_charge(struct request *rq, |
7713 |
++ struct bfq_queue *bfqq) |
7714 |
++{ |
7715 |
++ return blk_rq_sectors(rq) * |
7716 |
++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * |
7717 |
++ bfq_async_charge_factor)); |
7718 |
++} |
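Written out, the charge above is the plain request size for sync queues and for queues that are currently weight-raised, but sectors * (1 + bfq_async_charge_factor) = 11 times the size for an async request from a non-raised queue; that is how async throughput is traded against sync latency. A tiny sketch of the same expression:

#include <stdio.h>

/* mirrors bfq_serv_to_charge(): raising_coeff == 1 means "not weight-raised" */
static unsigned long charge(unsigned long sectors, int sync, int raising_coeff)
{
	const unsigned long bfq_async_charge_factor = 10;

	return sectors * (1 + (!sync) * (raising_coeff == 1) * bfq_async_charge_factor);
}

int main(void)
{
	printf("sync 8 sectors:          %lu\n", charge(8, 1, 1));	/* 8  */
	printf("async 8 sectors:         %lu\n", charge(8, 0, 1));	/* 88 */
	printf("async 8 sectors, raised: %lu\n", charge(8, 0, 10));	/* 8  */
	return 0;
}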
7719 |
++ |
7720 |
++/** |
7721 |
++ * bfq_updated_next_req - update the queue after a new next_rq selection. |
7722 |
++ * @bfqd: the device data the queue belongs to. |
7723 |
++ * @bfqq: the queue to update. |
7724 |
++ * |
7725 |
++ * If the first request of a queue changes we make sure that the queue |
7726 |
++ * has enough budget to serve at least its first request (if the |
7727 |
++ * request has grown). We do this because if the queue has not enough |
7728 |
++ * budget for its first request, it has to go through two dispatch |
7729 |
++ * rounds to actually get it dispatched. |
7730 |
++ */ |
7731 |
++static void bfq_updated_next_req(struct bfq_data *bfqd, |
7732 |
++ struct bfq_queue *bfqq) |
7733 |
++{ |
7734 |
++ struct bfq_entity *entity = &bfqq->entity; |
7735 |
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
7736 |
++ struct request *next_rq = bfqq->next_rq; |
7737 |
++ unsigned long new_budget; |
7738 |
++ |
7739 |
++ if (next_rq == NULL) |
7740 |
++ return; |
7741 |
++ |
7742 |
++ if (bfqq == bfqd->in_service_queue) |
7743 |
++ /* |
7744 |
++ * In order not to break guarantees, budgets cannot be |
7745 |
++ * changed after an entity has been selected. |
7746 |
++ */ |
7747 |
++ return; |
7748 |
++ |
7749 |
++ BUG_ON(entity->tree != &st->active); |
7750 |
++ BUG_ON(entity == entity->sched_data->in_service_entity); |
7751 |
++ |
7752 |
++ new_budget = max_t(unsigned long, bfqq->max_budget, |
7753 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
7754 |
++ entity->budget = new_budget; |
7755 |
++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); |
7756 |
++ bfq_activate_bfqq(bfqd, bfqq); |
7757 |
++} |
7758 |
++ |
7759 |
++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) |
7760 |
++{ |
7761 |
++ u64 dur; |
7762 |
++ |
7763 |
++ if (bfqd->bfq_raising_max_time > 0) |
7764 |
++ return bfqd->bfq_raising_max_time; |
7765 |
++ |
7766 |
++ dur = bfqd->RT_prod; |
7767 |
++ do_div(dur, bfqd->peak_rate); |
7768 |
++ |
7769 |
++ return dur; |
7770 |
++} |
7771 |
++ |
7772 |
++static void bfq_add_rq_rb(struct request *rq) |
7773 |
++{ |
7774 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
7775 |
++ struct bfq_entity *entity = &bfqq->entity; |
7776 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
7777 |
++ struct request *next_rq, *prev; |
7778 |
++ unsigned long old_raising_coeff = bfqq->raising_coeff; |
7779 |
++ int idle_for_long_time = 0; |
7780 |
++ |
7781 |
++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); |
7782 |
++ bfqq->queued[rq_is_sync(rq)]++; |
7783 |
++ bfqd->queued++; |
7784 |
++ |
7785 |
++ elv_rb_add(&bfqq->sort_list, rq); |
7786 |
++ |
7787 |
++ /* |
7788 |
++ * Check if this request is a better next-serve candidate. |
7789 |
++ */ |
7790 |
++ prev = bfqq->next_rq; |
7791 |
++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); |
7792 |
++ BUG_ON(next_rq == NULL); |
7793 |
++ bfqq->next_rq = next_rq; |
7794 |
++ |
7795 |
++ /* |
7796 |
++ * Adjust priority tree position, if next_rq changes. |
7797 |
++ */ |
7798 |
++ if (prev != bfqq->next_rq) |
7799 |
++ bfq_rq_pos_tree_add(bfqd, bfqq); |
7800 |
++ |
7801 |
++ if (!bfq_bfqq_busy(bfqq)) { |
7802 |
++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && |
7803 |
++ time_is_before_jiffies(bfqq->soft_rt_next_start); |
7804 |
++ idle_for_long_time = time_is_before_jiffies( |
7805 |
++ bfqq->budget_timeout + |
7806 |
++ bfqd->bfq_raising_min_idle_time); |
7807 |
++ entity->budget = max_t(unsigned long, bfqq->max_budget, |
7808 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
7809 |
++ |
7810 |
++ if (!bfqd->low_latency) |
7811 |
++ goto add_bfqq_busy; |
7812 |
++ |
7813 |
++ /* |
7814 |
++ * If the queue is not being boosted and has been idle |
7815 |
++ * for enough time, start a weight-raising period |
7816 |
++ */ |
7817 |
++ if (old_raising_coeff == 1 && |
7818 |
++ (idle_for_long_time || soft_rt)) { |
7819 |
++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
7820 |
++ if (idle_for_long_time) |
7821 |
++ bfqq->raising_cur_max_time = |
7822 |
++ bfq_wrais_duration(bfqd); |
7823 |
++ else |
7824 |
++ bfqq->raising_cur_max_time = |
7825 |
++ bfqd->bfq_raising_rt_max_time; |
7826 |
++ bfq_log_bfqq(bfqd, bfqq, |
7827 |
++ "wrais starting at %lu, " |
7828 |
++ "rais_max_time %u", |
7829 |
++ jiffies, |
7830 |
++ jiffies_to_msecs(bfqq-> |
7831 |
++ raising_cur_max_time)); |
7832 |
++ } else if (old_raising_coeff > 1) { |
7833 |
++ if (idle_for_long_time) |
7834 |
++ bfqq->raising_cur_max_time = |
7835 |
++ bfq_wrais_duration(bfqd); |
7836 |
++ else if (bfqq->raising_cur_max_time == |
7837 |
++ bfqd->bfq_raising_rt_max_time && |
7838 |
++ !soft_rt) { |
7839 |
++ bfqq->raising_coeff = 1; |
7840 |
++ bfq_log_bfqq(bfqd, bfqq, |
7841 |
++ "wrais ending at %lu, " |
7842 |
++ "rais_max_time %u", |
7843 |
++ jiffies, |
7844 |
++ jiffies_to_msecs(bfqq-> |
7845 |
++ raising_cur_max_time)); |
7846 |
++ } else if (time_before( |
7847 |
++ bfqq->last_rais_start_finish + |
7848 |
++ bfqq->raising_cur_max_time, |
7849 |
++ jiffies + |
7850 |
++ bfqd->bfq_raising_rt_max_time) && |
7851 |
++ soft_rt) { |
7852 |
++ /* |
7853 |
++ * |
7854 |
++ * The remaining weight-raising time is lower |
7855 |
++ * than bfqd->bfq_raising_rt_max_time, which |
7856 |
++ * means that the application is enjoying |
7857 |
++ * weight raising either because deemed soft rt |
7858 |
++ * in the near past, or because deemed |
7859 |
++				 * interactive long ago. In both cases, |
7860 |
++ * resetting now the current remaining weight- |
7861 |
++ * raising time for the application to the |
7862 |
++ * weight-raising duration for soft rt |
7863 |
++ * applications would not cause any latency |
7864 |
++ * increase for the application (as the new |
7865 |
++ * duration would be higher than the remaining |
7866 |
++ * time). |
7867 |
++ * |
7868 |
++ * In addition, the application is now meeting |
7869 |
++ * the requirements for being deemed soft rt. |
7870 |
++ * In the end we can correctly and safely |
7871 |
++ * (re)charge the weight-raising duration for |
7872 |
++ * the application with the weight-raising |
7873 |
++ * duration for soft rt applications. |
7874 |
++ * |
7875 |
++ * In particular, doing this recharge now, i.e., |
7876 |
++ * before the weight-raising period for the |
7877 |
++ * application finishes, reduces the probability |
7878 |
++ * of the following negative scenario: |
7879 |
++ * 1) the weight of a soft rt application is |
7880 |
++ * raised at startup (as for any newly |
7881 |
++ * created application), |
7882 |
++ * 2) since the application is not interactive, |
7883 |
++ * at a certain time weight-raising is |
7884 |
++ * stopped for the application, |
7885 |
++ * 3) at that time the application happens to |
7886 |
++ * still have pending requests, and hence |
7887 |
++ * is destined to not have a chance to be |
7888 |
++ * deemed soft rt before these requests are |
7889 |
++ * completed (see the comments to the |
7890 |
++ * function bfq_bfqq_softrt_next_start() |
7891 |
++ * for details on soft rt detection), |
7892 |
++ * 4) these pending requests experience a high |
7893 |
++ * latency because the application is not |
7894 |
++ * weight-raised while they are pending. |
7895 |
++ */ |
7896 |
++ bfqq->last_rais_start_finish = jiffies; |
7897 |
++ bfqq->raising_cur_max_time = |
7898 |
++ bfqd->bfq_raising_rt_max_time; |
7899 |
++ } |
7900 |
++ } |
7901 |
++ if (old_raising_coeff != bfqq->raising_coeff) |
7902 |
++ entity->ioprio_changed = 1; |
7903 |
++add_bfqq_busy: |
7904 |
++ bfqq->last_idle_bklogged = jiffies; |
7905 |
++ bfqq->service_from_backlogged = 0; |
7906 |
++ bfq_clear_bfqq_softrt_update(bfqq); |
7907 |
++ bfq_add_bfqq_busy(bfqd, bfqq); |
7908 |
++ } else { |
7909 |
++ if (bfqd->low_latency && old_raising_coeff == 1 && |
7910 |
++ !rq_is_sync(rq) && |
7911 |
++ time_is_before_jiffies( |
7912 |
++ bfqq->last_rais_start_finish + |
7913 |
++ bfqd->bfq_raising_min_inter_arr_async)) { |
7914 |
++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
7915 |
++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); |
7916 |
++ |
7917 |
++ bfqd->raised_busy_queues++; |
7918 |
++ entity->ioprio_changed = 1; |
7919 |
++ bfq_log_bfqq(bfqd, bfqq, |
7920 |
++ "non-idle wrais starting at %lu, " |
7921 |
++ "rais_max_time %u", |
7922 |
++ jiffies, |
7923 |
++ jiffies_to_msecs(bfqq-> |
7924 |
++ raising_cur_max_time)); |
7925 |
++ } |
7926 |
++ bfq_updated_next_req(bfqd, bfqq); |
7927 |
++ } |
7928 |
++ |
7929 |
++ if (bfqd->low_latency && |
7930 |
++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || |
7931 |
++ idle_for_long_time)) |
7932 |
++ bfqq->last_rais_start_finish = jiffies; |
7933 |
++} |
7934 |
++ |
7935 |
++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) |
7936 |
++{ |
7937 |
++ elv_rb_del(&bfqq->sort_list, rq); |
7938 |
++ bfqq->queued[rq_is_sync(rq)]--; |
7939 |
++ bfqq->bfqd->queued--; |
7940 |
++ bfq_add_rq_rb(rq); |
7941 |
++} |
7942 |
++ |
7943 |
++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, |
7944 |
++ struct bio *bio) |
7945 |
++{ |
7946 |
++ struct task_struct *tsk = current; |
7947 |
++ struct bfq_io_cq *bic; |
7948 |
++ struct bfq_queue *bfqq; |
7949 |
++ |
7950 |
++ bic = bfq_bic_lookup(bfqd, tsk->io_context); |
7951 |
++ if (bic == NULL) |
7952 |
++ return NULL; |
7953 |
++ |
7954 |
++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
7955 |
++ if (bfqq != NULL) |
7956 |
++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); |
7957 |
++ |
7958 |
++ return NULL; |
7959 |
++} |
7960 |
++ |
7961 |
++static void bfq_activate_request(struct request_queue *q, struct request *rq) |
7962 |
++{ |
7963 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
7964 |
++ |
7965 |
++ bfqd->rq_in_driver++; |
7966 |
++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); |
7967 |
++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", |
7968 |
++ (long long unsigned)bfqd->last_position); |
7969 |
++} |
7970 |
++ |
7971 |
++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) |
7972 |
++{ |
7973 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
7974 |
++ |
7975 |
++ WARN_ON(bfqd->rq_in_driver == 0); |
7976 |
++ bfqd->rq_in_driver--; |
7977 |
++} |
7978 |
++ |
7979 |
++static void bfq_remove_request(struct request *rq) |
7980 |
++{ |
7981 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
7982 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
7983 |
++ |
7984 |
++ if (bfqq->next_rq == rq) { |
7985 |
++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); |
7986 |
++ bfq_updated_next_req(bfqd, bfqq); |
7987 |
++ } |
7988 |
++ |
7989 |
++ list_del_init(&rq->queuelist); |
7990 |
++ bfq_del_rq_rb(rq); |
7991 |
++ |
7992 |
++ if (rq->cmd_flags & REQ_META) { |
7993 |
++ WARN_ON(bfqq->meta_pending == 0); |
7994 |
++ bfqq->meta_pending--; |
7995 |
++ } |
7996 |
++} |
7997 |
++ |
7998 |
++static int bfq_merge(struct request_queue *q, struct request **req, |
7999 |
++ struct bio *bio) |
8000 |
++{ |
8001 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
8002 |
++ struct request *__rq; |
8003 |
++ |
8004 |
++ __rq = bfq_find_rq_fmerge(bfqd, bio); |
8005 |
++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { |
8006 |
++ *req = __rq; |
8007 |
++ return ELEVATOR_FRONT_MERGE; |
8008 |
++ } |
8009 |
++ |
8010 |
++ return ELEVATOR_NO_MERGE; |
8011 |
++} |
8012 |
++ |
8013 |
++static void bfq_merged_request(struct request_queue *q, struct request *req, |
8014 |
++ int type) |
8015 |
++{ |
8016 |
++ if (type == ELEVATOR_FRONT_MERGE) { |
8017 |
++ struct bfq_queue *bfqq = RQ_BFQQ(req); |
8018 |
++ |
8019 |
++ bfq_reposition_rq_rb(bfqq, req); |
8020 |
++ } |
8021 |
++} |
8022 |
++ |
8023 |
++static void bfq_merged_requests(struct request_queue *q, struct request *rq, |
8024 |
++ struct request *next) |
8025 |
++{ |
8026 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
8027 |
++ |
8028 |
++ /* |
8029 |
++ * Reposition in fifo if next is older than rq. |
8030 |
++ */ |
8031 |
++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && |
8032 |
++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) { |
8033 |
++ list_move(&rq->queuelist, &next->queuelist); |
8034 |
++ rq_set_fifo_time(rq, rq_fifo_time(next)); |
8035 |
++ } |
8036 |
++ |
8037 |
++ if (bfqq->next_rq == next) |
8038 |
++ bfqq->next_rq = rq; |
8039 |
++ |
8040 |
++ bfq_remove_request(next); |
8041 |
++} |
8042 |
++ |
8043 |
++/* Must be called with bfqq != NULL */ |
8044 |
++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq) |
8045 |
++{ |
8046 |
++ BUG_ON(bfqq == NULL); |
8047 |
++ if (bfq_bfqq_busy(bfqq)) |
8048 |
++ bfqq->bfqd->raised_busy_queues--; |
8049 |
++ bfqq->raising_coeff = 1; |
8050 |
++ bfqq->raising_cur_max_time = 0; |
8051 |
++ /* Trigger a weight change on the next activation of the queue */ |
8052 |
++ bfqq->entity.ioprio_changed = 1; |
8053 |
++} |
8054 |
++ |
8055 |
++static void bfq_end_raising_async_queues(struct bfq_data *bfqd, |
8056 |
++ struct bfq_group *bfqg) |
8057 |
++{ |
8058 |
++ int i, j; |
8059 |
++ |
8060 |
++ for (i = 0; i < 2; i++) |
8061 |
++ for (j = 0; j < IOPRIO_BE_NR; j++) |
8062 |
++ if (bfqg->async_bfqq[i][j] != NULL) |
8063 |
++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]); |
8064 |
++ if (bfqg->async_idle_bfqq != NULL) |
8065 |
++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq); |
8066 |
++} |
8067 |
++ |
8068 |
++static void bfq_end_raising(struct bfq_data *bfqd) |
8069 |
++{ |
8070 |
++ struct bfq_queue *bfqq; |
8071 |
++ |
8072 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
8073 |
++ |
8074 |
++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) |
8075 |
++ bfq_bfqq_end_raising(bfqq); |
8076 |
++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) |
8077 |
++ bfq_bfqq_end_raising(bfqq); |
8078 |
++ bfq_end_raising_async(bfqd); |
8079 |
++ |
8080 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
8081 |
++} |
8082 |
++ |
8083 |
++static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
8084 |
++ struct bio *bio) |
8085 |
++{ |
8086 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
8087 |
++ struct bfq_io_cq *bic; |
8088 |
++ struct bfq_queue *bfqq; |
8089 |
++ |
8090 |
++ /* |
8091 |
++ * Disallow merge of a sync bio into an async request. |
8092 |
++ */ |
8093 |
++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
8094 |
++ return 0; |
8095 |
++ |
8096 |
++ /* |
8097 |
++ * Lookup the bfqq that this bio will be queued with. Allow |
8098 |
++ * merge only if rq is queued there. |
8099 |
++ * Queue lock is held here. |
8100 |
++ */ |
8101 |
++ bic = bfq_bic_lookup(bfqd, current->io_context); |
8102 |
++ if (bic == NULL) |
8103 |
++ return 0; |
8104 |
++ |
8105 |
++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
8106 |
++ return bfqq == RQ_BFQQ(rq); |
8107 |
++} |
8108 |
++ |
8109 |
++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
8110 |
++ struct bfq_queue *bfqq) |
8111 |
++{ |
8112 |
++ if (bfqq != NULL) { |
8113 |
++ bfq_mark_bfqq_must_alloc(bfqq); |
8114 |
++ bfq_mark_bfqq_budget_new(bfqq); |
8115 |
++ bfq_clear_bfqq_fifo_expire(bfqq); |
8116 |
++ |
8117 |
++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
8118 |
++ |
8119 |
++ bfq_log_bfqq(bfqd, bfqq, |
8120 |
++ "set_in_service_queue, cur-budget = %lu", |
8121 |
++ bfqq->entity.budget); |
8122 |
++ } |
8123 |
++ |
8124 |
++ bfqd->in_service_queue = bfqq; |
8125 |
++} |
8126 |
++ |
8127 |
++/* |
8128 |
++ * Get and set a new queue for service. |
8129 |
++ */ |
8130 |
++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd, |
8131 |
++ struct bfq_queue *bfqq) |
8132 |
++{ |
8133 |
++ if (!bfqq) |
8134 |
++ bfqq = bfq_get_next_queue(bfqd); |
8135 |
++ else |
8136 |
++ bfq_get_next_queue_forced(bfqd, bfqq); |
8137 |
++ |
8138 |
++ __bfq_set_in_service_queue(bfqd, bfqq); |
8139 |
++ return bfqq; |
8140 |
++} |
8141 |
++ |
8142 |
++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, |
8143 |
++ struct request *rq) |
8144 |
++{ |
8145 |
++ if (blk_rq_pos(rq) >= bfqd->last_position) |
8146 |
++ return blk_rq_pos(rq) - bfqd->last_position; |
8147 |
++ else |
8148 |
++ return bfqd->last_position - blk_rq_pos(rq); |
8149 |
++} |
8150 |
++ |
8151 |
++/* |
8152 |
++ * Return true if bfqq has no request pending and rq is close enough to |
8153 |
++ * bfqd->last_position, or if rq is closer to bfqd->last_position than |
8154 |
++ * bfqq->next_rq |
8155 |
++ */ |
8156 |
++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) |
8157 |
++{ |
8158 |
++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; |
8159 |
++} |
8160 |
++ |
8161 |
++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
8162 |
++{ |
8163 |
++ struct rb_root *root = &bfqd->rq_pos_tree; |
8164 |
++ struct rb_node *parent, *node; |
8165 |
++ struct bfq_queue *__bfqq; |
8166 |
++ sector_t sector = bfqd->last_position; |
8167 |
++ |
8168 |
++ if (RB_EMPTY_ROOT(root)) |
8169 |
++ return NULL; |
8170 |
++ |
8171 |
++ /* |
8172 |
++ * First, if we find a request starting at the end of the last |
8173 |
++ * request, choose it. |
8174 |
++ */ |
8175 |
++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); |
8176 |
++ if (__bfqq != NULL) |
8177 |
++ return __bfqq; |
8178 |
++ |
8179 |
++ /* |
8180 |
++ * If the exact sector wasn't found, the parent of the NULL leaf |
8181 |
++ * will contain the closest sector (rq_pos_tree sorted by next_request |
8182 |
++ * position). |
8183 |
++ */ |
8184 |
++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
8185 |
++ if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
8186 |
++ return __bfqq; |
8187 |
++ |
8188 |
++ if (blk_rq_pos(__bfqq->next_rq) < sector) |
8189 |
++ node = rb_next(&__bfqq->pos_node); |
8190 |
++ else |
8191 |
++ node = rb_prev(&__bfqq->pos_node); |
8192 |
++ if (node == NULL) |
8193 |
++ return NULL; |
8194 |
++ |
8195 |
++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); |
8196 |
++ if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
8197 |
++ return __bfqq; |
8198 |
++ |
8199 |
++ return NULL; |
8200 |
++} |
8201 |
++ |
8202 |
++/* |
8203 |
++ * bfqd - obvious |
8204 |
++ * cur_bfqq - passed in so that we don't decide that the current queue |
8205 |
++ * is closely cooperating with itself. |
8206 |
++ * |
8207 |
++ * We are assuming that cur_bfqq has dispatched at least one request, |
8208 |
++ * and that bfqd->last_position reflects a position on the disk associated |
8209 |
++ * with the I/O issued by cur_bfqq. |
8210 |
++ */ |
8211 |
++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
8212 |
++ struct bfq_queue *cur_bfqq) |
8213 |
++{ |
8214 |
++ struct bfq_queue *bfqq; |
8215 |
++ |
8216 |
++ if (bfq_class_idle(cur_bfqq)) |
8217 |
++ return NULL; |
8218 |
++ if (!bfq_bfqq_sync(cur_bfqq)) |
8219 |
++ return NULL; |
8220 |
++ if (BFQQ_SEEKY(cur_bfqq)) |
8221 |
++ return NULL; |
8222 |
++ |
8223 |
++ /* If device has only one backlogged bfq_queue, don't search. */ |
8224 |
++ if (bfqd->busy_queues == 1) |
8225 |
++ return NULL; |
8226 |
++ |
8227 |
++ /* |
8228 |
++ * We should notice if some of the queues are cooperating, e.g. |
8229 |
++ * working closely on the same area of the disk. In that case, |
8230 |
++ * we can group them together and don't waste time idling. |
8231 |
++ */ |
8232 |
++ bfqq = bfqq_close(bfqd); |
8233 |
++ if (bfqq == NULL || bfqq == cur_bfqq) |
8234 |
++ return NULL; |
8235 |
++ |
8236 |
++ /* |
8237 |
++ * Do not merge queues from different bfq_groups. |
8238 |
++ */ |
8239 |
++ if (bfqq->entity.parent != cur_bfqq->entity.parent) |
8240 |
++ return NULL; |
8241 |
++ |
8242 |
++ /* |
8243 |
++ * It only makes sense to merge sync queues. |
8244 |
++ */ |
8245 |
++ if (!bfq_bfqq_sync(bfqq)) |
8246 |
++ return NULL; |
8247 |
++ if (BFQQ_SEEKY(bfqq)) |
8248 |
++ return NULL; |
8249 |
++ |
8250 |
++ /* |
8251 |
++ * Do not merge queues of different priority classes. |
8252 |
++ */ |
8253 |
++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) |
8254 |
++ return NULL; |
8255 |
++ |
8256 |
++ return bfqq; |
8257 |
++} |
8258 |
++ |
8259 |
++/* |
8260 |
++ * If enough samples have been computed, return the current max budget |
8261 |
++ * stored in bfqd, which is dynamically updated according to the |
8262 |
++ * estimated disk peak rate; otherwise return the default max budget |
8263 |
++ */ |
8264 |
++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) |
8265 |
++{ |
8266 |
++ if (bfqd->budgets_assigned < 194) |
8267 |
++ return bfq_default_max_budget; |
8268 |
++ else |
8269 |
++ return bfqd->bfq_max_budget; |
8270 |
++} |
8271 |
++ |
8272 |
++/* |
8273 |
++ * Return min budget, which is a fraction of the current or default |
8274 |
++ * max budget (trying with 1/32) |
8275 |
++ */ |
8276 |
++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) |
8277 |
++{ |
8278 |
++ if (bfqd->budgets_assigned < 194) |
8279 |
++ return bfq_default_max_budget / 32; |
8280 |
++ else |
8281 |
++ return bfqd->bfq_max_budget / 32; |
8282 |
++} |
8283 |
++ |
8284 |
++/* |
8285 |
++ * Decides whether idling should be done for given device and |
8286 |
++ * given in-service queue. |
8287 |
++ */ |
8288 |
++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, |
8289 |
++ struct bfq_queue *in_service_bfqq) |
8290 |
++{ |
8291 |
++ if (in_service_bfqq == NULL) |
8292 |
++ return false; |
8293 |
++ /* |
8294 |
++ * If the device is non-rotational, and hence has no seek penalty, |
8295 |
++ * disable idling; but do so only if: |
8296 |
++	 * - device supports queuing, otherwise we still have |
8297 |
++ * a problem with sync vs async workloads; |
8298 |
++ * - the queue is not weight-raised, to preserve guarantees. |
8299 |
++ */ |
8300 |
++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && |
8301 |
++ in_service_bfqq->raising_coeff == 1); |
8302 |
++} |
8303 |
++ |
8304 |
++static void bfq_arm_slice_timer(struct bfq_data *bfqd) |
8305 |
++{ |
8306 |
++ struct bfq_queue *bfqq = bfqd->in_service_queue; |
8307 |
++ struct bfq_io_cq *bic; |
8308 |
++ unsigned long sl; |
8309 |
++ |
8310 |
++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
8311 |
++ |
8312 |
++ /* Tasks have exited, don't wait. */ |
8313 |
++ bic = bfqd->in_service_bic; |
8314 |
++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) |
8315 |
++ return; |
8316 |
++ |
8317 |
++ bfq_mark_bfqq_wait_request(bfqq); |
8318 |
++ |
8319 |
++ /* |
8320 |
++ * We don't want to idle for seeks, but we do want to allow |
8321 |
++ * fair distribution of slice time for a process doing back-to-back |
8322 |
++	 * seeks. So allow a little bit of time for it to submit a new rq. |
8323 |
++ * |
8324 |
++ * To prevent processes with (partly) seeky workloads from |
8325 |
++ * being too ill-treated, grant them a small fraction of the |
8326 |
++ * assigned budget before reducing the waiting time to |
8327 |
++ * BFQ_MIN_TT. This happened to help reduce latency. |
8328 |
++ */ |
8329 |
++ sl = bfqd->bfq_slice_idle; |
8330 |
++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && |
8331 |
++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 && |
8332 |
++ bfqq->raising_coeff == 1) |
8333 |
++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); |
8334 |
++ else if (bfqq->raising_coeff > 1) |
8335 |
++ sl = sl * 3; |
8336 |
++ bfqd->last_idling_start = ktime_get(); |
8337 |
++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); |
8338 |
++ bfq_log(bfqd, "arm idle: %u/%u ms", |
8339 |
++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); |
8340 |
++} |
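Plugging in the defaults above: bfq_slice_idle is HZ/125 jiffies, i.e. about 8 ms, a weight-raised queue idles for three times that (about 24 ms), and a seeky non-raised queue that has already been served more than an eighth of the maximum budget only gets BFQ_MIN_TT = 2 ms. A short sketch of the selection, with durations in milliseconds so HZ drops out:

#include <stdio.h>

int main(void)
{
	unsigned int slice_idle_ms = 1000 / 125;	/* bfq_slice_idle: 8 ms */
	unsigned int min_tt_ms = 2;			/* BFQ_MIN_TT           */

	printf("default idle window:            %u ms\n", slice_idle_ms);
	printf("weight-raised queue:            %u ms\n", slice_idle_ms * 3);
	printf("seeky, past 1/8 of max budget:  %u ms\n",
	       slice_idle_ms < min_tt_ms ? slice_idle_ms : min_tt_ms);
	return 0;
}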
8341 |
++ |
8342 |
++/* |
8343 |
++ * Set the maximum time for the in-service queue to consume its |
8344 |
++ * budget. This prevents seeky processes from lowering the disk |
8345 |
++ * throughput (always guaranteed with a time slice scheme as in CFQ). |
8346 |
++ */ |
8347 |
++static void bfq_set_budget_timeout(struct bfq_data *bfqd) |
8348 |
++{ |
8349 |
++ struct bfq_queue *bfqq = bfqd->in_service_queue; |
8350 |
++ unsigned int timeout_coeff; |
8351 |
++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) |
8352 |
++ timeout_coeff = 1; |
8353 |
++ else |
8354 |
++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; |
8355 |
++ |
8356 |
++ bfqd->last_budget_start = ktime_get(); |
8357 |
++ |
8358 |
++ bfq_clear_bfqq_budget_new(bfqq); |
8359 |
++ bfqq->budget_timeout = jiffies + |
8360 |
++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; |
8361 |
++ |
8362 |
++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", |
8363 |
++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * |
8364 |
++ timeout_coeff)); |
8365 |
++} |
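As an example of the coefficient above: bfq_timeout_sync is HZ/8 jiffies, i.e. 125 ms, and a sync queue whose weight is currently raised to, say, ten times its original weight (the raising coefficient itself is a tunable that does not appear in this hunk) may keep the device for up to 1250 ms before the budget timeout fires, unless it is in its soft real-time raising period, in which case the coefficient stays 1:

#include <stdio.h>

int main(void)
{
	unsigned int timeout_sync_ms = 1000 / 8;	/* bfq_timeout_sync: 125 ms   */
	unsigned int weight = 1000, orig_weight = 100;	/* hypothetical raised weight */
	unsigned int timeout_coeff = weight / orig_weight;

	printf("budget timeout: %u ms\n",
	       timeout_sync_ms * timeout_coeff);	/* 1250 */
	return 0;
}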
8366 |
++ |
8367 |
++/* |
8368 |
++ * Move request from internal lists to the request queue dispatch list. |
8369 |
++ */ |
8370 |
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) |
8371 |
++{ |
8372 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
8373 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
8374 |
++ |
8375 |
++ bfq_remove_request(rq); |
8376 |
++ bfqq->dispatched++; |
8377 |
++ elv_dispatch_sort(q, rq); |
8378 |
++ |
8379 |
++ if (bfq_bfqq_sync(bfqq)) |
8380 |
++ bfqd->sync_flight++; |
8381 |
++} |
8382 |
++ |
8383 |
++/* |
8384 |
++ * Return expired entry, or NULL to just start from scratch in rbtree. |
8385 |
++ */ |
8386 |
++static struct request *bfq_check_fifo(struct bfq_queue *bfqq) |
8387 |
++{ |
8388 |
++ struct request *rq = NULL; |
8389 |
++ |
8390 |
++ if (bfq_bfqq_fifo_expire(bfqq)) |
8391 |
++ return NULL; |
8392 |
++ |
8393 |
++ bfq_mark_bfqq_fifo_expire(bfqq); |
8394 |
++ |
8395 |
++ if (list_empty(&bfqq->fifo)) |
8396 |
++ return NULL; |
8397 |
++ |
8398 |
++ rq = rq_entry_fifo(bfqq->fifo.next); |
8399 |
++ |
8400 |
++ if (time_before(jiffies, rq_fifo_time(rq))) |
8401 |
++ return NULL; |
8402 |
++ |
8403 |
++ return rq; |
8404 |
++} |
8405 |
++ |
8406 |
++/* |
8407 |
++ * Must be called with the queue_lock held. |
8408 |
++ */ |
8409 |
++static int bfqq_process_refs(struct bfq_queue *bfqq) |
8410 |
++{ |
8411 |
++ int process_refs, io_refs; |
8412 |
++ |
8413 |
++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
8414 |
++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
8415 |
++ BUG_ON(process_refs < 0); |
8416 |
++ return process_refs; |
8417 |
++} |
8418 |
++ |
8419 |
++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
8420 |
++{ |
8421 |
++ int process_refs, new_process_refs; |
8422 |
++ struct bfq_queue *__bfqq; |
8423 |
++ |
8424 |
++ /* |
8425 |
++ * If there are no process references on the new_bfqq, then it is |
8426 |
++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
8427 |
++ * may have dropped their last reference (not just their last process |
8428 |
++ * reference). |
8429 |
++ */ |
8430 |
++ if (!bfqq_process_refs(new_bfqq)) |
8431 |
++ return; |
8432 |
++ |
8433 |
++ /* Avoid a circular list and skip interim queue merges. */ |
8434 |
++ while ((__bfqq = new_bfqq->new_bfqq)) { |
8435 |
++ if (__bfqq == bfqq) |
8436 |
++ return; |
8437 |
++ new_bfqq = __bfqq; |
8438 |
++ } |
8439 |
++ |
8440 |
++ process_refs = bfqq_process_refs(bfqq); |
8441 |
++ new_process_refs = bfqq_process_refs(new_bfqq); |
8442 |
++ /* |
8443 |
++ * If the process for the bfqq has gone away, there is no |
8444 |
++ * sense in merging the queues. |
8445 |
++ */ |
8446 |
++ if (process_refs == 0 || new_process_refs == 0) |
8447 |
++ return; |
8448 |
++ |
8449 |
++ /* |
8450 |
++ * Merge in the direction of the lesser amount of work. |
8451 |
++ */ |
8452 |
++ if (new_process_refs >= process_refs) { |
8453 |
++ bfqq->new_bfqq = new_bfqq; |
8454 |
++ atomic_add(process_refs, &new_bfqq->ref); |
8455 |
++ } else { |
8456 |
++ new_bfqq->new_bfqq = bfqq; |
8457 |
++ atomic_add(new_process_refs, &bfqq->ref); |
8458 |
++ } |
8459 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
8460 |
++ new_bfqq->pid); |
8461 |
++} |
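A worked example of the bookkeeping above, with made-up counters: a queue with ref = 6, three allocated requests and on_st = 1 has 6 - 3 - 1 = 2 process references; if the close cooperator found for it has 5, the merge is scheduled in that direction (bfqq->new_bfqq points at the cooperator) and the 2 process references are added to the cooperator's refcount:

#include <stdio.h>

int main(void)
{
	/* hypothetical counters, mirroring bfqq_process_refs() */
	int ref = 6, allocated_read = 2, allocated_write = 1, on_st = 1;
	int process_refs = ref - (allocated_read + allocated_write) - on_st; /* 2 */
	int new_process_refs = 5;	/* the close cooperator's process refs */

	if (new_process_refs >= process_refs)
		printf("merge towards cooperator, transfer %d refs\n", process_refs);
	else
		printf("merge towards bfqq, transfer %d refs\n", new_process_refs);
	return 0;
}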
8462 |
++ |
8463 |
++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
8464 |
++{ |
8465 |
++ struct bfq_entity *entity = &bfqq->entity; |
8466 |
++ return entity->budget - entity->service; |
8467 |
++} |
8468 |
++ |
8469 |
++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
8470 |
++{ |
8471 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
8472 |
++ |
8473 |
++ __bfq_bfqd_reset_in_service(bfqd); |
8474 |
++ |
8475 |
++ /* |
8476 |
++ * If this bfqq is shared between multiple processes, check |
8477 |
++ * to make sure that those processes are still issuing I/Os |
8478 |
++ * within the mean seek distance. If not, it may be time to |
8479 |
++ * break the queues apart again. |
8480 |
++ */ |
8481 |
++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) |
8482 |
++ bfq_mark_bfqq_split_coop(bfqq); |
8483 |
++ |
8484 |
++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
8485 |
++ /* |
8486 |
++ * overloading budget_timeout field to store when |
8487 |
++ * the queue remains with no backlog, used by |
8488 |
++ * the weight-raising mechanism |
8489 |
++ */ |
8490 |
++ bfqq->budget_timeout = jiffies; |
8491 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
8492 |
++ } else { |
8493 |
++ bfq_activate_bfqq(bfqd, bfqq); |
8494 |
++ /* |
8495 |
++ * Resort priority tree of potential close cooperators. |
8496 |
++ */ |
8497 |
++ bfq_rq_pos_tree_add(bfqd, bfqq); |
8498 |
++ } |
8499 |
++} |
8500 |
++ |
8501 |
++/** |
8502 |
++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. |
8503 |
++ * @bfqd: device data. |
8504 |
++ * @bfqq: queue to update. |
8505 |
++ * @reason: reason for expiration. |
8506 |
++ * |
8507 |
++ * Handle the feedback on @bfqq budget. See the body for detailed |
8508 |
++ * comments. |
8509 |
++ */ |
8510 |
++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
8511 |
++ struct bfq_queue *bfqq, |
8512 |
++ enum bfqq_expiration reason) |
8513 |
++{ |
8514 |
++ struct request *next_rq; |
8515 |
++ unsigned long budget, min_budget; |
8516 |
++ |
8517 |
++ budget = bfqq->max_budget; |
8518 |
++ min_budget = bfq_min_budget(bfqd); |
8519 |
++ |
8520 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
8521 |
++ |
8522 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", |
8523 |
++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); |
8524 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", |
8525 |
++ budget, bfq_min_budget(bfqd)); |
8526 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", |
8527 |
++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); |
8528 |
++ |
8529 |
++ if (bfq_bfqq_sync(bfqq)) { |
8530 |
++ switch (reason) { |
8531 |
++ /* |
8532 |
++ * Caveat: in all the following cases we trade latency |
8533 |
++ * for throughput. |
8534 |
++ */ |
8535 |
++ case BFQ_BFQQ_TOO_IDLE: |
8536 |
++ /* |
8537 |
++ * This is the only case where we may reduce |
8538 |
++			 * the budget: if there are no requests of the |
8539 |
++ * process still waiting for completion, then |
8540 |
++ * we assume (tentatively) that the timer has |
8541 |
++ * expired because the batch of requests of |
8542 |
++ * the process could have been served with a |
8543 |
++ * smaller budget. Hence, betting that |
8544 |
++ * process will behave in the same way when it |
8545 |
++ * becomes backlogged again, we reduce its |
8546 |
++ * next budget. As long as we guess right, |
8547 |
++ * this budget cut reduces the latency |
8548 |
++ * experienced by the process. |
8549 |
++ * |
8550 |
++ * However, if there are still outstanding |
8551 |
++ * requests, then the process may have not yet |
8552 |
++ * issued its next request just because it is |
8553 |
++ * still waiting for the completion of some of |
8554 |
++			 * the still outstanding ones. So in this |
8555 |
++ * subcase we do not reduce its budget, on the |
8556 |
++ * contrary we increase it to possibly boost |
8557 |
++ * the throughput, as discussed in the |
8558 |
++ * comments to the BUDGET_TIMEOUT case. |
8559 |
++ */ |
8560 |
++			if (bfqq->dispatched > 0) /* still outstanding reqs */ |
8561 |
++ budget = min(budget * 2, bfqd->bfq_max_budget); |
8562 |
++ else { |
8563 |
++ if (budget > 5 * min_budget) |
8564 |
++ budget -= 4 * min_budget; |
8565 |
++ else |
8566 |
++ budget = min_budget; |
8567 |
++ } |
8568 |
++ break; |
8569 |
++ case BFQ_BFQQ_BUDGET_TIMEOUT: |
8570 |
++ /* |
8571 |
++ * We double the budget here because: 1) it |
8572 |
++ * gives the chance to boost the throughput if |
8573 |
++ * this is not a seeky process (which may have |
8574 |
++ * bumped into this timeout because of, e.g., |
8575 |
++ * ZBR), 2) together with charge_full_budget |
8576 |
++ * it helps give seeky processes higher |
8577 |
++ * timestamps, and hence be served less |
8578 |
++ * frequently. |
8579 |
++ */ |
8580 |
++ budget = min(budget * 2, bfqd->bfq_max_budget); |
8581 |
++ break; |
8582 |
++ case BFQ_BFQQ_BUDGET_EXHAUSTED: |
8583 |
++ /* |
8584 |
++ * The process still has backlog, and did not |
8585 |
++ * let either the budget timeout or the disk |
8586 |
++ * idling timeout expire. Hence it is not |
8587 |
++ * seeky, has a short thinktime and may be |
8588 |
++ * happy with a higher budget too. So |
8589 |
++ * definitely increase the budget of this good |
8590 |
++ * candidate to boost the disk throughput. |
8591 |
++ */ |
8592 |
++ budget = min(budget * 4, bfqd->bfq_max_budget); |
8593 |
++ break; |
8594 |
++ case BFQ_BFQQ_NO_MORE_REQUESTS: |
8595 |
++ /* |
8596 |
++ * Leave the budget unchanged. |
8597 |
++ */ |
8598 |
++ default: |
8599 |
++ return; |
8600 |
++ } |
8601 |
++ } else /* async queue */ |
8602 |
++ /* async queues always get the maximum possible budget |
8603 |
++ * (their ability to dispatch is limited by |
8604 |
++ * @bfqd->bfq_max_budget_async_rq). |
8605 |
++ */ |
8606 |
++ budget = bfqd->bfq_max_budget; |
8607 |
++ |
8608 |
++ bfqq->max_budget = budget; |
8609 |
++ |
8610 |
++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && |
8611 |
++ bfqq->max_budget > bfqd->bfq_max_budget) |
8612 |
++ bfqq->max_budget = bfqd->bfq_max_budget; |
8613 |
++ |
8614 |
++ /* |
8615 |
++ * Make sure that we have enough budget for the next request. |
8616 |
++ * Since the finish time of the bfqq must be kept in sync with |
8617 |
++ * the budget, be sure to call __bfq_bfqq_expire() after the |
8618 |
++ * update. |
8619 |
++ */ |
8620 |
++ next_rq = bfqq->next_rq; |
8621 |
++ if (next_rq != NULL) |
8622 |
++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, |
8623 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
8624 |
++ else |
8625 |
++ bfqq->entity.budget = bfqq->max_budget; |
8626 |
++ |
8627 |
++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", |
8628 |
++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0, |
8629 |
++ bfqq->entity.budget); |
8630 |
++} |
8631 |
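
As a rough guide to the switch above, the budget-update rules reduce to a few lines of arithmetic: shrink by four minimum budgets (or fall back to the minimum) on an idle timeout with no outstanding requests, double on a budget timeout or when requests are still in flight, quadruple on exhaustion, and cap everything at the device-wide maximum. The userspace sketch below is illustrative only; the enum and helper names are made up for the example, and only the thresholds are taken from the code:

/* Illustrative userspace sketch of the budget-update rules above. */
#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static unsigned long next_budget(unsigned long budget, unsigned long min_budget,
                                 unsigned long max_budget, int outstanding,
                                 enum reason r)
{
        switch (r) {
        case TOO_IDLE:
                if (outstanding)        /* still waiting for completions */
                        return budget * 2 > max_budget ? max_budget : budget * 2;
                return budget > 5 * min_budget ? budget - 4 * min_budget
                                               : min_budget;
        case BUDGET_TIMEOUT:
                return budget * 2 > max_budget ? max_budget : budget * 2;
        case BUDGET_EXHAUSTED:
                return budget * 4 > max_budget ? max_budget : budget * 4;
        default:                        /* NO_MORE_REQUESTS: leave unchanged */
                return budget;
        }
}

int main(void)
{
        printf("%lu\n", next_budget(8192, 1024, 65536, 0, TOO_IDLE));          /* 4096 */
        printf("%lu\n", next_budget(8192, 1024, 65536, 0, BUDGET_EXHAUSTED));  /* 32768 */
        return 0;
}
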
++ |
8632 |
++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) |
8633 |
++{ |
8634 |
++ unsigned long max_budget; |
8635 |
++ |
8636 |
++ /* |
8637 |
++ * The max_budget calculated when autotuning is equal to the |
8638 |
++ * amount of sectors transferred in timeout_sync at the |
8639 |
++ * estimated peak rate. |
8640 |
++ */ |
8641 |
++ max_budget = (unsigned long)(peak_rate * 1000 * |
8642 |
++ timeout >> BFQ_RATE_SHIFT); |
8643 |
++ |
8644 |
++ return max_budget; |
8645 |
++} |
8646 |
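
A worked example may help here: peak_rate is kept in sectors per microsecond in fixed point (see the comment in bfq_update_peak_rate below), and timeout is in milliseconds, so the product scaled back by BFQ_RATE_SHIFT gives the number of sectors transferable in one timeout period. The sketch below assumes BFQ_RATE_SHIFT is 16, as defined elsewhere in this patch; the sample rate and timeout values are illustrative only:

/* Userspace sketch of the autotuned max_budget computation above. */
#include <stdio.h>
#include <stdint.h>

#define BFQ_RATE_SHIFT 16   /* assumed value, defined elsewhere in the patch */

static unsigned long calc_max_budget(uint64_t peak_rate, uint64_t timeout_ms)
{
        /* sectors transferable in timeout_ms at the estimated peak rate */
        return (unsigned long)((peak_rate * 1000 * timeout_ms) >> BFQ_RATE_SHIFT);
}

int main(void)
{
        /* ~100 MB/s is roughly 0.2 sectors/usec -> 0.2 * 2^16 in fixed point */
        uint64_t peak_rate = (uint64_t)(0.2 * (1 << BFQ_RATE_SHIFT));

        printf("max_budget = %lu sectors\n", calc_max_budget(peak_rate, 125));
        return 0;
}
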
++ |
8647 |
++/* |
8648 |
++ * In addition to updating the peak rate, checks whether the process |
8649 |
++ * is "slow", and returns 1 if so. This slow flag is used, in addition |
8650 |
++ * to the budget timeout, to reduce the amount of service provided to |
8651 |
++ * seeky processes, and hence reduce their chances to lower the |
8652 |
++ * throughput. See the code for more details. |
8653 |
++ */ |
8654 |
++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
8655 |
++ int compensate, enum bfqq_expiration reason) |
8656 |
++{ |
8657 |
++ u64 bw, usecs, expected, timeout; |
8658 |
++ ktime_t delta; |
8659 |
++ int update = 0; |
8660 |
++ |
8661 |
++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) |
8662 |
++ return 0; |
8663 |
++ |
8664 |
++ if (compensate) |
8665 |
++ delta = bfqd->last_idling_start; |
8666 |
++ else |
8667 |
++ delta = ktime_get(); |
8668 |
++ delta = ktime_sub(delta, bfqd->last_budget_start); |
8669 |
++ usecs = ktime_to_us(delta); |
8670 |
++ |
8671 |
++ /* Don't trust short/unrealistic values. */ |
8672 |
++ if (usecs < 100 || usecs >= LONG_MAX) |
8673 |
++ return 0; |
8674 |
++ |
8675 |
++ /* |
8676 |
++ * Calculate the bandwidth for the last slice. We use a 64 bit |
8677 |
++ * value to store the peak rate, in sectors per usec in fixed |
8678 |
++ * point math. We do so to have enough precision in the estimate |
8679 |
++ * and to avoid overflows. |
8680 |
++ */ |
8681 |
++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; |
8682 |
++ do_div(bw, (unsigned long)usecs); |
8683 |
++ |
8684 |
++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
8685 |
++ |
8686 |
++ /* |
8687 |
++ * Use only long (> 20ms) intervals to filter out spikes for |
8688 |
++ * the peak rate estimation. |
8689 |
++ */ |
8690 |
++ if (usecs > 20000) { |
8691 |
++ if (bw > bfqd->peak_rate || |
8692 |
++ (!BFQQ_SEEKY(bfqq) && |
8693 |
++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { |
8694 |
++ bfq_log(bfqd, "measured bw =%llu", bw); |
8695 |
++ /* |
8696 |
++ * To smooth oscillations use a low-pass filter with |
8697 |
++ * alpha=7/8, i.e., |
8698 |
++ * new_rate = (7/8) * old_rate + (1/8) * bw |
8699 |
++ */ |
8700 |
++ do_div(bw, 8); |
8701 |
++ if (bw == 0) |
8702 |
++ return 0; |
8703 |
++ bfqd->peak_rate *= 7; |
8704 |
++ do_div(bfqd->peak_rate, 8); |
8705 |
++ bfqd->peak_rate += bw; |
8706 |
++ update = 1; |
8707 |
++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); |
8708 |
++ } |
8709 |
++ |
8710 |
++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; |
8711 |
++ |
8712 |
++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) |
8713 |
++ bfqd->peak_rate_samples++; |
8714 |
++ |
8715 |
++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && |
8716 |
++ update && bfqd->bfq_user_max_budget == 0) { |
8717 |
++ bfqd->bfq_max_budget = |
8718 |
++ bfq_calc_max_budget(bfqd->peak_rate, timeout); |
8719 |
++ bfq_log(bfqd, "new max_budget=%lu", |
8720 |
++ bfqd->bfq_max_budget); |
8721 |
++ } |
8722 |
++ } |
8723 |
++ |
8724 |
++ /* |
8725 |
++ * If the process has been served for too short a time |
8726 |
++ * interval to let its possible sequential accesses prevail over |
8727 |
++ * the initial seek time needed to move the disk head on the |
8728 |
++ * first sector it requested, then give the process a chance |
8729 |
++ * and for the moment return false. |
8730 |
++ */ |
8731 |
++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) |
8732 |
++ return 0; |
8733 |
++ |
8734 |
++ /* |
8735 |
++ * A process is considered ``slow'' (i.e., seeky, so that we |
8736 |
++ * cannot treat it fairly in the service domain, as it would |
8737 |
++ * slow down too much the other processes) if, when a slice |
8738 |
++ * ends for whatever reason, it has received service at a |
8739 |
++ * rate that would not be high enough to complete the budget |
8740 |
++ * before the budget timeout expiration. |
8741 |
++ */ |
8742 |
++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; |
8743 |
++ |
8744 |
++ /* |
8745 |
++ * Caveat: processes doing IO in the slower disk zones will |
8746 |
++ * tend to be slow(er) even if not seeky. And the estimated |
8747 |
++ * peak rate will actually be an average over the disk |
8748 |
++ * surface. Hence, to not be too harsh with unlucky processes, |
8749 |
++ * we keep a budget/3 margin of safety before declaring a |
8750 |
++ * process slow. |
8751 |
++ */ |
8752 |
++ return expected > (4 * bfqq->entity.budget) / 3; |
8753 |
++} |
8754 |
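
The peak-rate tracking above is a plain exponentially weighted average: for samples taken over intervals longer than 20 ms, new_rate = (7/8) * old_rate + (1/8) * measured_bw. A minimal userspace sketch of just that filter (the sample values are made up for illustration):

/* Userspace sketch of the peak-rate low-pass filter used above. */
#include <stdio.h>
#include <stdint.h>

static uint64_t update_peak_rate(uint64_t peak_rate, uint64_t bw)
{
        /* new_rate = (7/8) * old_rate + (1/8) * bw */
        return (peak_rate * 7) / 8 + bw / 8;
}

int main(void)
{
        uint64_t rate = 10000;                      /* fixed-point sectors/usec */
        uint64_t samples[] = { 12000, 11000, 13000 };

        for (int i = 0; i < 3; i++) {
                rate = update_peak_rate(rate, samples[i]);
                printf("sample %d: rate = %llu\n", i, (unsigned long long)rate);
        }
        return 0;
}
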
++ |
8755 |
++/* |
8756 |
++ * To be deemed as soft real-time, an application must meet two requirements. |
8757 |
++ * The first is that the application must not require an average bandwidth |
8758 |
++ * higher than the approximate bandwidth required to play back or record a |
8759 |
++ * compressed high-definition video. |
8760 |
++ * The next function is invoked on the completion of the last request of a |
8761 |
++ * batch, to compute the next-start time instant, soft_rt_next_start, such |
8762 |
++ * that, if the next request of the application does not arrive before |
8763 |
++ * soft_rt_next_start, then the above requirement on the bandwidth is met. |
8764 |
++ * |
8765 |
++ * The second requirement is that the request pattern of the application is |
8766 |
++ * isochronous, i.e., that, after issuing a request or a batch of requests, the |
8767 |
++ * application stops for a while, then issues a new batch, and so on. For this |
8768 |
++ * reason the next function is invoked to compute soft_rt_next_start only for |
8769 |
++ * applications that meet this requirement, whereas soft_rt_next_start is set |
8770 |
++ * to infinity for applications that do not. |
8771 |
++ * |
8772 |
++ * Unfortunately, even a greedy application may happen to behave in an |
8773 |
++ * isochronous way if several processes are competing for the CPUs. In fact, |
8774 |
++ * in this scenario the application stops issuing requests while the CPUs are |
8775 |
++ * busy serving other processes, then restarts, then stops again for a while, |
8776 |
++ * and so on. In addition, if the disk achieves a low enough throughput with |
8777 |
++ * the request pattern issued by the application (e.g., because the request |
8778 |
++ * pattern is random and/or the device is slow), then the above bandwidth |
8779 |
++ * requirement may happen to be met too. To prevent such a greedy application |
8780 |
++ * from being deemed as soft real-time, a further rule is used in the computation |
8781 |
++ * of soft_rt_next_start: soft_rt_next_start must be higher than the current |
8782 |
++ * time plus the maximum time for which the arrival of a request is waited |
8783 |
++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This |
8784 |
++ * filters out greedy applications, as the latter issue instead their next |
8785 |
++ * request as soon as possible after the last one has been completed (in |
8786 |
++ * contrast, when a batch of requests is completed, a soft real-time |
8787 |
++ * application spends some time processing data). |
8788 |
++ * |
8789 |
++ * Actually, the last filter may easily generate false positives if: only |
8790 |
++ * bfqd->bfq_slice_idle is used as a reference time interval, and one or |
8791 |
++ * both the following two cases occur: |
8792 |
++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher |
8793 |
++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with |
8794 |
++ * HZ=100. |
8795 |
++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing |
8796 |
++ * for a while, then suddenly 'jump' by several units to recover the lost |
8797 |
++ * increments. This seems to happen, e.g., inside virtual machines. |
8798 |
++ * To address this issue, we do not use as a reference time interval just |
8799 |
++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In |
8800 |
++ * particular we add the minimum number of jiffies for which the filter seems |
8801 |
++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines. |
8802 |
++ */ |
8803 |
++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, |
8804 |
++ struct bfq_queue *bfqq) |
8805 |
++{ |
8806 |
++ return max(bfqq->last_idle_bklogged + |
8807 |
++ HZ * bfqq->service_from_backlogged / |
8808 |
++ bfqd->bfq_raising_max_softrt_rate, |
8809 |
++ jiffies + bfqq->bfqd->bfq_slice_idle + 4); |
8810 |
++} |
8811 |
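
In other words, the returned instant is the later of two bounds: the time at which serving the next batch would still respect the allowed soft real-time bandwidth, and the current time plus the idling window (plus the few extra jiffies discussed above). A userspace sketch, with HZ and all sample values chosen only for illustration:

/* Userspace sketch of the soft real-time next-start computation above. */
#include <stdio.h>

#define HZ 250   /* illustrative tick rate */

static unsigned long softrt_next_start(unsigned long last_idle_bklogged,
                                       unsigned long service_from_backlogged,
                                       unsigned long max_softrt_rate,
                                       unsigned long now, unsigned long slice_idle)
{
        /* instant at which the allowed soft-rt bandwidth would be matched */
        unsigned long bw_limit = last_idle_bklogged +
                                 HZ * service_from_backlogged / max_softrt_rate;
        /* greedy-application filter: never earlier than now + slice_idle + 4 */
        unsigned long greedy_filter = now + slice_idle + 4;

        return bw_limit > greedy_filter ? bw_limit : greedy_filter;
}

int main(void)
{
        /* 2048 sectors served since last backlogged, limit 7000 sectors/s */
        printf("%lu\n", softrt_next_start(100000, 2048, 7000, 100050, 2));
        return 0;
}
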
++ |
8812 |
++/* |
8813 |
++ * Largest-possible time instant such that, for as long as possible, the |
8814 |
++ * current time will be lower than this time instant according to the macro |
8815 |
++ * time_is_before_jiffies(). |
8816 |
++ */ |
8817 |
++static inline unsigned long bfq_infinity_from_now(unsigned long now) |
8818 |
++{ |
8819 |
++ return now + ULONG_MAX / 2; |
8820 |
++} |
8821 |
++ |
8822 |
++/** |
8823 |
++ * bfq_bfqq_expire - expire a queue. |
8824 |
++ * @bfqd: device owning the queue. |
8825 |
++ * @bfqq: the queue to expire. |
8826 |
++ * @compensate: if true, compensate for the time spent idling. |
8827 |
++ * @reason: the reason causing the expiration. |
8828 |
++ * |
8829 |
++ * |
8830 |
++ * If the process associated to the queue is slow (i.e., seeky), or in |
8831 |
++ * case of budget timeout, or, finally, if it is async, we |
8832 |
++ * artificially charge it an entire budget (independently of the |
8833 |
++ * actual service it received). As a consequence, the queue will get |
8834 |
++ * higher timestamps than the correct ones upon reactivation, and |
8835 |
++ * hence it will be rescheduled as if it had received more service |
8836 |
++ * than what it actually received. In the end, this class of processes |
8837 |
++ * will receive less service in proportion to how slowly they consume |
8838 |
++ * their budgets (and hence how seriously they tend to lower the |
8839 |
++ * throughput). |
8840 |
++ * |
8841 |
++ * In contrast, when a queue expires because it has been idling for |
8842 |
++ * too long or because it exhausted its budget, we do not touch the |
8843 |
++ * amount of service it has received. Hence when the queue will be |
8844 |
++ * reactivated and its timestamps updated, the latter will be in sync |
8845 |
++ * with the actual service received by the queue until expiration. |
8846 |
++ * |
8847 |
++ * Charging a full budget to the first type of queues and the exact |
8848 |
++ * service to the others has the effect of using the WF2Q+ policy to |
8849 |
++ * schedule the former on a timeslice basis, without violating the |
8850 |
++ * service domain guarantees of the latter. |
8851 |
++ */ |
8852 |
++static void bfq_bfqq_expire(struct bfq_data *bfqd, |
8853 |
++ struct bfq_queue *bfqq, |
8854 |
++ int compensate, |
8855 |
++ enum bfqq_expiration reason) |
8856 |
++{ |
8857 |
++ int slow; |
8858 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
8859 |
++ |
8860 |
++ /* Update disk peak rate for autotuning and check whether the |
8861 |
++ * process is slow (see bfq_update_peak_rate). |
8862 |
++ */ |
8863 |
++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); |
8864 |
++ |
8865 |
++ /* |
8866 |
++ * As explained above, 'punish' slow (i.e., seeky), timed-out |
8867 |
++ * and async queues, to favor sequential sync workloads. |
8868 |
++ * |
8869 |
++ * Processes doing IO in the slower disk zones will tend to be |
8870 |
++ * slow(er) even if not seeky. Hence, since the estimated peak |
8871 |
++ * rate is actually an average over the disk surface, these |
8872 |
++ * processes may timeout just for bad luck. To avoid punishing |
8873 |
++ * them we do not charge a full budget to a process that |
8874 |
++ * succeeded in consuming at least 2/3 of its budget. |
8875 |
++ */ |
8876 |
++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && |
8877 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) |
8878 |
++ bfq_bfqq_charge_full_budget(bfqq); |
8879 |
++ |
8880 |
++ bfqq->service_from_backlogged += bfqq->entity.service; |
8881 |
++ |
8882 |
++ if (bfqd->low_latency && bfqq->raising_coeff == 1) |
8883 |
++ bfqq->last_rais_start_finish = jiffies; |
8884 |
++ |
8885 |
++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 && |
8886 |
++ RB_EMPTY_ROOT(&bfqq->sort_list)) { |
8887 |
++ /* |
8888 |
++ * If we get here, then the request pattern is |
8889 |
++ * isochronous (see the comments to the function |
8890 |
++ * bfq_bfqq_softrt_next_start()). However, if the |
8891 |
++ * queue still has in-flight requests, then it is |
8892 |
++ * better to postpone the computation of next_start |
8893 |
++ * to the next request completion. In fact, if we |
8894 |
++ * computed it now, then the application might pass |
8895 |
++ * the greedy-application filter improperly, because |
8896 |
++ * the arrival of its next request may happen to be |
8897 |
++ * higher than (jiffies + bfqq->bfqd->bfq_slice_idle) |
8898 |
++ * not because the application is truly soft real- |
8899 |
++ * time, but just because the application is currently |
8900 |
++ * waiting for the completion of some request before |
8901 |
++ * issuing, as quickly as possible, its next request. |
8902 |
++ */ |
8903 |
++ if (bfqq->dispatched > 0) { |
8904 |
++ /* |
8905 |
++ * The application is still waiting for the |
8906 |
++ * completion of one or more requests: |
8907 |
++ * prevent it from possibly being incorrectly |
8908 |
++ * deemed as soft real-time by setting its |
8909 |
++ * soft_rt_next_start to infinity. In fact, |
8910 |
++ * without this assignment, the application |
8911 |
++ * would be incorrectly deemed as soft |
8912 |
++ * real-time if: |
8913 |
++ * 1) it issued a new request before the |
8914 |
++ * completion of all its in-flight |
8915 |
++ * requests, and |
8916 |
++ * 2) at that time, its soft_rt_next_start |
8917 |
++ * happened to be in the past. |
8918 |
++ */ |
8919 |
++ bfqq->soft_rt_next_start = |
8920 |
++ bfq_infinity_from_now(jiffies); |
8921 |
++ bfq_mark_bfqq_softrt_update(bfqq); |
8922 |
++ } else |
8923 |
++ bfqq->soft_rt_next_start = |
8924 |
++ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
8925 |
++ } |
8926 |
++ |
8927 |
++ bfq_log_bfqq(bfqd, bfqq, |
8928 |
++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, |
8929 |
++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); |
8930 |
++ |
8931 |
++ /* Increase, decrease or leave budget unchanged according to reason */ |
8932 |
++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); |
8933 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
8934 |
++} |
8935 |
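
The core of the 'punishment' above is the small predicate that decides whether a full budget is charged: always for slow queues, and on a budget timeout only when a third or more of the budget is still unused (i.e. the queue failed to consume at least 2/3 of it). A userspace sketch of that predicate, with made-up sample numbers:

/* Userspace sketch of the "charge a full budget?" decision above. */
#include <stdio.h>

enum reason { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static int charge_full_budget(int slow, enum reason r,
                              unsigned long budget_left, unsigned long budget)
{
        return slow || (r == BUDGET_TIMEOUT && budget_left >= budget / 3);
}

int main(void)
{
        /* timed out with 5000 of 12000 left: less than 2/3 consumed -> charge */
        printf("%d\n", charge_full_budget(0, BUDGET_TIMEOUT, 5000, 12000)); /* 1 */
        /* timed out with 3000 of 12000 left: more than 2/3 consumed -> spare */
        printf("%d\n", charge_full_budget(0, BUDGET_TIMEOUT, 3000, 12000)); /* 0 */
        return 0;
}
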
++ |
8936 |
++/* |
8937 |
++ * Budget timeout is not implemented through a dedicated timer, but |
8938 |
++ * just checked on request arrivals and completions, as well as on |
8939 |
++ * idle timer expirations. |
8940 |
++ */ |
8941 |
++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) |
8942 |
++{ |
8943 |
++ if (bfq_bfqq_budget_new(bfqq)) |
8944 |
++ return 0; |
8945 |
++ |
8946 |
++ if (time_before(jiffies, bfqq->budget_timeout)) |
8947 |
++ return 0; |
8948 |
++ |
8949 |
++ return 1; |
8950 |
++} |
8951 |
++ |
8952 |
++/* |
8953 |
++ * If we expire a queue that is waiting for the arrival of a new |
8954 |
++ * request, we may prevent the fictitious timestamp backshifting that |
8955 |
++ * allows the guarantees of the queue to be preserved (see [1] for |
8956 |
++ * this tricky aspect). Hence we return true only if this condition |
8957 |
++ * does not hold, or if the queue is slow enough to deserve only to be |
8958 |
++ * kicked off for preserving a high throughput. |
8959 |
++*/ |
8960 |
++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) |
8961 |
++{ |
8962 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
8963 |
++ "may_budget_timeout: wr %d left %d timeout %d", |
8964 |
++ bfq_bfqq_wait_request(bfqq), |
8965 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, |
8966 |
++ bfq_bfqq_budget_timeout(bfqq)); |
8967 |
++ |
8968 |
++ return (!bfq_bfqq_wait_request(bfqq) || |
8969 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) |
8970 |
++ && |
8971 |
++ bfq_bfqq_budget_timeout(bfqq); |
8972 |
++} |
8973 |
++ |
8974 |
++/* |
8975 |
++ * For weight-raised queues issuing sync requests, idling is always performed, |
8976 |
++ * as this is instrumental in guaranteeing a high fraction of the throughput |
8977 |
++ * to these queues, and hence in guaranteeing a lower latency for their |
8978 |
++ * requests. See [1] for details. |
8979 |
++ * |
8980 |
++ * For non-weight-raised queues, idling is instead disabled if the device is |
8981 |
++ * NCQ-enabled and non-rotational, as this boosts the throughput on such |
8982 |
++ * devices. |
8983 |
++ */ |
8984 |
++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) |
8985 |
++{ |
8986 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
8987 |
++ |
8988 |
++ return bfq_bfqq_sync(bfqq) && ( |
8989 |
++ bfqq->raising_coeff > 1 || |
8990 |
++ (bfq_bfqq_idle_window(bfqq) && |
8991 |
++ !(bfqd->hw_tag && |
8992 |
++ (blk_queue_nonrot(bfqd->queue) || |
8993 |
++ /* |
8994 |
++ * If there are weight-raised busy queues, then do not idle |
8995 |
++ * the disk for a sync non-weight-raised queue, and hence |
8996 |
++ * expire the queue immediately if empty. Combined with the |
8997 |
++ * timestamping rules of BFQ (see [1] for details), this |
8998 |
++ * causes sync non-weight-raised queues to get a lower |
8999 |
++ * fraction of the disk throughput, and hence reduces the rate |
9000 |
++ * at which the processes associated with these queues ask for |
9001 |
++ * requests from the request pool. |
9002 |
++ * |
9003 |
++ * This is beneficial for weight-raised processes, when the |
9004 |
++ * system operates in request-pool saturation conditions |
9005 |
++ * (e.g., in the presence of write hogs). In fact, if |
9006 |
++ * non-weight-raised processes ask for requests at a lower |
9007 |
++ * rate, then weight-raised processes have a higher |
9008 |
++ * probability to get a request from the pool immediately |
9009 |
++ * (or at least soon) when they need one. Hence they have a |
9010 |
++ * higher probability to actually get a fraction of the disk |
9011 |
++ * throughput proportional to their high weight. This is |
9012 |
++ * especially true with NCQ-enabled drives, which enqueue |
9013 |
++ * several requests in advance and further reorder |
9014 |
++ * internally-queued requests. |
9015 |
++ * |
9016 |
++ * Mistreating non-weight-raised queues in the above-described |
9017 |
++ * way, when there are busy weight-raised queues, seems to |
9018 |
++ * mitigate starvation problems in the presence of heavy write |
9019 |
++ * workloads and NCQ, and hence to guarantee a higher |
9020 |
++ * application and system responsiveness in these hostile |
9021 |
++ * scenarios. |
9022 |
++ */ |
9023 |
++ bfqd->raised_busy_queues > 0) |
9024 |
++ ) |
9025 |
++ ) |
9026 |
++ ); |
9027 |
++} |
9028 |
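
The condition above can be read as a small boolean function: keep idling only for sync queues, and among those either for weight-raised queues, or for queues with the idle window set as long as the device is not an NCQ drive that is non-rotational or already serving weight-raised traffic. A userspace sketch with plain int flags (all names and values illustrative):

/* Userspace sketch of the "must not expire" (keep idling) decision above. */
#include <stdio.h>

static int must_not_expire(int sync, int raising_coeff, int idle_window,
                           int hw_tag, int nonrot, int raised_busy_queues)
{
        return sync && (raising_coeff > 1 ||
                        (idle_window &&
                         !(hw_tag && (nonrot || raised_busy_queues > 0))));
}

int main(void)
{
        /* sync, not weight-raised, idle window set, NCQ SSD -> do not idle */
        printf("%d\n", must_not_expire(1, 1, 1, 1, 1, 0));   /* 0 */
        /* same queue but weight-raised -> keep idling */
        printf("%d\n", must_not_expire(1, 3, 1, 1, 1, 0));   /* 1 */
        return 0;
}
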
++ |
9029 |
++/* |
9030 |
++ * If the in-service queue is empty, but it is sync and either of the following |
9031 |
++ * conditions holds, then: 1) the queue must remain in service and cannot be |
9032 |
++ * expired, and 2) the disk must be idled to wait for the possible arrival |
9033 |
++ * of a new request for the queue. The conditions are: |
9034 |
++ * - the device is rotational and not performing NCQ, and the queue has its |
9035 |
++ * idle window set (in this case, waiting for a new request for the queue |
9036 |
++ * is likely to boost the disk throughput); |
9037 |
++ * - the queue is weight-raised (waiting for the request is necessary to |
9038 |
++ * provide the queue with fairness and latency guarantees, see [1] for |
9039 |
++ * details). |
9040 |
++ */ |
9041 |
++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) |
9042 |
++{ |
9043 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
9044 |
++ |
9045 |
++ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && |
9046 |
++ bfq_bfqq_must_not_expire(bfqq) && |
9047 |
++ !bfq_queue_nonrot_noidle(bfqd, bfqq)); |
9048 |
++} |
9049 |
++ |
9050 |
++/* |
9051 |
++ * Select a queue for service. If we have a current queue in service, |
9052 |
++ * check whether to continue servicing it, or retrieve and set a new one. |
9053 |
++ */ |
9054 |
++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
9055 |
++{ |
9056 |
++ struct bfq_queue *bfqq, *new_bfqq = NULL; |
9057 |
++ struct request *next_rq; |
9058 |
++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
9059 |
++ |
9060 |
++ bfqq = bfqd->in_service_queue; |
9061 |
++ if (bfqq == NULL) |
9062 |
++ goto new_queue; |
9063 |
++ |
9064 |
++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); |
9065 |
++ |
9066 |
++ /* |
9067 |
++ * If another queue has a request waiting within our mean seek |
9068 |
++ * distance, let it run. The expire code will check for close |
9069 |
++ * cooperators and put the close queue at the front of the |
9070 |
++ * service tree. If possible, merge the expiring queue with the |
9071 |
++ * new bfqq. |
9072 |
++ */ |
9073 |
++ new_bfqq = bfq_close_cooperator(bfqd, bfqq); |
9074 |
++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL) |
9075 |
++ bfq_setup_merge(bfqq, new_bfqq); |
9076 |
++ |
9077 |
++ if (bfq_may_expire_for_budg_timeout(bfqq) && |
9078 |
++ !timer_pending(&bfqd->idle_slice_timer) && |
9079 |
++ !bfq_bfqq_must_idle(bfqq)) |
9080 |
++ goto expire; |
9081 |
++ |
9082 |
++ next_rq = bfqq->next_rq; |
9083 |
++ /* |
9084 |
++ * If bfqq has requests queued and it has enough budget left to |
9085 |
++ * serve them, keep the queue, otherwise expire it. |
9086 |
++ */ |
9087 |
++ if (next_rq != NULL) { |
9088 |
++ if (bfq_serv_to_charge(next_rq, bfqq) > |
9089 |
++ bfq_bfqq_budget_left(bfqq)) { |
9090 |
++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; |
9091 |
++ goto expire; |
9092 |
++ } else { |
9093 |
++ /* |
9094 |
++ * The idle timer may be pending because we may not |
9095 |
++ * disable disk idling even when a new request arrives |
9096 |
++ */ |
9097 |
++ if (timer_pending(&bfqd->idle_slice_timer)) { |
9098 |
++ /* |
9099 |
++ * If we get here: 1) at least a new request |
9100 |
++ * has arrived but we have not disabled the |
9101 |
++ * timer because the request was too small, |
9102 |
++ * 2) then the block layer has unplugged the |
9103 |
++ * device, causing the dispatch to be invoked. |
9104 |
++ * |
9105 |
++ * Since the device is unplugged, now the |
9106 |
++ * requests are probably large enough to |
9107 |
++ * provide a reasonable throughput. |
9108 |
++ * So we disable idling. |
9109 |
++ */ |
9110 |
++ bfq_clear_bfqq_wait_request(bfqq); |
9111 |
++ del_timer(&bfqd->idle_slice_timer); |
9112 |
++ } |
9113 |
++ if (new_bfqq == NULL) |
9114 |
++ goto keep_queue; |
9115 |
++ else |
9116 |
++ goto expire; |
9117 |
++ } |
9118 |
++ } |
9119 |
++ |
9120 |
++ /* |
9121 |
++ * No requests pending. If the in-service queue has no cooperator and |
9122 |
++ * still has requests in flight (possibly waiting for a completion) |
9123 |
++ * or is idling for a new request, then keep it. |
9124 |
++ */ |
9125 |
++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || |
9126 |
++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) { |
9127 |
++ bfqq = NULL; |
9128 |
++ goto keep_queue; |
9129 |
++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { |
9130 |
++ /* |
9131 |
++ * Expiring the queue because there is a close cooperator, |
9132 |
++ * cancel timer. |
9133 |
++ */ |
9134 |
++ bfq_clear_bfqq_wait_request(bfqq); |
9135 |
++ del_timer(&bfqd->idle_slice_timer); |
9136 |
++ } |
9137 |
++ |
9138 |
++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; |
9139 |
++expire: |
9140 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, reason); |
9141 |
++new_queue: |
9142 |
++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq); |
9143 |
++ bfq_log(bfqd, "select_queue: new queue %d returned", |
9144 |
++ bfqq != NULL ? bfqq->pid : 0); |
9145 |
++keep_queue: |
9146 |
++ return bfqq; |
9147 |
++} |
9148 |
++ |
9149 |
++static void bfq_update_raising_data(struct bfq_data *bfqd, |
9150 |
++ struct bfq_queue *bfqq) |
9151 |
++{ |
9152 |
++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ |
9153 |
++ struct bfq_entity *entity = &bfqq->entity; |
9154 |
++ |
9155 |
++ bfq_log_bfqq(bfqd, bfqq, |
9156 |
++ "raising period dur %u/%u msec, " |
9157 |
++ "old raising coeff %u, w %d(%d)", |
9158 |
++ jiffies_to_msecs(jiffies - |
9159 |
++ bfqq->last_rais_start_finish), |
9160 |
++ jiffies_to_msecs(bfqq->raising_cur_max_time), |
9161 |
++ bfqq->raising_coeff, |
9162 |
++ bfqq->entity.weight, bfqq->entity.orig_weight); |
9163 |
++ |
9164 |
++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != |
9165 |
++ entity->orig_weight * bfqq->raising_coeff); |
9166 |
++ if (entity->ioprio_changed) |
9167 |
++ bfq_log_bfqq(bfqd, bfqq, |
9168 |
++ "WARN: pending prio change"); |
9169 |
++ /* |
9170 |
++ * If too much time has elapsed from the beginning |
9171 |
++ * of this weight-raising, stop it. |
9172 |
++ */ |
9173 |
++ if (time_is_before_jiffies(bfqq->last_rais_start_finish + |
9174 |
++ bfqq->raising_cur_max_time)) { |
9175 |
++ bfqq->last_rais_start_finish = jiffies; |
9176 |
++ bfq_log_bfqq(bfqd, bfqq, |
9177 |
++ "wrais ending at %lu, " |
9178 |
++ "rais_max_time %u", |
9179 |
++ bfqq->last_rais_start_finish, |
9180 |
++ jiffies_to_msecs(bfqq-> |
9181 |
++ raising_cur_max_time)); |
9182 |
++ bfq_bfqq_end_raising(bfqq); |
9183 |
++ __bfq_entity_update_weight_prio( |
9184 |
++ bfq_entity_service_tree(entity), |
9185 |
++ entity); |
9186 |
++ } |
9187 |
++ } |
9188 |
++} |
9189 |
++ |
9190 |
++/* |
9191 |
++ * Dispatch one request from bfqq, moving it to the request queue |
9192 |
++ * dispatch list. |
9193 |
++ */ |
9194 |
++static int bfq_dispatch_request(struct bfq_data *bfqd, |
9195 |
++ struct bfq_queue *bfqq) |
9196 |
++{ |
9197 |
++ int dispatched = 0; |
9198 |
++ struct request *rq; |
9199 |
++ unsigned long service_to_charge; |
9200 |
++ |
9201 |
++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); |
9202 |
++ |
9203 |
++ /* Follow expired path, else get first next available. */ |
9204 |
++ rq = bfq_check_fifo(bfqq); |
9205 |
++ if (rq == NULL) |
9206 |
++ rq = bfqq->next_rq; |
9207 |
++ service_to_charge = bfq_serv_to_charge(rq, bfqq); |
9208 |
++ |
9209 |
++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { |
9210 |
++ /* |
9211 |
++ * This may happen if the next rq is chosen |
9212 |
++ * in fifo order instead of sector order. |
9213 |
++ * The budget is properly dimensioned |
9214 |
++ * to be always sufficient to serve the next request |
9215 |
++ * only if it is chosen in sector order. The reason is |
9216 |
++ * that it would be quite inefficient and of little use |
9217 |
++ * to always make sure that the budget is large enough |
9218 |
++ * to serve even the possible next rq in fifo order. |
9219 |
++ * In fact, requests are seldom served in fifo order. |
9220 |
++ * |
9221 |
++ * Expire the queue for budget exhaustion, and |
9222 |
++ * make sure that the next act_budget is enough |
9223 |
++ * to serve the next request, even if it comes |
9224 |
++ * from the fifo expired path. |
9225 |
++ */ |
9226 |
++ bfqq->next_rq = rq; |
9227 |
++ /* |
9228 |
++ * Since this dispatch failed, make sure that |
9229 |
++ * a new one will be performed |
9230 |
++ */ |
9231 |
++ if (!bfqd->rq_in_driver) |
9232 |
++ bfq_schedule_dispatch(bfqd); |
9233 |
++ goto expire; |
9234 |
++ } |
9235 |
++ |
9236 |
++ /* Finally, insert request into driver dispatch list. */ |
9237 |
++ bfq_bfqq_served(bfqq, service_to_charge); |
9238 |
++ bfq_dispatch_insert(bfqd->queue, rq); |
9239 |
++ |
9240 |
++ bfq_update_raising_data(bfqd, bfqq); |
9241 |
++ |
9242 |
++ bfq_log_bfqq(bfqd, bfqq, |
9243 |
++ "dispatched %u sec req (%llu), budg left %lu", |
9244 |
++ blk_rq_sectors(rq), |
9245 |
++ (long long unsigned)blk_rq_pos(rq), |
9246 |
++ bfq_bfqq_budget_left(bfqq)); |
9247 |
++ |
9248 |
++ dispatched++; |
9249 |
++ |
9250 |
++ if (bfqd->in_service_bic == NULL) { |
9251 |
++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); |
9252 |
++ bfqd->in_service_bic = RQ_BIC(rq); |
9253 |
++ } |
9254 |
++ |
9255 |
++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && |
9256 |
++ dispatched >= bfqd->bfq_max_budget_async_rq) || |
9257 |
++ bfq_class_idle(bfqq))) |
9258 |
++ goto expire; |
9259 |
++ |
9260 |
++ return dispatched; |
9261 |
++ |
9262 |
++expire: |
9263 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); |
9264 |
++ return dispatched; |
9265 |
++} |
9266 |
++ |
9267 |
++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) |
9268 |
++{ |
9269 |
++ int dispatched = 0; |
9270 |
++ |
9271 |
++ while (bfqq->next_rq != NULL) { |
9272 |
++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); |
9273 |
++ dispatched++; |
9274 |
++ } |
9275 |
++ |
9276 |
++ BUG_ON(!list_empty(&bfqq->fifo)); |
9277 |
++ return dispatched; |
9278 |
++} |
9279 |
++ |
9280 |
++/* |
9281 |
++ * Drain our current requests. Used for barriers and when switching |
9282 |
++ * io schedulers on-the-fly. |
9283 |
++ */ |
9284 |
++static int bfq_forced_dispatch(struct bfq_data *bfqd) |
9285 |
++{ |
9286 |
++ struct bfq_queue *bfqq, *n; |
9287 |
++ struct bfq_service_tree *st; |
9288 |
++ int dispatched = 0; |
9289 |
++ |
9290 |
++ bfqq = bfqd->in_service_queue; |
9291 |
++ if (bfqq != NULL) |
9292 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
9293 |
++ |
9294 |
++ /* |
9295 |
++ * Loop through classes, and be careful to leave the scheduler |
9296 |
++ * in a consistent state, as feedback mechanisms and vtime |
9297 |
++ * updates cannot be disabled during the process. |
9298 |
++ */ |
9299 |
++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { |
9300 |
++ st = bfq_entity_service_tree(&bfqq->entity); |
9301 |
++ |
9302 |
++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); |
9303 |
++ bfqq->max_budget = bfq_max_budget(bfqd); |
9304 |
++ |
9305 |
++ bfq_forget_idle(st); |
9306 |
++ } |
9307 |
++ |
9308 |
++ BUG_ON(bfqd->busy_queues != 0); |
9309 |
++ |
9310 |
++ return dispatched; |
9311 |
++} |
9312 |
++ |
9313 |
++static int bfq_dispatch_requests(struct request_queue *q, int force) |
9314 |
++{ |
9315 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
9316 |
++ struct bfq_queue *bfqq; |
9317 |
++ int max_dispatch; |
9318 |
++ |
9319 |
++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); |
9320 |
++ if (bfqd->busy_queues == 0) |
9321 |
++ return 0; |
9322 |
++ |
9323 |
++ if (unlikely(force)) |
9324 |
++ return bfq_forced_dispatch(bfqd); |
9325 |
++ |
9326 |
++ bfqq = bfq_select_queue(bfqd); |
9327 |
++ if (bfqq == NULL) |
9328 |
++ return 0; |
9329 |
++ |
9330 |
++ max_dispatch = bfqd->bfq_quantum; |
9331 |
++ if (bfq_class_idle(bfqq)) |
9332 |
++ max_dispatch = 1; |
9333 |
++ |
9334 |
++ if (!bfq_bfqq_sync(bfqq)) |
9335 |
++ max_dispatch = bfqd->bfq_max_budget_async_rq; |
9336 |
++ |
9337 |
++ if (bfqq->dispatched >= max_dispatch) { |
9338 |
++ if (bfqd->busy_queues > 1) |
9339 |
++ return 0; |
9340 |
++ if (bfqq->dispatched >= 4 * max_dispatch) |
9341 |
++ return 0; |
9342 |
++ } |
9343 |
++ |
9344 |
++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) |
9345 |
++ return 0; |
9346 |
++ |
9347 |
++ bfq_clear_bfqq_wait_request(bfqq); |
9348 |
++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
9349 |
++ |
9350 |
++ if (!bfq_dispatch_request(bfqd, bfqq)) |
9351 |
++ return 0; |
9352 |
++ |
9353 |
++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)", |
9354 |
++ bfqq->pid, max_dispatch); |
9355 |
++ |
9356 |
++ return 1; |
9357 |
++} |
9358 |
++ |
9359 |
++/* |
9360 |
++ * Task holds one reference to the queue, dropped when task exits. Each rq |
9361 |
++ * in-flight on this queue also holds a reference, dropped when rq is freed. |
9362 |
++ * |
9363 |
++ * Queue lock must be held here. |
9364 |
++ */ |
9365 |
++static void bfq_put_queue(struct bfq_queue *bfqq) |
9366 |
++{ |
9367 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
9368 |
++ |
9369 |
++ BUG_ON(atomic_read(&bfqq->ref) <= 0); |
9370 |
++ |
9371 |
++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, |
9372 |
++ atomic_read(&bfqq->ref)); |
9373 |
++ if (!atomic_dec_and_test(&bfqq->ref)) |
9374 |
++ return; |
9375 |
++ |
9376 |
++ BUG_ON(rb_first(&bfqq->sort_list) != NULL); |
9377 |
++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); |
9378 |
++ BUG_ON(bfqq->entity.tree != NULL); |
9379 |
++ BUG_ON(bfq_bfqq_busy(bfqq)); |
9380 |
++ BUG_ON(bfqd->in_service_queue == bfqq); |
9381 |
++ |
9382 |
++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); |
9383 |
++ |
9384 |
++ kmem_cache_free(bfq_pool, bfqq); |
9385 |
++} |
9386 |
++ |
9387 |
++static void bfq_put_cooperator(struct bfq_queue *bfqq) |
9388 |
++{ |
9389 |
++ struct bfq_queue *__bfqq, *next; |
9390 |
++ |
9391 |
++ /* |
9392 |
++ * If this queue was scheduled to merge with another queue, be |
9393 |
++ * sure to drop the reference taken on that queue (and others in |
9394 |
++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. |
9395 |
++ */ |
9396 |
++ __bfqq = bfqq->new_bfqq; |
9397 |
++ while (__bfqq) { |
9398 |
++ if (__bfqq == bfqq) { |
9399 |
++ WARN(1, "bfqq->new_bfqq loop detected.\n"); |
9400 |
++ break; |
9401 |
++ } |
9402 |
++ next = __bfqq->new_bfqq; |
9403 |
++ bfq_put_queue(__bfqq); |
9404 |
++ __bfqq = next; |
9405 |
++ } |
9406 |
++} |
9407 |
++ |
9408 |
++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
9409 |
++{ |
9410 |
++ if (bfqq == bfqd->in_service_queue) { |
9411 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
9412 |
++ bfq_schedule_dispatch(bfqd); |
9413 |
++ } |
9414 |
++ |
9415 |
++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, |
9416 |
++ atomic_read(&bfqq->ref)); |
9417 |
++ |
9418 |
++ bfq_put_cooperator(bfqq); |
9419 |
++ |
9420 |
++ bfq_put_queue(bfqq); |
9421 |
++} |
9422 |
++ |
9423 |
++static void bfq_init_icq(struct io_cq *icq) |
9424 |
++{ |
9425 |
++ struct bfq_io_cq *bic = icq_to_bic(icq); |
9426 |
++ |
9427 |
++ bic->ttime.last_end_request = jiffies; |
9428 |
++} |
9429 |
++ |
9430 |
++static void bfq_exit_icq(struct io_cq *icq) |
9431 |
++{ |
9432 |
++ struct bfq_io_cq *bic = icq_to_bic(icq); |
9433 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
9434 |
++ |
9435 |
++ if (bic->bfqq[BLK_RW_ASYNC]) { |
9436 |
++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); |
9437 |
++ bic->bfqq[BLK_RW_ASYNC] = NULL; |
9438 |
++ } |
9439 |
++ |
9440 |
++ if (bic->bfqq[BLK_RW_SYNC]) { |
9441 |
++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); |
9442 |
++ bic->bfqq[BLK_RW_SYNC] = NULL; |
9443 |
++ } |
9444 |
++} |
9445 |
++ |
9446 |
++/* |
9447 |
++ * Update the entity prio values; note that the new values will not |
9448 |
++ * be used until the next (re)activation. |
9449 |
++ */ |
9450 |
++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
9451 |
++{ |
9452 |
++ struct task_struct *tsk = current; |
9453 |
++ int ioprio_class; |
9454 |
++ |
9455 |
++ if (!bfq_bfqq_prio_changed(bfqq)) |
9456 |
++ return; |
9457 |
++ |
9458 |
++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
9459 |
++ switch (ioprio_class) { |
9460 |
++ default: |
9461 |
++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev, |
9462 |
++ "bfq: bad prio %x\n", ioprio_class); |
9463 |
++ case IOPRIO_CLASS_NONE: |
9464 |
++ /* |
9465 |
++ * No prio set, inherit CPU scheduling settings. |
9466 |
++ */ |
9467 |
++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk); |
9468 |
++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); |
9469 |
++ break; |
9470 |
++ case IOPRIO_CLASS_RT: |
9471 |
++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
9472 |
++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; |
9473 |
++ break; |
9474 |
++ case IOPRIO_CLASS_BE: |
9475 |
++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
9476 |
++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; |
9477 |
++ break; |
9478 |
++ case IOPRIO_CLASS_IDLE: |
9479 |
++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; |
9480 |
++ bfqq->entity.new_ioprio = 7; |
9481 |
++ bfq_clear_bfqq_idle_window(bfqq); |
9482 |
++ break; |
9483 |
++ } |
9484 |
++ |
9485 |
++ bfqq->entity.ioprio_changed = 1; |
9486 |
++ |
9487 |
++ /* |
9488 |
++ * Keep track of original prio settings in case we have to temporarily |
9489 |
++ * elevate the priority of this queue. |
9490 |
++ */ |
9491 |
++ bfqq->org_ioprio = bfqq->entity.new_ioprio; |
9492 |
++ bfq_clear_bfqq_prio_changed(bfqq); |
9493 |
++} |
9494 |
++ |
9495 |
++static void bfq_changed_ioprio(struct bfq_io_cq *bic) |
9496 |
++{ |
9497 |
++ struct bfq_data *bfqd; |
9498 |
++ struct bfq_queue *bfqq, *new_bfqq; |
9499 |
++ struct bfq_group *bfqg; |
9500 |
++ unsigned long uninitialized_var(flags); |
9501 |
++ int ioprio = bic->icq.ioc->ioprio; |
9502 |
++ |
9503 |
++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), |
9504 |
++ &flags); |
9505 |
++ /* |
9506 |
++ * This condition may trigger on a newly created bic, be sure to drop |
9507 |
++ * the lock before returning. |
9508 |
++ */ |
9509 |
++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) |
9510 |
++ goto out; |
9511 |
++ |
9512 |
++ bfqq = bic->bfqq[BLK_RW_ASYNC]; |
9513 |
++ if (bfqq != NULL) { |
9514 |
++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, |
9515 |
++ sched_data); |
9516 |
++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, |
9517 |
++ GFP_ATOMIC); |
9518 |
++ if (new_bfqq != NULL) { |
9519 |
++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; |
9520 |
++ bfq_log_bfqq(bfqd, bfqq, |
9521 |
++ "changed_ioprio: bfqq %p %d", |
9522 |
++ bfqq, atomic_read(&bfqq->ref)); |
9523 |
++ bfq_put_queue(bfqq); |
9524 |
++ } |
9525 |
++ } |
9526 |
++ |
9527 |
++ bfqq = bic->bfqq[BLK_RW_SYNC]; |
9528 |
++ if (bfqq != NULL) |
9529 |
++ bfq_mark_bfqq_prio_changed(bfqq); |
9530 |
++ |
9531 |
++ bic->ioprio = ioprio; |
9532 |
++ |
9533 |
++out: |
9534 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
9535 |
++} |
9536 |
++ |
9537 |
++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
9538 |
++ pid_t pid, int is_sync) |
9539 |
++{ |
9540 |
++ RB_CLEAR_NODE(&bfqq->entity.rb_node); |
9541 |
++ INIT_LIST_HEAD(&bfqq->fifo); |
9542 |
++ |
9543 |
++ atomic_set(&bfqq->ref, 0); |
9544 |
++ bfqq->bfqd = bfqd; |
9545 |
++ |
9546 |
++ bfq_mark_bfqq_prio_changed(bfqq); |
9547 |
++ |
9548 |
++ if (is_sync) { |
9549 |
++ if (!bfq_class_idle(bfqq)) |
9550 |
++ bfq_mark_bfqq_idle_window(bfqq); |
9551 |
++ bfq_mark_bfqq_sync(bfqq); |
9552 |
++ } |
9553 |
++ |
9554 |
++ /* Tentative initial value to trade off between thr and lat */ |
9555 |
++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; |
9556 |
++ bfqq->pid = pid; |
9557 |
++ |
9558 |
++ bfqq->raising_coeff = 1; |
9559 |
++ bfqq->last_rais_start_finish = 0; |
9560 |
++ /* |
9561 |
++ * Set to the value for which bfqq will not be deemed as |
9562 |
++ * soft rt when it becomes backlogged. |
9563 |
++ */ |
9564 |
++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); |
9565 |
++} |
9566 |
++ |
9567 |
++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, |
9568 |
++ struct bfq_group *bfqg, |
9569 |
++ int is_sync, |
9570 |
++ struct bfq_io_cq *bic, |
9571 |
++ gfp_t gfp_mask) |
9572 |
++{ |
9573 |
++ struct bfq_queue *bfqq, *new_bfqq = NULL; |
9574 |
++ |
9575 |
++retry: |
9576 |
++ /* bic always exists here */ |
9577 |
++ bfqq = bic_to_bfqq(bic, is_sync); |
9578 |
++ |
9579 |
++ /* |
9580 |
++ * Always try a new alloc if we fall back to the OOM bfqq |
9581 |
++ * originally, since it should just be a temporary situation. |
9582 |
++ */ |
9583 |
++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { |
9584 |
++ bfqq = NULL; |
9585 |
++ if (new_bfqq != NULL) { |
9586 |
++ bfqq = new_bfqq; |
9587 |
++ new_bfqq = NULL; |
9588 |
++ } else if (gfp_mask & __GFP_WAIT) { |
9589 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
9590 |
++ new_bfqq = kmem_cache_alloc_node(bfq_pool, |
9591 |
++ gfp_mask | __GFP_ZERO, |
9592 |
++ bfqd->queue->node); |
9593 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
9594 |
++ if (new_bfqq != NULL) |
9595 |
++ goto retry; |
9596 |
++ } else { |
9597 |
++ bfqq = kmem_cache_alloc_node(bfq_pool, |
9598 |
++ gfp_mask | __GFP_ZERO, |
9599 |
++ bfqd->queue->node); |
9600 |
++ } |
9601 |
++ |
9602 |
++ if (bfqq != NULL) { |
9603 |
++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); |
9604 |
++ bfq_log_bfqq(bfqd, bfqq, "allocated"); |
9605 |
++ } else { |
9606 |
++ bfqq = &bfqd->oom_bfqq; |
9607 |
++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); |
9608 |
++ } |
9609 |
++ |
9610 |
++ bfq_init_prio_data(bfqq, bic); |
9611 |
++ bfq_init_entity(&bfqq->entity, bfqg); |
9612 |
++ } |
9613 |
++ |
9614 |
++ if (new_bfqq != NULL) |
9615 |
++ kmem_cache_free(bfq_pool, new_bfqq); |
9616 |
++ |
9617 |
++ return bfqq; |
9618 |
++} |
9619 |
++ |
9620 |
++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, |
9621 |
++ struct bfq_group *bfqg, |
9622 |
++ int ioprio_class, int ioprio) |
9623 |
++{ |
9624 |
++ switch (ioprio_class) { |
9625 |
++ case IOPRIO_CLASS_RT: |
9626 |
++ return &bfqg->async_bfqq[0][ioprio]; |
9627 |
++ case IOPRIO_CLASS_NONE: |
9628 |
++ ioprio = IOPRIO_NORM; |
9629 |
++ /* fall through */ |
9630 |
++ case IOPRIO_CLASS_BE: |
9631 |
++ return &bfqg->async_bfqq[1][ioprio]; |
9632 |
++ case IOPRIO_CLASS_IDLE: |
9633 |
++ return &bfqg->async_idle_bfqq; |
9634 |
++ default: |
9635 |
++ BUG(); |
9636 |
++ } |
9637 |
++} |
9638 |
++ |
9639 |
++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
9640 |
++ struct bfq_group *bfqg, int is_sync, |
9641 |
++ struct bfq_io_cq *bic, gfp_t gfp_mask) |
9642 |
++{ |
9643 |
++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
9644 |
++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
9645 |
++ struct bfq_queue **async_bfqq = NULL; |
9646 |
++ struct bfq_queue *bfqq = NULL; |
9647 |
++ |
9648 |
++ if (!is_sync) { |
9649 |
++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, |
9650 |
++ ioprio); |
9651 |
++ bfqq = *async_bfqq; |
9652 |
++ } |
9653 |
++ |
9654 |
++ if (bfqq == NULL) |
9655 |
++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); |
9656 |
++ |
9657 |
++ /* |
9658 |
++ * Pin the queue now that it's allocated, scheduler exit will prune it. |
9659 |
++ */ |
9660 |
++ if (!is_sync && *async_bfqq == NULL) { |
9661 |
++ atomic_inc(&bfqq->ref); |
9662 |
++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", |
9663 |
++ bfqq, atomic_read(&bfqq->ref)); |
9664 |
++ *async_bfqq = bfqq; |
9665 |
++ } |
9666 |
++ |
9667 |
++ atomic_inc(&bfqq->ref); |
9668 |
++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, |
9669 |
++ atomic_read(&bfqq->ref)); |
9670 |
++ return bfqq; |
9671 |
++} |
9672 |
++ |
9673 |
++static void bfq_update_io_thinktime(struct bfq_data *bfqd, |
9674 |
++ struct bfq_io_cq *bic) |
9675 |
++{ |
9676 |
++ unsigned long elapsed = jiffies - bic->ttime.last_end_request; |
9677 |
++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); |
9678 |
++ |
9679 |
++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; |
9680 |
++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; |
9681 |
++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / |
9682 |
++ bic->ttime.ttime_samples; |
9683 |
++} |
9684 |
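
The think-time statistics above use the same 7/8-weighted averaging as the peak-rate filter, with a fixed scale of 256 on the sample count so that the mean stabilizes after a handful of requests. A userspace sketch of the update, fed a few made-up inter-request gaps:

/* Userspace sketch of the think-time tracking above. */
#include <stdio.h>

struct ttime { unsigned long samples, total, mean; };

static void update_thinktime(struct ttime *t, unsigned long ttime)
{
        t->samples = (7 * t->samples + 256) / 8;
        t->total   = (7 * t->total + 256 * ttime) / 8;
        t->mean    = (t->total + 128) / t->samples;
}

int main(void)
{
        struct ttime t = { 0, 0, 0 };
        unsigned long gaps[] = { 4, 4, 12, 2 };   /* jiffies between requests */

        for (int i = 0; i < 4; i++) {
                update_thinktime(&t, gaps[i]);
                printf("mean think time = %lu jiffies\n", t.mean);
        }
        return 0;
}
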
++ |
9685 |
++static void bfq_update_io_seektime(struct bfq_data *bfqd, |
9686 |
++ struct bfq_queue *bfqq, |
9687 |
++ struct request *rq) |
9688 |
++{ |
9689 |
++ sector_t sdist; |
9690 |
++ u64 total; |
9691 |
++ |
9692 |
++ if (bfqq->last_request_pos < blk_rq_pos(rq)) |
9693 |
++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; |
9694 |
++ else |
9695 |
++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); |
9696 |
++ |
9697 |
++ /* |
9698 |
++ * Don't allow the seek distance to get too large from the |
9699 |
++ * odd fragment, pagein, etc. |
9700 |
++ */ |
9701 |
++ if (bfqq->seek_samples == 0) /* first request, not really a seek */ |
9702 |
++ sdist = 0; |
9703 |
++ else if (bfqq->seek_samples <= 60) /* second & third seek */ |
9704 |
++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); |
9705 |
++ else |
9706 |
++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); |
9707 |
++ |
9708 |
++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; |
9709 |
++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; |
9710 |
++ total = bfqq->seek_total + (bfqq->seek_samples/2); |
9711 |
++ do_div(total, bfqq->seek_samples); |
9712 |
++ bfqq->seek_mean = (sector_t)total; |
9713 |
++ |
9714 |
++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, |
9715 |
++ (u64)bfqq->seek_mean); |
9716 |
++} |
9717 |
++ |
9718 |
++/* |
9719 |
++ * Disable idle window if the process thinks too long or seeks so much that |
9720 |
++ * it doesn't matter. |
9721 |
++ */ |
9722 |
++static void bfq_update_idle_window(struct bfq_data *bfqd, |
9723 |
++ struct bfq_queue *bfqq, |
9724 |
++ struct bfq_io_cq *bic) |
9725 |
++{ |
9726 |
++ int enable_idle; |
9727 |
++ |
9728 |
++ /* Don't idle for async or idle io prio class. */ |
9729 |
++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) |
9730 |
++ return; |
9731 |
++ |
9732 |
++ enable_idle = bfq_bfqq_idle_window(bfqq); |
9733 |
++ |
9734 |
++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || |
9735 |
++ bfqd->bfq_slice_idle == 0 || |
9736 |
++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && |
9737 |
++ bfqq->raising_coeff == 1)) |
9738 |
++ enable_idle = 0; |
9739 |
++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { |
9740 |
++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && |
9741 |
++ bfqq->raising_coeff == 1) |
9742 |
++ enable_idle = 0; |
9743 |
++ else |
9744 |
++ enable_idle = 1; |
9745 |
++ } |
9746 |
++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", |
9747 |
++ enable_idle); |
9748 |
++ |
9749 |
++ if (enable_idle) |
9750 |
++ bfq_mark_bfqq_idle_window(bfqq); |
9751 |
++ else |
9752 |
++ bfq_clear_bfqq_idle_window(bfqq); |
9753 |
++} |
9754 |
++ |
9755 |
++/* |
9756 |
++ * Called when a new fs request (rq) is added to bfqq. Check if there's |
9757 |
++ * something we should do about it. |
9758 |
++ */ |
9759 |
++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
9760 |
++ struct request *rq) |
9761 |
++{ |
9762 |
++ struct bfq_io_cq *bic = RQ_BIC(rq); |
9763 |
++ |
9764 |
++ if (rq->cmd_flags & REQ_META) |
9765 |
++ bfqq->meta_pending++; |
9766 |
++ |
9767 |
++ bfq_update_io_thinktime(bfqd, bic); |
9768 |
++ bfq_update_io_seektime(bfqd, bfqq, rq); |
9769 |
++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
9770 |
++ !BFQQ_SEEKY(bfqq)) |
9771 |
++ bfq_update_idle_window(bfqd, bfqq, bic); |
9772 |
++ |
9773 |
++ bfq_log_bfqq(bfqd, bfqq, |
9774 |
++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", |
9775 |
++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), |
9776 |
++ (long long unsigned)bfqq->seek_mean); |
9777 |
++ |
9778 |
++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); |
9779 |
++ |
9780 |
++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { |
9781 |
++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && |
9782 |
++ blk_rq_sectors(rq) < 32; |
9783 |
++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq); |
9784 |
++ |
9785 |
++ /* |
9786 |
++ * There is just this request queued: if the request |
9787 |
++ * is small and the queue is not to be expired, then |
9788 |
++ * just exit. |
9789 |
++ * |
9790 |
++ * In this way, if the disk is being idled to wait for |
9791 |
++ * a new request from the in-service queue, we avoid |
9792 |
++ * unplugging the device and committing the disk to serve |
9793 |
++ * just a small request. On the contrary, we wait for |
9794 |
++ * the block layer to decide when to unplug the device: |
9795 |
++ * hopefully, new requests will be merged to this one |
9796 |
++ * quickly, then the device will be unplugged and |
9797 |
++ * larger requests will be dispatched. |
9798 |
++ */ |
9799 |
++ if (small_req && !budget_timeout) |
9800 |
++ return; |
9801 |
++ |
9802 |
++ /* |
9803 |
++ * A large enough request arrived, or the queue is to |
9804 |
++ * be expired: in both cases disk idling is to be |
9805 |
++ * stopped, so clear wait_request flag and reset |
9806 |
++ * timer. |
9807 |
++ */ |
9808 |
++ bfq_clear_bfqq_wait_request(bfqq); |
9809 |
++ del_timer(&bfqd->idle_slice_timer); |
9810 |
++ |
9811 |
++ /* |
9812 |
++ * The queue is not empty, because a new request just |
9813 |
++ * arrived. Hence we can safely expire the queue, in |
9814 |
++ * case of budget timeout, without risking that the |
9815 |
++ * timestamps of the queue are not updated correctly. |
9816 |
++ * See [1] for more details. |
9817 |
++ */ |
9818 |
++ if (budget_timeout) |
9819 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); |
9820 |
++ |
9821 |
++ /* |
9822 |
++ * Let the request rip immediately, or let a new queue be |
9823 |
++ * selected if bfqq has just been expired. |
9824 |
++ */ |
9825 |
++ __blk_run_queue(bfqd->queue); |
9826 |
++ } |
9827 |
++} |
9828 |
++ |
9829 |
++static void bfq_insert_request(struct request_queue *q, struct request *rq) |
9830 |
++{ |
9831 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
9832 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
9833 |
++ |
9834 |
++ assert_spin_locked(bfqd->queue->queue_lock); |
9835 |
++ bfq_init_prio_data(bfqq, RQ_BIC(rq)); |
9836 |
++ |
9837 |
++ bfq_add_rq_rb(rq); |
9838 |
++ |
9839 |
++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); |
9840 |
++ list_add_tail(&rq->queuelist, &bfqq->fifo); |
9841 |
++ |
9842 |
++ bfq_rq_enqueued(bfqd, bfqq, rq); |
9843 |
++} |
9844 |
++ |
9845 |
++static void bfq_update_hw_tag(struct bfq_data *bfqd) |
9846 |
++{ |
9847 |
++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, |
9848 |
++ bfqd->rq_in_driver); |
9849 |
++ |
9850 |
++ if (bfqd->hw_tag == 1) |
9851 |
++ return; |
9852 |
++ |
9853 |
++ /* |
9854 |
++ * This sample is valid if the number of outstanding requests |
9855 |
++ * is large enough to allow a queueing behavior. Note that the |
9856 |
++ * sum is not exact, as it's not taking into account deactivated |
9857 |
++ * requests. |
9858 |
++ */ |
9859 |
++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) |
9860 |
++ return; |
9861 |
++ |
9862 |
++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) |
9863 |
++ return; |
9864 |
++ |
9865 |
++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; |
9866 |
++ bfqd->max_rq_in_driver = 0; |
9867 |
++ bfqd->hw_tag_samples = 0; |
9868 |
++} |
9869 |
++ |
9870 |
++static void bfq_completed_request(struct request_queue *q, struct request *rq) |
9871 |
++{ |
9872 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
9873 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
9874 |
++ const int sync = rq_is_sync(rq); |
9875 |
++ |
9876 |
++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", |
9877 |
++ blk_rq_sectors(rq), sync); |
9878 |
++ |
9879 |
++ bfq_update_hw_tag(bfqd); |
9880 |
++ |
9881 |
++ WARN_ON(!bfqd->rq_in_driver); |
9882 |
++ WARN_ON(!bfqq->dispatched); |
9883 |
++ bfqd->rq_in_driver--; |
9884 |
++ bfqq->dispatched--; |
9885 |
++ |
9886 |
++ if (bfq_bfqq_sync(bfqq)) |
9887 |
++ bfqd->sync_flight--; |
9888 |
++ |
9889 |
++ if (sync) |
9890 |
++ RQ_BIC(rq)->ttime.last_end_request = jiffies; |
9891 |
++ |
9892 |
++ /* |
9893 |
++ * The computation of softrt_next_start was scheduled for the next |
9894 |
++ * request completion: it is now time to compute it. |
9895 |
++ */ |
9896 |
++ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)) |
9897 |
++ bfqq->soft_rt_next_start = |
9898 |
++ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
9899 |
++ |
9900 |
++ /* |
9901 |
++ * If this is the in-service queue, check if it needs to be expired, |
9902 |
++ * or if we want to idle in case it has no pending requests. |
9903 |
++ */ |
9904 |
++ if (bfqd->in_service_queue == bfqq) { |
9905 |
++ if (bfq_bfqq_budget_new(bfqq)) |
9906 |
++ bfq_set_budget_timeout(bfqd); |
9907 |
++ |
9908 |
++ if (bfq_bfqq_must_idle(bfqq)) { |
9909 |
++ bfq_arm_slice_timer(bfqd); |
9910 |
++ goto out; |
9911 |
++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) |
9912 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); |
9913 |
++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && |
9914 |
++ (bfqq->dispatched == 0 || |
9915 |
++ !bfq_bfqq_must_not_expire(bfqq))) |
9916 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, |
9917 |
++ BFQ_BFQQ_NO_MORE_REQUESTS); |
9918 |
++ } |
9919 |
++ |
9920 |
++ if (!bfqd->rq_in_driver) |
9921 |
++ bfq_schedule_dispatch(bfqd); |
9922 |
++ |
9923 |
++out: |
9924 |
++ return; |
9925 |
++} |
9926 |
++ |
9927 |
++static inline int __bfq_may_queue(struct bfq_queue *bfqq) |
9928 |
++{ |
9929 |
++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { |
9930 |
++ bfq_clear_bfqq_must_alloc(bfqq); |
9931 |
++ return ELV_MQUEUE_MUST; |
9932 |
++ } |
9933 |
++ |
9934 |
++ return ELV_MQUEUE_MAY; |
9935 |
++} |
9936 |
++ |
9937 |
++static int bfq_may_queue(struct request_queue *q, int rw) |
9938 |
++{ |
9939 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
9940 |
++ struct task_struct *tsk = current; |
9941 |
++ struct bfq_io_cq *bic; |
9942 |
++ struct bfq_queue *bfqq; |
9943 |
++ |
9944 |
++ /* |
9945 |
++ * Don't force setup of a queue from here, as a call to may_queue |
9946 |
++ * does not necessarily imply that a request actually will be queued. |
9947 |
++ * So just look up a possibly existing queue, or return 'may queue' |
9948 |
++ * if that fails. |
9949 |
++ */ |
9950 |
++ bic = bfq_bic_lookup(bfqd, tsk->io_context); |
9951 |
++ if (bic == NULL) |
9952 |
++ return ELV_MQUEUE_MAY; |
9953 |
++ |
9954 |
++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); |
9955 |
++ if (bfqq != NULL) { |
9956 |
++ bfq_init_prio_data(bfqq, bic); |
9957 |
++ |
9958 |
++ return __bfq_may_queue(bfqq); |
9959 |
++ } |
9960 |
++ |
9961 |
++ return ELV_MQUEUE_MAY; |
9962 |
++} |
9963 |
++ |
9964 |
++/* |
9965 |
++ * Queue lock held here. |
9966 |
++ */ |
9967 |
++static void bfq_put_request(struct request *rq) |
9968 |
++{ |
9969 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
9970 |
++ |
9971 |
++ if (bfqq != NULL) { |
9972 |
++ const int rw = rq_data_dir(rq); |
9973 |
++ |
9974 |
++ BUG_ON(!bfqq->allocated[rw]); |
9975 |
++ bfqq->allocated[rw]--; |
9976 |
++ |
9977 |
++ rq->elv.priv[0] = NULL; |
9978 |
++ rq->elv.priv[1] = NULL; |
9979 |
++ |
9980 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", |
9981 |
++ bfqq, atomic_read(&bfqq->ref)); |
9982 |
++ bfq_put_queue(bfqq); |
9983 |
++ } |
9984 |
++} |
9985 |
++ |
9986 |
++static struct bfq_queue * |
9987 |
++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
9988 |
++ struct bfq_queue *bfqq) |
9989 |
++{ |
9990 |
++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", |
9991 |
++ (long unsigned)bfqq->new_bfqq->pid); |
9992 |
++ bic_set_bfqq(bic, bfqq->new_bfqq, 1); |
9993 |
++ bfq_mark_bfqq_coop(bfqq->new_bfqq); |
9994 |
++ bfq_put_queue(bfqq); |
9995 |
++ return bic_to_bfqq(bic, 1); |
9996 |
++} |
9997 |
++ |
9998 |
++/* |
9999 |
++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this |
10000 |
++ * was the last process referring to said bfqq. |
10001 |
++ */ |
10002 |
++static struct bfq_queue * |
10003 |
++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) |
10004 |
++{ |
10005 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); |
10006 |
++ if (bfqq_process_refs(bfqq) == 1) { |
10007 |
++ bfqq->pid = current->pid; |
10008 |
++ bfq_clear_bfqq_coop(bfqq); |
10009 |
++ bfq_clear_bfqq_split_coop(bfqq); |
10010 |
++ return bfqq; |
10011 |
++ } |
10012 |
++ |
10013 |
++ bic_set_bfqq(bic, NULL, 1); |
10014 |
++ |
10015 |
++ bfq_put_cooperator(bfqq); |
10016 |
++ |
10017 |
++ bfq_put_queue(bfqq); |
10018 |
++ return NULL; |
10019 |
++} |
10020 |
++ |
10021 |
++/* |
10022 |
++ * Allocate bfq data structures associated with this request. |
10023 |
++ */ |
10024 |
++static int bfq_set_request(struct request_queue *q, struct request *rq, |
10025 |
++ struct bio *bio, gfp_t gfp_mask) |
10026 |
++{ |
10027 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
10028 |
++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); |
10029 |
++ const int rw = rq_data_dir(rq); |
10030 |
++ const int is_sync = rq_is_sync(rq); |
10031 |
++ struct bfq_queue *bfqq; |
10032 |
++ struct bfq_group *bfqg; |
10033 |
++ unsigned long flags; |
10034 |
++ |
10035 |
++ might_sleep_if(gfp_mask & __GFP_WAIT); |
10036 |
++ |
10037 |
++ bfq_changed_ioprio(bic); |
10038 |
++ |
10039 |
++ spin_lock_irqsave(q->queue_lock, flags); |
10040 |
++ |
10041 |
++ if (bic == NULL) |
10042 |
++ goto queue_fail; |
10043 |
++ |
10044 |
++ bfqg = bfq_bic_update_cgroup(bic); |
10045 |
++ |
10046 |
++new_queue: |
10047 |
++ bfqq = bic_to_bfqq(bic, is_sync); |
10048 |
++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { |
10049 |
++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); |
10050 |
++ bic_set_bfqq(bic, bfqq, is_sync); |
10051 |
++ } else { |
10052 |
++ /* |
10053 |
++ * If the queue was seeky for too long, break it apart. |
10054 |
++ */ |
10055 |
++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { |
10056 |
++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); |
10057 |
++ bfqq = bfq_split_bfqq(bic, bfqq); |
10058 |
++ if (!bfqq) |
10059 |
++ goto new_queue; |
10060 |
++ } |
10061 |
++ |
10062 |
++ /* |
10063 |
++ * Check to see if this queue is scheduled to merge with |
10064 |
++ * another closely cooperating queue. The merging of queues |
10065 |
++ * happens here as it must be done in process context. |
10066 |
++ * The reference on new_bfqq was taken in merge_bfqqs. |
10067 |
++ */ |
10068 |
++ if (bfqq->new_bfqq != NULL) |
10069 |
++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq); |
10070 |
++ } |
10071 |
++ |
10072 |
++ bfqq->allocated[rw]++; |
10073 |
++ atomic_inc(&bfqq->ref); |
10074 |
++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, |
10075 |
++ atomic_read(&bfqq->ref)); |
10076 |
++ |
10077 |
++ rq->elv.priv[0] = bic; |
10078 |
++ rq->elv.priv[1] = bfqq; |
10079 |
++ |
10080 |
++ spin_unlock_irqrestore(q->queue_lock, flags); |
10081 |
++ |
10082 |
++ return 0; |
10083 |
++ |
10084 |
++queue_fail: |
10085 |
++ bfq_schedule_dispatch(bfqd); |
10086 |
++ spin_unlock_irqrestore(q->queue_lock, flags); |
10087 |
++ |
10088 |
++ return 1; |
10089 |
++} |
10090 |
++ |
10091 |
++static void bfq_kick_queue(struct work_struct *work) |
10092 |
++{ |
10093 |
++ struct bfq_data *bfqd = |
10094 |
++ container_of(work, struct bfq_data, unplug_work); |
10095 |
++ struct request_queue *q = bfqd->queue; |
10096 |
++ |
10097 |
++ spin_lock_irq(q->queue_lock); |
10098 |
++ __blk_run_queue(q); |
10099 |
++ spin_unlock_irq(q->queue_lock); |
10100 |
++} |
10101 |
++ |
10102 |
++/* |
10103 |
++ * Handler of the expiration of the timer running if the in-service queue |
10104 |
++ * is idling inside its time slice. |
10105 |
++ */ |
10106 |
++static void bfq_idle_slice_timer(unsigned long data) |
10107 |
++{ |
10108 |
++ struct bfq_data *bfqd = (struct bfq_data *)data; |
10109 |
++ struct bfq_queue *bfqq; |
10110 |
++ unsigned long flags; |
10111 |
++ enum bfqq_expiration reason; |
10112 |
++ |
10113 |
++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); |
10114 |
++ |
10115 |
++ bfqq = bfqd->in_service_queue; |
10116 |
++ /* |
10117 |
++ * Theoretical race here: the in-service queue can be NULL or different |
10118 |
++ * from the queue that was idling if the timer handler spins on |
10119 |
++ * the queue_lock and a new request arrives for the current |
10120 |
++ * queue and there is a full dispatch cycle that changes the |
10121 |
++ * in-service queue. This can hardly happen, but in the worst case |
10122 |
++ * we just expire a queue too early. |
10123 |
++ */ |
10124 |
++ if (bfqq != NULL) { |
10125 |
++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); |
10126 |
++ if (bfq_bfqq_budget_timeout(bfqq)) |
10127 |
++ /* |
10128 |
++ * Also here the queue can be safely expired |
10129 |
++ * for budget timeout without wasting |
10130 |
++ * guarantees |
10131 |
++ */ |
10132 |
++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
10133 |
++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) |
10134 |
++ /* |
10135 |
++ * The queue may not be empty upon timer expiration, |
10136 |
++ * because we may not disable the timer when the first |
10137 |
++ * request of the in-service queue arrives during |
10138 |
++ * disk idling |
10139 |
++ */ |
10140 |
++ reason = BFQ_BFQQ_TOO_IDLE; |
10141 |
++ else |
10142 |
++ goto schedule_dispatch; |
10143 |
++ |
10144 |
++ bfq_bfqq_expire(bfqd, bfqq, 1, reason); |
10145 |
++ } |
10146 |
++ |
10147 |
++schedule_dispatch: |
10148 |
++ bfq_schedule_dispatch(bfqd); |
10149 |
++ |
10150 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); |
10151 |
++} |
10152 |
++ |
10153 |
++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) |
10154 |
++{ |
10155 |
++ del_timer_sync(&bfqd->idle_slice_timer); |
10156 |
++ cancel_work_sync(&bfqd->unplug_work); |
10157 |
++} |
10158 |
++ |
10159 |
++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, |
10160 |
++ struct bfq_queue **bfqq_ptr) |
10161 |
++{ |
10162 |
++ struct bfq_group *root_group = bfqd->root_group; |
10163 |
++ struct bfq_queue *bfqq = *bfqq_ptr; |
10164 |
++ |
10165 |
++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); |
10166 |
++ if (bfqq != NULL) { |
10167 |
++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); |
10168 |
++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", |
10169 |
++ bfqq, atomic_read(&bfqq->ref)); |
10170 |
++ bfq_put_queue(bfqq); |
10171 |
++ *bfqq_ptr = NULL; |
10172 |
++ } |
10173 |
++} |
10174 |
++ |
10175 |
++/* |
10176 |
++ * Release all the bfqg references to its async queues. If we are |
10177 |
++ * deallocating the group these queues may still contain requests, so |
10178 |
++ * we reparent them to the root cgroup (i.e., the only one that will |
10179 |
++ * exist for sure until all the requests on a device are gone). |
10180 |
++ */ |
10181 |
++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) |
10182 |
++{ |
10183 |
++ int i, j; |
10184 |
++ |
10185 |
++ for (i = 0; i < 2; i++) |
10186 |
++ for (j = 0; j < IOPRIO_BE_NR; j++) |
10187 |
++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); |
10188 |
++ |
10189 |
++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); |
10190 |
++} |
10191 |
++ |
10192 |
++static void bfq_exit_queue(struct elevator_queue *e) |
10193 |
++{ |
10194 |
++ struct bfq_data *bfqd = e->elevator_data; |
10195 |
++ struct request_queue *q = bfqd->queue; |
10196 |
++ struct bfq_queue *bfqq, *n; |
10197 |
++ |
10198 |
++ bfq_shutdown_timer_wq(bfqd); |
10199 |
++ |
10200 |
++ spin_lock_irq(q->queue_lock); |
10201 |
++ |
10202 |
++ BUG_ON(bfqd->in_service_queue != NULL); |
10203 |
++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) |
10204 |
++ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
10205 |
++ |
10206 |
++ bfq_disconnect_groups(bfqd); |
10207 |
++ spin_unlock_irq(q->queue_lock); |
10208 |
++ |
10209 |
++ bfq_shutdown_timer_wq(bfqd); |
10210 |
++ |
10211 |
++ synchronize_rcu(); |
10212 |
++ |
10213 |
++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
10214 |
++ |
10215 |
++ bfq_free_root_group(bfqd); |
10216 |
++ kfree(bfqd); |
10217 |
++} |
10218 |
++ |
10219 |
++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
10220 |
++{ |
10221 |
++ struct bfq_group *bfqg; |
10222 |
++ struct bfq_data *bfqd; |
10223 |
++ struct elevator_queue *eq; |
10224 |
++ |
10225 |
++ eq = elevator_alloc(q, e); |
10226 |
++ if (eq == NULL) |
10227 |
++ return -ENOMEM; |
10228 |
++ |
10229 |
++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); |
10230 |
++ if (bfqd == NULL) { |
10231 |
++ kobject_put(&eq->kobj); |
10232 |
++ return -ENOMEM; |
10233 |
++ } |
10234 |
++ eq->elevator_data = bfqd; |
10235 |
++ |
10236 |
++ /* |
10237 |
++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. |
10238 |
++ * Grab a permanent reference to it, so that the normal code flow |
10239 |
++ * will not attempt to free it. |
10240 |
++ */ |
10241 |
++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); |
10242 |
++ atomic_inc(&bfqd->oom_bfqq.ref); |
10243 |
++ |
10244 |
++ bfqd->queue = q; |
10245 |
++ |
10246 |
++ spin_lock_irq(q->queue_lock); |
10247 |
++ q->elevator = eq; |
10248 |
++ spin_unlock_irq(q->queue_lock); |
10249 |
++ |
10250 |
++ bfqg = bfq_alloc_root_group(bfqd, q->node); |
10251 |
++ if (bfqg == NULL) { |
10252 |
++ kfree(bfqd); |
10253 |
++ kobject_put(&eq->kobj); |
10254 |
++ return -ENOMEM; |
10255 |
++ } |
10256 |
++ |
10257 |
++ bfqd->root_group = bfqg; |
10258 |
++ |
10259 |
++ init_timer(&bfqd->idle_slice_timer); |
10260 |
++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; |
10261 |
++ bfqd->idle_slice_timer.data = (unsigned long)bfqd; |
10262 |
++ |
10263 |
++ bfqd->rq_pos_tree = RB_ROOT; |
10264 |
++ |
10265 |
++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); |
10266 |
++ |
10267 |
++ INIT_LIST_HEAD(&bfqd->active_list); |
10268 |
++ INIT_LIST_HEAD(&bfqd->idle_list); |
10269 |
++ |
10270 |
++ bfqd->hw_tag = -1; |
10271 |
++ |
10272 |
++ bfqd->bfq_max_budget = bfq_default_max_budget; |
10273 |
++ |
10274 |
++ bfqd->bfq_quantum = bfq_quantum; |
10275 |
++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; |
10276 |
++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; |
10277 |
++ bfqd->bfq_back_max = bfq_back_max; |
10278 |
++ bfqd->bfq_back_penalty = bfq_back_penalty; |
10279 |
++ bfqd->bfq_slice_idle = bfq_slice_idle; |
10280 |
++ bfqd->bfq_class_idle_last_service = 0; |
10281 |
++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; |
10282 |
++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; |
10283 |
++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; |
10284 |
++ |
10285 |
++ bfqd->low_latency = true; |
10286 |
++ |
10287 |
++ bfqd->bfq_raising_coeff = 20; |
10288 |
++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); |
10289 |
++ bfqd->bfq_raising_max_time = 0; |
10290 |
++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); |
10291 |
++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); |
10292 |
++ bfqd->bfq_raising_max_softrt_rate = 7000; /* |
10293 |
++ * Approximate rate required |
10294 |
++ * to playback or record a |
10295 |
++ * high-definition compressed |
10296 |
++ * video. |
10297 |
++ */ |
10298 |
++ bfqd->raised_busy_queues = 0; |
10299 |
++ |
10300 |
++ /* Initially estimate the device's peak rate as the reference rate */ |
10301 |
++ if (blk_queue_nonrot(bfqd->queue)) { |
10302 |
++ bfqd->RT_prod = R_nonrot * T_nonrot; |
10303 |
++ bfqd->peak_rate = R_nonrot; |
10304 |
++ } else { |
10305 |
++ bfqd->RT_prod = R_rot * T_rot; |
10306 |
++ bfqd->peak_rate = R_rot; |
10307 |
++ } |
10308 |
++ |
10309 |
++ return 0; |
10310 |
++} |
10311 |
++ |
10312 |
++static void bfq_slab_kill(void) |
10313 |
++{ |
10314 |
++ if (bfq_pool != NULL) |
10315 |
++ kmem_cache_destroy(bfq_pool); |
10316 |
++} |
10317 |
++ |
10318 |
++static int __init bfq_slab_setup(void) |
10319 |
++{ |
10320 |
++ bfq_pool = KMEM_CACHE(bfq_queue, 0); |
10321 |
++ if (bfq_pool == NULL) |
10322 |
++ return -ENOMEM; |
10323 |
++ return 0; |
10324 |
++} |
10325 |
++ |
10326 |
++static ssize_t bfq_var_show(unsigned int var, char *page) |
10327 |
++{ |
10328 |
++ return sprintf(page, "%d\n", var); |
10329 |
++} |
10330 |
++ |
10331 |
++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) |
10332 |
++{ |
10333 |
++ unsigned long new_val; |
10334 |
++ int ret = kstrtoul(page, 10, &new_val); |
10335 |
++ |
10336 |
++ if (ret == 0) |
10337 |
++ *var = new_val; |
10338 |
++ |
10339 |
++ return count; |
10340 |
++} |
10341 |
++ |
10342 |
++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page) |
10343 |
++{ |
10344 |
++ struct bfq_data *bfqd = e->elevator_data; |
10345 |
++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? |
10346 |
++ jiffies_to_msecs(bfqd->bfq_raising_max_time) : |
10347 |
++ jiffies_to_msecs(bfq_wrais_duration(bfqd))); |
10348 |
++} |
10349 |
++ |
10350 |
++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) |
10351 |
++{ |
10352 |
++ struct bfq_queue *bfqq; |
10353 |
++ struct bfq_data *bfqd = e->elevator_data; |
10354 |
++ ssize_t num_char = 0; |
10355 |
++ |
10356 |
++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", |
10357 |
++ bfqd->queued); |
10358 |
++ |
10359 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
10360 |
++ |
10361 |
++ num_char += sprintf(page + num_char, "Active:\n"); |
10362 |
++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { |
10363 |
++ num_char += sprintf(page + num_char, |
10364 |
++ "pid%d: weight %hu, nr_queued %d %d," |
10365 |
++ " dur %d/%u\n", |
10366 |
++ bfqq->pid, |
10367 |
++ bfqq->entity.weight, |
10368 |
++ bfqq->queued[0], |
10369 |
++ bfqq->queued[1], |
10370 |
++ jiffies_to_msecs(jiffies - |
10371 |
++ bfqq->last_rais_start_finish), |
10372 |
++ jiffies_to_msecs(bfqq->raising_cur_max_time)); |
10373 |
++ } |
10374 |
++ |
10375 |
++ num_char += sprintf(page + num_char, "Idle:\n"); |
10376 |
++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { |
10377 |
++ num_char += sprintf(page + num_char, |
10378 |
++ "pid%d: weight %hu, dur %d/%u\n", |
10379 |
++ bfqq->pid, |
10380 |
++ bfqq->entity.weight, |
10381 |
++ jiffies_to_msecs(jiffies - |
10382 |
++ bfqq->last_rais_start_finish), |
10383 |
++ jiffies_to_msecs(bfqq->raising_cur_max_time)); |
10384 |
++ } |
10385 |
++ |
10386 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
10387 |
++ |
10388 |
++ return num_char; |
10389 |
++} |
10390 |
++ |
10391 |
++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ |
10392 |
++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ |
10393 |
++{ \ |
10394 |
++ struct bfq_data *bfqd = e->elevator_data; \ |
10395 |
++ unsigned int __data = __VAR; \ |
10396 |
++ if (__CONV) \ |
10397 |
++ __data = jiffies_to_msecs(__data); \ |
10398 |
++ return bfq_var_show(__data, (page)); \ |
10399 |
++} |
10400 |
++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); |
10401 |
++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); |
10402 |
++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); |
10403 |
++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); |
10404 |
++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); |
10405 |
++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); |
10406 |
++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); |
10407 |
++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); |
10408 |
++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); |
10409 |
++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); |
10410 |
++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); |
10411 |
++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); |
10412 |
++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); |
10413 |
++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, |
10414 |
++ 1); |
10415 |
++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, |
10416 |
++ bfqd->bfq_raising_min_inter_arr_async, |
10417 |
++ 1); |
10418 |
++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, |
10419 |
++ bfqd->bfq_raising_max_softrt_rate, 0); |
10420 |
++#undef SHOW_FUNCTION |
10421 |
++ |
10422 |
++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
10423 |
++static ssize_t \ |
10424 |
++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ |
10425 |
++{ \ |
10426 |
++ struct bfq_data *bfqd = e->elevator_data; \ |
10427 |
++ unsigned long uninitialized_var(__data); \ |
10428 |
++ int ret = bfq_var_store(&__data, (page), count); \ |
10429 |
++ if (__data < (MIN)) \ |
10430 |
++ __data = (MIN); \ |
10431 |
++ else if (__data > (MAX)) \ |
10432 |
++ __data = (MAX); \ |
10433 |
++ if (__CONV) \ |
10434 |
++ *(__PTR) = msecs_to_jiffies(__data); \ |
10435 |
++ else \ |
10436 |
++ *(__PTR) = __data; \ |
10437 |
++ return ret; \ |
10438 |
++} |
10439 |
++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); |
10440 |
++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, |
10441 |
++ INT_MAX, 1); |
10442 |
++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, |
10443 |
++ INT_MAX, 1); |
10444 |
++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); |
10445 |
++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, |
10446 |
++ INT_MAX, 0); |
10447 |
++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); |
10448 |
++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, |
10449 |
++ 1, INT_MAX, 0); |
10450 |
++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, |
10451 |
++ INT_MAX, 1); |
10452 |
++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, |
10453 |
++ INT_MAX, 0); |
10454 |
++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, |
10455 |
++ INT_MAX, 1); |
10456 |
++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, |
10457 |
++ INT_MAX, 1); |
10458 |
++STORE_FUNCTION(bfq_raising_min_idle_time_store, |
10459 |
++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); |
10460 |
++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, |
10461 |
++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); |
10462 |
++STORE_FUNCTION(bfq_raising_max_softrt_rate_store, |
10463 |
++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); |
10464 |
++#undef STORE_FUNCTION |
10465 |
++ |
10466 |
++/* do nothing for the moment */ |
10467 |
++static ssize_t bfq_weights_store(struct elevator_queue *e, |
10468 |
++ const char *page, size_t count) |
10469 |
++{ |
10470 |
++ return count; |
10471 |
++} |
10472 |
++ |
10473 |
++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) |
10474 |
++{ |
10475 |
++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
10476 |
++ |
10477 |
++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) |
10478 |
++ return bfq_calc_max_budget(bfqd->peak_rate, timeout); |
10479 |
++ else |
10480 |
++ return bfq_default_max_budget; |
10481 |
++} |
10482 |
++ |
10483 |
++static ssize_t bfq_max_budget_store(struct elevator_queue *e, |
10484 |
++ const char *page, size_t count) |
10485 |
++{ |
10486 |
++ struct bfq_data *bfqd = e->elevator_data; |
10487 |
++ unsigned long uninitialized_var(__data); |
10488 |
++ int ret = bfq_var_store(&__data, (page), count); |
10489 |
++ |
10490 |
++ if (__data == 0) |
10491 |
++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
10492 |
++ else { |
10493 |
++ if (__data > INT_MAX) |
10494 |
++ __data = INT_MAX; |
10495 |
++ bfqd->bfq_max_budget = __data; |
10496 |
++ } |
10497 |
++ |
10498 |
++ bfqd->bfq_user_max_budget = __data; |
10499 |
++ |
10500 |
++ return ret; |
10501 |
++} |
10502 |
++ |
10503 |
++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, |
10504 |
++ const char *page, size_t count) |
10505 |
++{ |
10506 |
++ struct bfq_data *bfqd = e->elevator_data; |
10507 |
++ unsigned long uninitialized_var(__data); |
10508 |
++ int ret = bfq_var_store(&__data, (page), count); |
10509 |
++ |
10510 |
++ if (__data < 1) |
10511 |
++ __data = 1; |
10512 |
++ else if (__data > INT_MAX) |
10513 |
++ __data = INT_MAX; |
10514 |
++ |
10515 |
++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); |
10516 |
++ if (bfqd->bfq_user_max_budget == 0) |
10517 |
++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
10518 |
++ |
10519 |
++ return ret; |
10520 |
++} |
10521 |
++ |
10522 |
++static ssize_t bfq_low_latency_store(struct elevator_queue *e, |
10523 |
++ const char *page, size_t count) |
10524 |
++{ |
10525 |
++ struct bfq_data *bfqd = e->elevator_data; |
10526 |
++ unsigned long uninitialized_var(__data); |
10527 |
++ int ret = bfq_var_store(&__data, (page), count); |
10528 |
++ |
10529 |
++ if (__data > 1) |
10530 |
++ __data = 1; |
10531 |
++ if (__data == 0 && bfqd->low_latency != 0) |
10532 |
++ bfq_end_raising(bfqd); |
10533 |
++ bfqd->low_latency = __data; |
10534 |
++ |
10535 |
++ return ret; |
10536 |
++} |
10537 |
++ |
10538 |
++#define BFQ_ATTR(name) \ |
10539 |
++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) |
10540 |
++ |
10541 |
++static struct elv_fs_entry bfq_attrs[] = { |
10542 |
++ BFQ_ATTR(quantum), |
10543 |
++ BFQ_ATTR(fifo_expire_sync), |
10544 |
++ BFQ_ATTR(fifo_expire_async), |
10545 |
++ BFQ_ATTR(back_seek_max), |
10546 |
++ BFQ_ATTR(back_seek_penalty), |
10547 |
++ BFQ_ATTR(slice_idle), |
10548 |
++ BFQ_ATTR(max_budget), |
10549 |
++ BFQ_ATTR(max_budget_async_rq), |
10550 |
++ BFQ_ATTR(timeout_sync), |
10551 |
++ BFQ_ATTR(timeout_async), |
10552 |
++ BFQ_ATTR(low_latency), |
10553 |
++ BFQ_ATTR(raising_coeff), |
10554 |
++ BFQ_ATTR(raising_max_time), |
10555 |
++ BFQ_ATTR(raising_rt_max_time), |
10556 |
++ BFQ_ATTR(raising_min_idle_time), |
10557 |
++ BFQ_ATTR(raising_min_inter_arr_async), |
10558 |
++ BFQ_ATTR(raising_max_softrt_rate), |
10559 |
++ BFQ_ATTR(weights), |
10560 |
++ __ATTR_NULL |
10561 |
++}; |
10562 |
++ |
10563 |
++static struct elevator_type iosched_bfq = { |
10564 |
++ .ops = { |
10565 |
++ .elevator_merge_fn = bfq_merge, |
10566 |
++ .elevator_merged_fn = bfq_merged_request, |
10567 |
++ .elevator_merge_req_fn = bfq_merged_requests, |
10568 |
++ .elevator_allow_merge_fn = bfq_allow_merge, |
10569 |
++ .elevator_dispatch_fn = bfq_dispatch_requests, |
10570 |
++ .elevator_add_req_fn = bfq_insert_request, |
10571 |
++ .elevator_activate_req_fn = bfq_activate_request, |
10572 |
++ .elevator_deactivate_req_fn = bfq_deactivate_request, |
10573 |
++ .elevator_completed_req_fn = bfq_completed_request, |
10574 |
++ .elevator_former_req_fn = elv_rb_former_request, |
10575 |
++ .elevator_latter_req_fn = elv_rb_latter_request, |
10576 |
++ .elevator_init_icq_fn = bfq_init_icq, |
10577 |
++ .elevator_exit_icq_fn = bfq_exit_icq, |
10578 |
++ .elevator_set_req_fn = bfq_set_request, |
10579 |
++ .elevator_put_req_fn = bfq_put_request, |
10580 |
++ .elevator_may_queue_fn = bfq_may_queue, |
10581 |
++ .elevator_init_fn = bfq_init_queue, |
10582 |
++ .elevator_exit_fn = bfq_exit_queue, |
10583 |
++ }, |
10584 |
++ .icq_size = sizeof(struct bfq_io_cq), |
10585 |
++ .icq_align = __alignof__(struct bfq_io_cq), |
10586 |
++ .elevator_attrs = bfq_attrs, |
10587 |
++ .elevator_name = "bfq", |
10588 |
++ .elevator_owner = THIS_MODULE, |
10589 |
++}; |
10590 |
++ |
10591 |
++static int __init bfq_init(void) |
10592 |
++{ |
10593 |
++ /* |
10594 |
++ * Can be 0 on HZ < 1000 setups. |
10595 |
++ */ |
10596 |
++ if (bfq_slice_idle == 0) |
10597 |
++ bfq_slice_idle = 1; |
10598 |
++ |
10599 |
++ if (bfq_timeout_async == 0) |
10600 |
++ bfq_timeout_async = 1; |
10601 |
++ |
10602 |
++ if (bfq_slab_setup()) |
10603 |
++ return -ENOMEM; |
10604 |
++ |
10605 |
++ elv_register(&iosched_bfq); |
10606 |
++ printk(KERN_INFO "BFQ I/O-scheduler version: v7r1"); |
10607 |
++ |
10608 |
++ return 0; |
10609 |
++} |
10610 |
++ |
10611 |
++static void __exit bfq_exit(void) |
10612 |
++{ |
10613 |
++ elv_unregister(&iosched_bfq); |
10614 |
++ bfq_slab_kill(); |
10615 |
++} |
10616 |
++ |
10617 |
++module_init(bfq_init); |
10618 |
++module_exit(bfq_exit); |
10619 |
++ |
10620 |
++MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); |
10621 |
++MODULE_LICENSE("GPL"); |
10622 |
++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); |
10623 |
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c |
10624 |
+new file mode 100644 |
10625 |
+index 0000000..999b475 |
10626 |
+--- /dev/null |
10627 |
++++ b/block/bfq-sched.c |
10628 |
+@@ -0,0 +1,1078 @@ |
10629 |
++/* |
10630 |
++ * BFQ: Hierarchical B-WF2Q+ scheduler. |
10631 |
++ * |
10632 |
++ * Based on ideas and code from CFQ: |
10633 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
10634 |
++ * |
10635 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
10636 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
10637 |
++ * |
10638 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
10639 |
++ */ |
10640 |
++ |
10641 |
++#ifdef CONFIG_CGROUP_BFQIO |
10642 |
++#define for_each_entity(entity) \ |
10643 |
++ for (; entity != NULL; entity = entity->parent) |
10644 |
++ |
10645 |
++#define for_each_entity_safe(entity, parent) \ |
10646 |
++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) |
10647 |
++ |
10648 |
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
10649 |
++ int extract, |
10650 |
++ struct bfq_data *bfqd); |
10651 |
++ |
10652 |
++static inline void bfq_update_budget(struct bfq_entity *next_in_service) |
10653 |
++{ |
10654 |
++ struct bfq_entity *bfqg_entity; |
10655 |
++ struct bfq_group *bfqg; |
10656 |
++ struct bfq_sched_data *group_sd; |
10657 |
++ |
10658 |
++ BUG_ON(next_in_service == NULL); |
10659 |
++ |
10660 |
++ group_sd = next_in_service->sched_data; |
10661 |
++ |
10662 |
++ bfqg = container_of(group_sd, struct bfq_group, sched_data); |
10663 |
++ /* |
10664 |
++ * bfq_group's my_entity field is not NULL only if the group |
10665 |
++ * is not the root group. We must not touch the root entity |
10666 |
++ * as it must never become an in-service entity. |
10667 |
++ */ |
10668 |
++ bfqg_entity = bfqg->my_entity; |
10669 |
++ if (bfqg_entity != NULL) |
10670 |
++ bfqg_entity->budget = next_in_service->budget; |
10671 |
++} |
10672 |
++ |
10673 |
++static int bfq_update_next_in_service(struct bfq_sched_data *sd) |
10674 |
++{ |
10675 |
++ struct bfq_entity *next_in_service; |
10676 |
++ |
10677 |
++ if (sd->in_service_entity != NULL) |
10678 |
++ /* will update/requeue at the end of service */ |
10679 |
++ return 0; |
10680 |
++ |
10681 |
++ /* |
10682 |
++ * NOTE: this can be improved in many ways, such as returning |
10683 |
++ * 1 (and thus propagating the update upwards) only when the |
10684 |
++ * budget changes, or caching the bfqq that will be scheduled |
10685 |
++ * next from this subtree. By now we worry more about |
10686 |
++ * correctness than about performance... |
10687 |
++ */ |
10688 |
++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL); |
10689 |
++ sd->next_in_service = next_in_service; |
10690 |
++ |
10691 |
++ if (next_in_service != NULL) |
10692 |
++ bfq_update_budget(next_in_service); |
10693 |
++ |
10694 |
++ return 1; |
10695 |
++} |
10696 |
++ |
10697 |
++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, |
10698 |
++ struct bfq_entity *entity) |
10699 |
++{ |
10700 |
++ BUG_ON(sd->next_in_service != entity); |
10701 |
++} |
10702 |
++#else |
10703 |
++#define for_each_entity(entity) \ |
10704 |
++ for (; entity != NULL; entity = NULL) |
10705 |
++ |
10706 |
++#define for_each_entity_safe(entity, parent) \ |
10707 |
++ for (parent = NULL; entity != NULL; entity = parent) |
10708 |
++ |
10709 |
++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) |
10710 |
++{ |
10711 |
++ return 0; |
10712 |
++} |
10713 |
++ |
10714 |
++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, |
10715 |
++ struct bfq_entity *entity) |
10716 |
++{ |
10717 |
++} |
10718 |
++ |
10719 |
++static inline void bfq_update_budget(struct bfq_entity *next_in_service) |
10720 |
++{ |
10721 |
++} |
10722 |
++#endif |
10723 |
++ |
10724 |
++/* |
10725 |
++ * Shift for timestamp calculations. This actually limits the maximum |
10726 |
++ * service allowed in one timestamp delta (small shift values increase it), |
10727 |
++ * the maximum total weight that can be used for the queues in the system |
10728 |
++ * (big shift values increase it), and the period of virtual time wraparounds. |
10729 |
++ */ |
10730 |
++#define WFQ_SERVICE_SHIFT 22 |
10731 |
++ |
10732 |
++/** |
10733 |
++ * bfq_gt - compare two timestamps. |
10734 |
++ * @a: first ts. |
10735 |
++ * @b: second ts. |
10736 |
++ * |
10737 |
++ * Return @a > @b, dealing with wrapping correctly. |
10738 |
++ */ |
10739 |
++static inline int bfq_gt(u64 a, u64 b) |
10740 |
++{ |
10741 |
++ return (s64)(a - b) > 0; |
10742 |
++} |
10743 |
++ |
10744 |
++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) |
10745 |
++{ |
10746 |
++ struct bfq_queue *bfqq = NULL; |
10747 |
++ |
10748 |
++ BUG_ON(entity == NULL); |
10749 |
++ |
10750 |
++ if (entity->my_sched_data == NULL) |
10751 |
++ bfqq = container_of(entity, struct bfq_queue, entity); |
10752 |
++ |
10753 |
++ return bfqq; |
10754 |
++} |
10755 |
++ |
10756 |
++ |
10757 |
++/** |
10758 |
++ * bfq_delta - map service into the virtual time domain. |
10759 |
++ * @service: amount of service. |
10760 |
++ * @weight: scale factor (weight of an entity or weight sum). |
10761 |
++ */ |
10762 |
++static inline u64 bfq_delta(unsigned long service, |
10763 |
++ unsigned long weight) |
10764 |
++{ |
10765 |
++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; |
10766 |
++ |
10767 |
++ do_div(d, weight); |
10768 |
++ return d; |
10769 |
++} |
10770 |
++ |
10771 |
++/** |
10772 |
++ * bfq_calc_finish - assign the finish time to an entity. |
10773 |
++ * @entity: the entity to act upon. |
10774 |
++ * @service: the service to be charged to the entity. |
10775 |
++ */ |
10776 |
++static inline void bfq_calc_finish(struct bfq_entity *entity, |
10777 |
++ unsigned long service) |
10778 |
++{ |
10779 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
10780 |
++ |
10781 |
++ BUG_ON(entity->weight == 0); |
10782 |
++ |
10783 |
++ entity->finish = entity->start + |
10784 |
++ bfq_delta(service, entity->weight); |
10785 |
++ |
10786 |
++ if (bfqq != NULL) { |
10787 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
10788 |
++ "calc_finish: serv %lu, w %d", |
10789 |
++ service, entity->weight); |
10790 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
10791 |
++ "calc_finish: start %llu, finish %llu, delta %llu", |
10792 |
++ entity->start, entity->finish, |
10793 |
++ bfq_delta(service, entity->weight)); |
10794 |
++ } |
10795 |
++} |
10796 |
++ |
10797 |
++/** |
10798 |
++ * bfq_entity_of - get an entity from a node. |
10799 |
++ * @node: the node field of the entity. |
10800 |
++ * |
10801 |
++ * Convert a node pointer to the corresponding entity. This is used only |
10802 |
++ * to simplify the logic of some functions and not as the generic |
10803 |
++ * conversion mechanism because, e.g., in the tree walking functions, |
10804 |
++ * the check for a %NULL value would be redundant. |
10805 |
++ */ |
10806 |
++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) |
10807 |
++{ |
10808 |
++ struct bfq_entity *entity = NULL; |
10809 |
++ |
10810 |
++ if (node != NULL) |
10811 |
++ entity = rb_entry(node, struct bfq_entity, rb_node); |
10812 |
++ |
10813 |
++ return entity; |
10814 |
++} |
10815 |
++ |
10816 |
++/** |
10817 |
++ * bfq_extract - remove an entity from a tree. |
10818 |
++ * @root: the tree root. |
10819 |
++ * @entity: the entity to remove. |
10820 |
++ */ |
10821 |
++static inline void bfq_extract(struct rb_root *root, |
10822 |
++ struct bfq_entity *entity) |
10823 |
++{ |
10824 |
++ BUG_ON(entity->tree != root); |
10825 |
++ |
10826 |
++ entity->tree = NULL; |
10827 |
++ rb_erase(&entity->rb_node, root); |
10828 |
++} |
10829 |
++ |
10830 |
++/** |
10831 |
++ * bfq_idle_extract - extract an entity from the idle tree. |
10832 |
++ * @st: the service tree of the owning @entity. |
10833 |
++ * @entity: the entity being removed. |
10834 |
++ */ |
10835 |
++static void bfq_idle_extract(struct bfq_service_tree *st, |
10836 |
++ struct bfq_entity *entity) |
10837 |
++{ |
10838 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
10839 |
++ struct rb_node *next; |
10840 |
++ |
10841 |
++ BUG_ON(entity->tree != &st->idle); |
10842 |
++ |
10843 |
++ if (entity == st->first_idle) { |
10844 |
++ next = rb_next(&entity->rb_node); |
10845 |
++ st->first_idle = bfq_entity_of(next); |
10846 |
++ } |
10847 |
++ |
10848 |
++ if (entity == st->last_idle) { |
10849 |
++ next = rb_prev(&entity->rb_node); |
10850 |
++ st->last_idle = bfq_entity_of(next); |
10851 |
++ } |
10852 |
++ |
10853 |
++ bfq_extract(&st->idle, entity); |
10854 |
++ |
10855 |
++ if (bfqq != NULL) |
10856 |
++ list_del(&bfqq->bfqq_list); |
10857 |
++} |
10858 |
++ |
10859 |
++/** |
10860 |
++ * bfq_insert - generic tree insertion. |
10861 |
++ * @root: tree root. |
10862 |
++ * @entity: entity to insert. |
10863 |
++ * |
10864 |
++ * This is used for the idle and the active tree, since they are both |
10865 |
++ * ordered by finish time. |
10866 |
++ */ |
10867 |
++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) |
10868 |
++{ |
10869 |
++ struct bfq_entity *entry; |
10870 |
++ struct rb_node **node = &root->rb_node; |
10871 |
++ struct rb_node *parent = NULL; |
10872 |
++ |
10873 |
++ BUG_ON(entity->tree != NULL); |
10874 |
++ |
10875 |
++ while (*node != NULL) { |
10876 |
++ parent = *node; |
10877 |
++ entry = rb_entry(parent, struct bfq_entity, rb_node); |
10878 |
++ |
10879 |
++ if (bfq_gt(entry->finish, entity->finish)) |
10880 |
++ node = &parent->rb_left; |
10881 |
++ else |
10882 |
++ node = &parent->rb_right; |
10883 |
++ } |
10884 |
++ |
10885 |
++ rb_link_node(&entity->rb_node, parent, node); |
10886 |
++ rb_insert_color(&entity->rb_node, root); |
10887 |
++ |
10888 |
++ entity->tree = root; |
10889 |
++} |
10890 |
++ |
10891 |
++/** |
10892 |
++ * bfq_update_min - update the min_start field of an entity. |
10893 |
++ * @entity: the entity to update. |
10894 |
++ * @node: one of its children. |
10895 |
++ * |
10896 |
++ * This function is called when @entity may store an invalid value for |
10897 |
++ * min_start due to updates to the active tree. The function assumes |
10898 |
++ * that the subtree rooted at @node (which may be its left or its right |
10899 |
++ * child) has a valid min_start value. |
10900 |
++ */ |
10901 |
++static inline void bfq_update_min(struct bfq_entity *entity, |
10902 |
++ struct rb_node *node) |
10903 |
++{ |
10904 |
++ struct bfq_entity *child; |
10905 |
++ |
10906 |
++ if (node != NULL) { |
10907 |
++ child = rb_entry(node, struct bfq_entity, rb_node); |
10908 |
++ if (bfq_gt(entity->min_start, child->min_start)) |
10909 |
++ entity->min_start = child->min_start; |
10910 |
++ } |
10911 |
++} |
10912 |
++ |
10913 |
++/** |
10914 |
++ * bfq_update_active_node - recalculate min_start. |
10915 |
++ * @node: the node to update. |
10916 |
++ * |
10917 |
++ * @node may have changed position or one of its children may have moved, |
10918 |
++ * this function updates its min_start value. The left and right subtrees |
10919 |
++ * are assumed to hold a correct min_start value. |
10920 |
++ */ |
10921 |
++static inline void bfq_update_active_node(struct rb_node *node) |
10922 |
++{ |
10923 |
++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); |
10924 |
++ |
10925 |
++ entity->min_start = entity->start; |
10926 |
++ bfq_update_min(entity, node->rb_right); |
10927 |
++ bfq_update_min(entity, node->rb_left); |
10928 |
++} |
10929 |
++ |
10930 |
++/** |
10931 |
++ * bfq_update_active_tree - update min_start for the whole active tree. |
10932 |
++ * @node: the starting node. |
10933 |
++ * |
10934 |
++ * @node must be the deepest modified node after an update. This function |
10935 |
++ * updates its min_start using the values held by its children, assuming |
10936 |
++ * that they did not change, and then updates all the nodes that may have |
10937 |
++ * changed in the path to the root. The only nodes that may have changed |
10938 |
++ * are the ones in the path or their siblings. |
10939 |
++ */ |
10940 |
++static void bfq_update_active_tree(struct rb_node *node) |
10941 |
++{ |
10942 |
++ struct rb_node *parent; |
10943 |
++ |
10944 |
++up: |
10945 |
++ bfq_update_active_node(node); |
10946 |
++ |
10947 |
++ parent = rb_parent(node); |
10948 |
++ if (parent == NULL) |
10949 |
++ return; |
10950 |
++ |
10951 |
++ if (node == parent->rb_left && parent->rb_right != NULL) |
10952 |
++ bfq_update_active_node(parent->rb_right); |
10953 |
++ else if (parent->rb_left != NULL) |
10954 |
++ bfq_update_active_node(parent->rb_left); |
10955 |
++ |
10956 |
++ node = parent; |
10957 |
++ goto up; |
10958 |
++} |
10959 |
++ |
10960 |
++/** |
10961 |
++ * bfq_active_insert - insert an entity in the active tree of its group/device. |
10962 |
++ * @st: the service tree of the entity. |
10963 |
++ * @entity: the entity being inserted. |
10964 |
++ * |
10965 |
++ * The active tree is ordered by finish time, but an extra key is kept |
10966 |
++ * per each node, containing the minimum value for the start times of |
10967 |
++ * its children (and the node itself), so it's possible to search for |
10968 |
++ * the eligible node with the lowest finish time in logarithmic time. |
10969 |
++ */ |
10970 |
++static void bfq_active_insert(struct bfq_service_tree *st, |
10971 |
++ struct bfq_entity *entity) |
10972 |
++{ |
10973 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
10974 |
++ struct rb_node *node = &entity->rb_node; |
10975 |
++ |
10976 |
++ bfq_insert(&st->active, entity); |
10977 |
++ |
10978 |
++ if (node->rb_left != NULL) |
10979 |
++ node = node->rb_left; |
10980 |
++ else if (node->rb_right != NULL) |
10981 |
++ node = node->rb_right; |
10982 |
++ |
10983 |
++ bfq_update_active_tree(node); |
10984 |
++ |
10985 |
++ if (bfqq != NULL) |
10986 |
++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); |
10987 |
++} |
10988 |
++ |
10989 |
++/** |
10990 |
++ * bfq_ioprio_to_weight - calc a weight from an ioprio. |
10991 |
++ * @ioprio: the ioprio value to convert. |
10992 |
++ */ |
10993 |
++static unsigned short bfq_ioprio_to_weight(int ioprio) |
10994 |
++{ |
10995 |
++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); |
10996 |
++ return IOPRIO_BE_NR - ioprio; |
10997 |
++} |
10998 |
++ |
10999 |
++/** |
11000 |
++ * bfq_weight_to_ioprio - calc an ioprio from a weight. |
11001 |
++ * @weight: the weight value to convert. |
11002 |
++ * |
11003 |
++ * To preserve as much as possible the old only-ioprio user interface, |
11004 |
++ * 0 is used as an escape ioprio value for weights (numerically) equal to or |
11005 |
++ * larger than IOPRIO_BE_NR |
11006 |
++ */ |
11007 |
++static unsigned short bfq_weight_to_ioprio(int weight) |
11008 |
++{ |
11009 |
++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); |
11010 |
++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; |
11011 |
++} |
11012 |
++ |
11013 |
++static inline void bfq_get_entity(struct bfq_entity *entity) |
11014 |
++{ |
11015 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
11016 |
++ struct bfq_sched_data *sd; |
11017 |
++ |
11018 |
++ if (bfqq != NULL) { |
11019 |
++ sd = entity->sched_data; |
11020 |
++ atomic_inc(&bfqq->ref); |
11021 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", |
11022 |
++ bfqq, atomic_read(&bfqq->ref)); |
11023 |
++ } |
11024 |
++} |
11025 |
++ |
11026 |
++/** |
11027 |
++ * bfq_find_deepest - find the deepest node that an extraction can modify. |
11028 |
++ * @node: the node being removed. |
11029 |
++ * |
11030 |
++ * Do the first step of an extraction in an rb tree, looking for the |
11031 |
++ * node that will replace @node, and returning the deepest node that |
11032 |
++ * the following modifications to the tree can touch. If @node is the |
11033 |
++ * last node in the tree return %NULL. |
11034 |
++ */ |
11035 |
++static struct rb_node *bfq_find_deepest(struct rb_node *node) |
11036 |
++{ |
11037 |
++ struct rb_node *deepest; |
11038 |
++ |
11039 |
++ if (node->rb_right == NULL && node->rb_left == NULL) |
11040 |
++ deepest = rb_parent(node); |
11041 |
++ else if (node->rb_right == NULL) |
11042 |
++ deepest = node->rb_left; |
11043 |
++ else if (node->rb_left == NULL) |
11044 |
++ deepest = node->rb_right; |
11045 |
++ else { |
11046 |
++ deepest = rb_next(node); |
11047 |
++ if (deepest->rb_right != NULL) |
11048 |
++ deepest = deepest->rb_right; |
11049 |
++ else if (rb_parent(deepest) != node) |
11050 |
++ deepest = rb_parent(deepest); |
11051 |
++ } |
11052 |
++ |
11053 |
++ return deepest; |
11054 |
++} |
11055 |
++ |
11056 |
++/** |
11057 |
++ * bfq_active_extract - remove an entity from the active tree. |
11058 |
++ * @st: the service_tree containing the tree. |
11059 |
++ * @entity: the entity being removed. |
11060 |
++ */ |
11061 |
++static void bfq_active_extract(struct bfq_service_tree *st, |
11062 |
++ struct bfq_entity *entity) |
11063 |
++{ |
11064 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
11065 |
++ struct rb_node *node; |
11066 |
++ |
11067 |
++ node = bfq_find_deepest(&entity->rb_node); |
11068 |
++ bfq_extract(&st->active, entity); |
11069 |
++ |
11070 |
++ if (node != NULL) |
11071 |
++ bfq_update_active_tree(node); |
11072 |
++ |
11073 |
++ if (bfqq != NULL) |
11074 |
++ list_del(&bfqq->bfqq_list); |
11075 |
++} |
11076 |
++ |
11077 |
++/** |
11078 |
++ * bfq_idle_insert - insert an entity into the idle tree. |
11079 |
++ * @st: the service tree containing the tree. |
11080 |
++ * @entity: the entity to insert. |
11081 |
++ */ |
11082 |
++static void bfq_idle_insert(struct bfq_service_tree *st, |
11083 |
++ struct bfq_entity *entity) |
11084 |
++{ |
11085 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
11086 |
++ struct bfq_entity *first_idle = st->first_idle; |
11087 |
++ struct bfq_entity *last_idle = st->last_idle; |
11088 |
++ |
11089 |
++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) |
11090 |
++ st->first_idle = entity; |
11091 |
++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) |
11092 |
++ st->last_idle = entity; |
11093 |
++ |
11094 |
++ bfq_insert(&st->idle, entity); |
11095 |
++ |
11096 |
++ if (bfqq != NULL) |
11097 |
++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); |
11098 |
++} |
11099 |
++ |
11100 |
++/** |
11101 |
++ * bfq_forget_entity - remove an entity from the wfq trees. |
11102 |
++ * @st: the service tree. |
11103 |
++ * @entity: the entity being removed. |
11104 |
++ * |
11105 |
++ * Update the device status and forget everything about @entity, putting |
11106 |
++ * the device reference to it, if it is a queue. Entities belonging to |
11107 |
++ * groups are not refcounted. |
11108 |
++ */ |
11109 |
++static void bfq_forget_entity(struct bfq_service_tree *st, |
11110 |
++ struct bfq_entity *entity) |
11111 |
++{ |
11112 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
11113 |
++ struct bfq_sched_data *sd; |
11114 |
++ |
11115 |
++ BUG_ON(!entity->on_st); |
11116 |
++ |
11117 |
++ entity->on_st = 0; |
11118 |
++ st->wsum -= entity->weight; |
11119 |
++ if (bfqq != NULL) { |
11120 |
++ sd = entity->sched_data; |
11121 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", |
11122 |
++ bfqq, atomic_read(&bfqq->ref)); |
11123 |
++ bfq_put_queue(bfqq); |
11124 |
++ } |
11125 |
++} |
11126 |
++ |
11127 |
++/** |
11128 |
++ * bfq_put_idle_entity - release the idle tree ref of an entity. |
11129 |
++ * @st: service tree for the entity. |
11130 |
++ * @entity: the entity being released. |
11131 |
++ */ |
11132 |
++static void bfq_put_idle_entity(struct bfq_service_tree *st, |
11133 |
++ struct bfq_entity *entity) |
11134 |
++{ |
11135 |
++ bfq_idle_extract(st, entity); |
11136 |
++ bfq_forget_entity(st, entity); |
11137 |
++} |
11138 |
++ |
11139 |
++/** |
11140 |
++ * bfq_forget_idle - update the idle tree if necessary. |
11141 |
++ * @st: the service tree to act upon. |
11142 |
++ * |
11143 |
++ * To preserve the global O(log N) complexity we only remove one entry here; |
11144 |
++ * as the idle tree will not grow indefinitely this can be done safely. |
11145 |
++ */ |
11146 |
++static void bfq_forget_idle(struct bfq_service_tree *st) |
11147 |
++{ |
11148 |
++ struct bfq_entity *first_idle = st->first_idle; |
11149 |
++ struct bfq_entity *last_idle = st->last_idle; |
11150 |
++ |
11151 |
++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && |
11152 |
++ !bfq_gt(last_idle->finish, st->vtime)) { |
11153 |
++ /* |
11154 |
++ * Forget the whole idle tree, increasing the vtime past |
11155 |
++ * the last finish time of idle entities. |
11156 |
++ */ |
11157 |
++ st->vtime = last_idle->finish; |
11158 |
++ } |
11159 |
++ |
11160 |
++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) |
11161 |
++ bfq_put_idle_entity(st, first_idle); |
11162 |
++} |
11163 |
++ |
11164 |
++static struct bfq_service_tree * |
11165 |
++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, |
11166 |
++ struct bfq_entity *entity) |
11167 |
++{ |
11168 |
++ struct bfq_service_tree *new_st = old_st; |
11169 |
++ |
11170 |
++ if (entity->ioprio_changed) { |
11171 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
11172 |
++ |
11173 |
++ BUG_ON(old_st->wsum < entity->weight); |
11174 |
++ old_st->wsum -= entity->weight; |
11175 |
++ |
11176 |
++ if (entity->new_weight != entity->orig_weight) { |
11177 |
++ entity->orig_weight = entity->new_weight; |
11178 |
++ entity->ioprio = |
11179 |
++ bfq_weight_to_ioprio(entity->orig_weight); |
11180 |
++ } else if (entity->new_ioprio != entity->ioprio) { |
11181 |
++ entity->ioprio = entity->new_ioprio; |
11182 |
++ entity->orig_weight = |
11183 |
++ bfq_ioprio_to_weight(entity->ioprio); |
11184 |
++ } else |
11185 |
++ entity->new_weight = entity->orig_weight = |
11186 |
++ bfq_ioprio_to_weight(entity->ioprio); |
11187 |
++ |
11188 |
++ entity->ioprio_class = entity->new_ioprio_class; |
11189 |
++ entity->ioprio_changed = 0; |
11190 |
++ |
11191 |
++ /* |
11192 |
++ * NOTE: here we may be changing the weight too early, |
11193 |
++ * this will cause unfairness. The correct approach |
11194 |
++ * would have required additional complexity to defer |
11195 |
++ * weight changes to the proper time instants (i.e., |
11196 |
++ * when entity->finish <= old_st->vtime). |
11197 |
++ */ |
11198 |
++ new_st = bfq_entity_service_tree(entity); |
11199 |
++ entity->weight = entity->orig_weight * |
11200 |
++ (bfqq != NULL ? bfqq->raising_coeff : 1); |
11201 |
++ new_st->wsum += entity->weight; |
11202 |
++ |
11203 |
++ if (new_st != old_st) |
11204 |
++ entity->start = new_st->vtime; |
11205 |
++ } |
11206 |
++ |
11207 |
++ return new_st; |
11208 |
++} |
11209 |
++ |
11210 |
++/** |
11211 |
++ * bfq_bfqq_served - update the scheduler status after selection for service. |
11212 |
++ * @bfqq: the queue being served. |
11213 |
++ * @served: bytes to transfer. |
11214 |
++ * |
11215 |
++ * NOTE: this can be optimized, as the timestamps of upper level entities |
11216 |
++ * are synchronized every time a new bfqq is selected for service. By now, |
11217 |
++ * we keep it to better check consistency. |
11218 |
++ */ |
11219 |
++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) |
11220 |
++{ |
11221 |
++ struct bfq_entity *entity = &bfqq->entity; |
11222 |
++ struct bfq_service_tree *st; |
11223 |
++ |
11224 |
++ for_each_entity(entity) { |
11225 |
++ st = bfq_entity_service_tree(entity); |
11226 |
++ |
11227 |
++ entity->service += served; |
11228 |
++ BUG_ON(entity->service > entity->budget); |
11229 |
++ BUG_ON(st->wsum == 0); |
11230 |
++ |
11231 |
++ st->vtime += bfq_delta(served, st->wsum); |
11232 |
++ bfq_forget_idle(st); |
11233 |
++ } |
11234 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); |
11235 |
++} |
11236 |
++ |
11237 |
++/** |
11238 |
++ * bfq_bfqq_charge_full_budget - set the service to the entity budget. |
11239 |
++ * @bfqq: the queue that needs a service update. |
11240 |
++ * |
11241 |
++ * When it's not possible to be fair in the service domain, because |
11242 |
++ * a queue is not consuming its budget fast enough (the meaning of |
11243 |
++ * fast depends on the timeout parameter), we charge it a full |
11244 |
++ * budget. In this way we should obtain a sort of time-domain |
11245 |
++ * fairness among all the seeky/slow queues. |
11246 |
++ */ |
11247 |
++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) |
11248 |
++{ |
11249 |
++ struct bfq_entity *entity = &bfqq->entity; |
11250 |
++ |
11251 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); |
11252 |
++ |
11253 |
++ bfq_bfqq_served(bfqq, entity->budget - entity->service); |
11254 |
++} |
11255 |
++ |
11256 |
++/** |
11257 |
++ * __bfq_activate_entity - activate an entity. |
11258 |
++ * @entity: the entity being activated. |
11259 |
++ * |
11260 |
++ * Called whenever an entity is activated, i.e., it is not active and one |
11261 |
++ * of its children receives a new request, or has to be reactivated due to |
11262 |
++ * budget exhaustion. It uses the current budget of the entity (and the |
11263 |
++ * service received if @entity is active) of the queue to calculate its |
11264 |
++ * timestamps. |
11265 |
++ */ |
11266 |
++static void __bfq_activate_entity(struct bfq_entity *entity) |
11267 |
++{ |
11268 |
++ struct bfq_sched_data *sd = entity->sched_data; |
11269 |
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
11270 |
++ |
11271 |
++ if (entity == sd->in_service_entity) { |
11272 |
++ BUG_ON(entity->tree != NULL); |
11273 |
++ /* |
11274 |
++ * If we are requeueing the current entity we have |
11275 |
++ * to take care of not charging to it service it has |
11276 |
++ * not received. |
11277 |
++ */ |
11278 |
++ bfq_calc_finish(entity, entity->service); |
11279 |
++ entity->start = entity->finish; |
11280 |
++ sd->in_service_entity = NULL; |
11281 |
++ } else if (entity->tree == &st->active) { |
11282 |
++ /* |
11283 |
++ * Requeueing an entity due to a change of some |
11284 |
++ * next_in_service entity below it. We reuse the |
11285 |
++ * old start time. |
11286 |
++ */ |
11287 |
++ bfq_active_extract(st, entity); |
11288 |
++ } else if (entity->tree == &st->idle) { |
11289 |
++ /* |
11290 |
++ * Must be on the idle tree, bfq_idle_extract() will |
11291 |
++ * check for that. |
11292 |
++ */ |
11293 |
++ bfq_idle_extract(st, entity); |
11294 |
++ entity->start = bfq_gt(st->vtime, entity->finish) ? |
11295 |
++ st->vtime : entity->finish; |
11296 |
++ } else { |
11297 |
++ /* |
11298 |
++ * The finish time of the entity may be invalid, and |
11299 |
++ * it is in the past for sure, otherwise the queue |
11300 |
++ * would have been on the idle tree. |
11301 |
++ */ |
11302 |
++ entity->start = st->vtime; |
11303 |
++ st->wsum += entity->weight; |
11304 |
++ bfq_get_entity(entity); |
11305 |
++ |
11306 |
++ BUG_ON(entity->on_st); |
11307 |
++ entity->on_st = 1; |
11308 |
++ } |
11309 |
++ |
11310 |
++ st = __bfq_entity_update_weight_prio(st, entity); |
11311 |
++ bfq_calc_finish(entity, entity->budget); |
11312 |
++ bfq_active_insert(st, entity); |
11313 |
++} |
11314 |
++ |
11315 |
++/** |
11316 |
++ * bfq_activate_entity - activate an entity and its ancestors if necessary. |
11317 |
++ * @entity: the entity to activate. |
11318 |
++ * |
11319 |
++ * Activate @entity and all the entities on the path from it to the root. |
11320 |
++ */ |
11321 |
++static void bfq_activate_entity(struct bfq_entity *entity) |
11322 |
++{ |
11323 |
++ struct bfq_sched_data *sd; |
11324 |
++ |
11325 |
++ for_each_entity(entity) { |
11326 |
++ __bfq_activate_entity(entity); |
11327 |
++ |
11328 |
++ sd = entity->sched_data; |
11329 |
++ if (!bfq_update_next_in_service(sd)) |
11330 |
++ /* |
11331 |
++ * No need to propagate the activation to the |
11332 |
++ * upper entities, as they will be updated when |
11333 |
++ * the in-service entity is rescheduled. |
11334 |
++ */ |
11335 |
++ break; |
11336 |
++ } |
11337 |
++} |
11338 |
++ |
11339 |
++/** |
11340 |
++ * __bfq_deactivate_entity - deactivate an entity from its service tree. |
11341 |
++ * @entity: the entity to deactivate. |
11342 |
++ * @requeue: if false, the entity will not be put into the idle tree. |
11343 |
++ * |
11344 |
++ * Deactivate an entity, independently from its previous state. If the |
11345 |
++ * entity was not on a service tree just return, otherwise if it is on |
11346 |
++ * any scheduler tree, extract it from that tree, and if necessary |
11347 |
++ * and if the caller did not specify @requeue, put it on the idle tree. |
11348 |
++ * |
11349 |
++ * Return %1 if the caller should update the entity hierarchy, i.e., |
11350 |
++ * if the entity was under service or if it was the next_in_service for |
11351 |
++ * its sched_data; return %0 otherwise. |
11352 |
++ */ |
11353 |
++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
11354 |
++{ |
11355 |
++ struct bfq_sched_data *sd = entity->sched_data; |
11356 |
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
11357 |
++ int was_in_service = entity == sd->in_service_entity; |
11358 |
++ int ret = 0; |
11359 |
++ |
11360 |
++ if (!entity->on_st) |
11361 |
++ return 0; |
11362 |
++ |
11363 |
++ BUG_ON(was_in_service && entity->tree != NULL); |
11364 |
++ |
11365 |
++ if (was_in_service) { |
11366 |
++ bfq_calc_finish(entity, entity->service); |
11367 |
++ sd->in_service_entity = NULL; |
11368 |
++ } else if (entity->tree == &st->active) |
11369 |
++ bfq_active_extract(st, entity); |
11370 |
++ else if (entity->tree == &st->idle) |
11371 |
++ bfq_idle_extract(st, entity); |
11372 |
++ else if (entity->tree != NULL) |
11373 |
++ BUG(); |
11374 |
++ |
11375 |
++ if (was_in_service || sd->next_in_service == entity) |
11376 |
++ ret = bfq_update_next_in_service(sd); |
11377 |
++ |
11378 |
++ if (!requeue || !bfq_gt(entity->finish, st->vtime)) |
11379 |
++ bfq_forget_entity(st, entity); |
11380 |
++ else |
11381 |
++ bfq_idle_insert(st, entity); |
11382 |
++ |
11383 |
++ BUG_ON(sd->in_service_entity == entity); |
11384 |
++ BUG_ON(sd->next_in_service == entity); |
11385 |
++ |
11386 |
++ return ret; |
11387 |
++} |
11388 |
++ |
11389 |
++/** |
11390 |
++ * bfq_deactivate_entity - deactivate an entity. |
11391 |
++ * @entity: the entity to deactivate. |
11392 |
++ * @requeue: true if the entity can be put on the idle tree |
11393 |
++ */ |
11394 |
++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
11395 |
++{ |
11396 |
++ struct bfq_sched_data *sd; |
11397 |
++ struct bfq_entity *parent; |
11398 |
++ |
11399 |
++ for_each_entity_safe(entity, parent) { |
11400 |
++ sd = entity->sched_data; |
11401 |
++ |
11402 |
++ if (!__bfq_deactivate_entity(entity, requeue)) |
11403 |
++ /* |
11404 |
++ * The parent entity is still backlogged, and |
11405 |
++ * we don't need to update it as it is still |
11406 |
++ * under service. |
11407 |
++ */ |
11408 |
++ break; |
11409 |
++ |
11410 |
++ if (sd->next_in_service != NULL) |
11411 |
++ /* |
11412 |
++ * The parent entity is still backlogged and |
11413 |
++ * the budgets on the path towards the root |
11414 |
++ * need to be updated. |
11415 |
++ */ |
11416 |
++ goto update; |
11417 |
++ |
11418 |
++ /* |
11419 |
++ * If we reach this point, the parent is no longer backlogged and |
11420 |
++ * we want to propagate the dequeue upwards. |
11421 |
++ */ |
11422 |
++ requeue = 1; |
11423 |
++ } |
11424 |
++ |
11425 |
++ return; |
11426 |
++ |
11427 |
++update: |
11428 |
++ entity = parent; |
11429 |
++ for_each_entity(entity) { |
11430 |
++ __bfq_activate_entity(entity); |
11431 |
++ |
11432 |
++ sd = entity->sched_data; |
11433 |
++ if (!bfq_update_next_in_service(sd)) |
11434 |
++ break; |
11435 |
++ } |
11436 |
++} |
11437 |
++ |
11438 |
++/** |
11439 |
++ * bfq_update_vtime - update vtime if necessary. |
11440 |
++ * @st: the service tree to act upon. |
11441 |
++ * |
11442 |
++ * If necessary update the service tree vtime to have at least one |
11443 |
++ * eligible entity, skipping to its start time. Assumes that the |
11444 |
++ * active tree of the device is not empty. |
11445 |
++ * |
11446 |
++ * NOTE: this hierarchical implementation updates vtimes quite often, |
11447 |
++ * we may end up with reactivated tasks getting timestamps after a |
11448 |
++ * vtime skip done because we needed a ->first_active entity on some |
11449 |
++ * intermediate node. |
11450 |
++ */ |
11451 |
++static void bfq_update_vtime(struct bfq_service_tree *st) |
11452 |
++{ |
11453 |
++ struct bfq_entity *entry; |
11454 |
++ struct rb_node *node = st->active.rb_node; |
11455 |
++ |
11456 |
++ entry = rb_entry(node, struct bfq_entity, rb_node); |
11457 |
++ if (bfq_gt(entry->min_start, st->vtime)) { |
11458 |
++ st->vtime = entry->min_start; |
11459 |
++ bfq_forget_idle(st); |
11460 |
++ } |
11461 |
++} |
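Stripped of the tree walk, the rule implemented here is simply: jump the virtual time forward to the smallest start time in the active subtree when nothing is eligible. A standalone sketch of just that rule; note the real code compares timestamps with bfq_gt(), which handles wrap-around, while plain '>' is used below only for brevity:

#include <stdio.h>
#include <stdint.h>

/* vtime-skip rule: if even the earliest entity is not yet eligible
 * (min_start > vtime), move the virtual time up to min_start. */
static uint64_t update_vtime(uint64_t vtime, uint64_t min_start)
{
	return min_start > vtime ? min_start : vtime;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)update_vtime(100, 250)); /* 250 */
	printf("%llu\n", (unsigned long long)update_vtime(300, 250)); /* 300 */
	return 0;
}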
11462 |
++ |
11463 |
++/** |
11464 |
++ * bfq_first_active_entity - find the eligible entity with |
11465 |
++ * the smallest finish time |
11466 |
++ * @st: the service tree to select from. |
11467 |
++ * |
11468 |
++ * This function searches the first schedulable entity, starting from the |
11469 |
++ * root of the tree and descending to the left whenever the left subtree |
11470 |
++ * contains at least one eligible (start <= vtime) entity. The path |
11471 |
++ * on the right is followed only if a) the left subtree contains no eligible |
11472 |
++ * entities and b) no eligible entity has been found yet. |
11473 |
++ */ |
11474 |
++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) |
11475 |
++{ |
11476 |
++ struct bfq_entity *entry, *first = NULL; |
11477 |
++ struct rb_node *node = st->active.rb_node; |
11478 |
++ |
11479 |
++ while (node != NULL) { |
11480 |
++ entry = rb_entry(node, struct bfq_entity, rb_node); |
11481 |
++left: |
11482 |
++ if (!bfq_gt(entry->start, st->vtime)) |
11483 |
++ first = entry; |
11484 |
++ |
11485 |
++ BUG_ON(bfq_gt(entry->min_start, st->vtime)); |
11486 |
++ |
11487 |
++ if (node->rb_left != NULL) { |
11488 |
++ entry = rb_entry(node->rb_left, |
11489 |
++ struct bfq_entity, rb_node); |
11490 |
++ if (!bfq_gt(entry->min_start, st->vtime)) { |
11491 |
++ node = node->rb_left; |
11492 |
++ goto left; |
11493 |
++ } |
11494 |
++ } |
11495 |
++ if (first != NULL) |
11496 |
++ break; |
11497 |
++ node = node->rb_right; |
11498 |
++ } |
11499 |
++ |
11500 |
++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); |
11501 |
++ return first; |
11502 |
++} |
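The same selection rule, eligibility first (start <= vtime) and then the smallest finish time, can be written as a linear scan; the rbtree plus ->min_start in the function above only makes that scan logarithmic. A flat-array sketch with illustrative types, not the kernel's:

#include <stdio.h>
#include <stdint.h>

struct ent { uint64_t start, finish; };

/* Among eligible entities (start <= vtime), pick the smallest finish time. */
static const struct ent *first_active(const struct ent *v, int n, uint64_t vtime)
{
	const struct ent *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (v[i].start > vtime)
			continue;		/* not eligible yet */
		if (best == NULL || v[i].finish < best->finish)
			best = &v[i];
	}
	return best;
}

int main(void)
{
	struct ent v[] = { { 10, 40 }, { 5, 30 }, { 50, 60 } };
	const struct ent *e = first_active(v, 3, 20);

	if (e != NULL)
		printf("picked finish=%llu\n", (unsigned long long)e->finish); /* 30 */
	return 0;
}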
11503 |
++ |
11504 |
++/** |
11505 |
++ * __bfq_lookup_next_entity - return the first eligible entity in @st. |
11506 |
++ * @st: the service tree. |
11507 |
++ * |
11508 |
++ * Update the virtual time in @st and return the first eligible entity |
11509 |
++ * it contains. |
11510 |
++ */ |
11511 |
++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, |
11512 |
++ bool force) |
11513 |
++{ |
11514 |
++ struct bfq_entity *entity, *new_next_in_service = NULL; |
11515 |
++ |
11516 |
++ if (RB_EMPTY_ROOT(&st->active)) |
11517 |
++ return NULL; |
11518 |
++ |
11519 |
++ bfq_update_vtime(st); |
11520 |
++ entity = bfq_first_active_entity(st); |
11521 |
++ BUG_ON(bfq_gt(entity->start, st->vtime)); |
11522 |
++ |
11523 |
++ /* |
11524 |
++ * If the chosen entity does not match the sched_data's |
11525 |
++ * next_in_service and we are forcibly serving the IDLE priority |
11526 |
++ * class tree, bubble up budget update. |
11527 |
++ */ |
11528 |
++ if (unlikely(force && entity != entity->sched_data->next_in_service)) { |
11529 |
++ new_next_in_service = entity; |
11530 |
++ for_each_entity(new_next_in_service) |
11531 |
++ bfq_update_budget(new_next_in_service); |
11532 |
++ } |
11533 |
++ |
11534 |
++ return entity; |
11535 |
++} |
11536 |
++ |
11537 |
++/** |
11538 |
++ * bfq_lookup_next_entity - return the first eligible entity in @sd. |
11539 |
++ * @sd: the sched_data. |
11540 |
++ * @extract: if true the returned entity will be also extracted from @sd. |
11541 |
++ * |
11542 |
++ * NOTE: since we cache the next_in_service entity at each level of the |
11543 |
++ * hierarchy, the complexity of the lookup can be decreased with |
11544 |
++ * absolutely no effort just returning the cached next_in_service value; |
11545 |
++ * we prefer to do full lookups to test the consistency of the data |
11546 |
++ * structures. |
11547 |
++ */ |
11548 |
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
11549 |
++ int extract, |
11550 |
++ struct bfq_data *bfqd) |
11551 |
++{ |
11552 |
++ struct bfq_service_tree *st = sd->service_tree; |
11553 |
++ struct bfq_entity *entity; |
11554 |
++ int i = 0; |
11555 |
++ |
11556 |
++ BUG_ON(sd->in_service_entity != NULL); |
11557 |
++ |
11558 |
++ if (bfqd != NULL && |
11559 |
++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { |
11560 |
++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, |
11561 |
++ true); |
11562 |
++ if (entity != NULL) { |
11563 |
++ i = BFQ_IOPRIO_CLASSES - 1; |
11564 |
++ bfqd->bfq_class_idle_last_service = jiffies; |
11565 |
++ sd->next_in_service = entity; |
11566 |
++ } |
11567 |
++ } |
11568 |
++ for (; i < BFQ_IOPRIO_CLASSES; i++) { |
11569 |
++ entity = __bfq_lookup_next_entity(st + i, false); |
11570 |
++ if (entity != NULL) { |
11571 |
++ if (extract) { |
11572 |
++ bfq_check_next_in_service(sd, entity); |
11573 |
++ bfq_active_extract(st + i, entity); |
11574 |
++ sd->in_service_entity = entity; |
11575 |
++ sd->next_in_service = NULL; |
11576 |
++ } |
11577 |
++ break; |
11578 |
++ } |
11579 |
++ } |
11580 |
++ |
11581 |
++ return entity; |
11582 |
++} |
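The control flow above is essentially a priority-ordered scan of the per-class service trees, preceded by an anti-starvation check for the idle class. A compact model of that ordering, with has_work[] standing in for the service trees and a boolean standing in for the BFQ_CL_IDLE_TIMEOUT test:

#include <stdio.h>
#include <stdbool.h>

enum { RT, BE, IDLE, NR_CLASSES };

static int pick_class(const bool has_work[NR_CLASSES], bool idle_class_starving)
{
	int c;

	/* Anti-starvation: serve the idle class once if it waited too long. */
	if (idle_class_starving && has_work[IDLE])
		return IDLE;

	for (c = RT; c < NR_CLASSES; c++)	/* RT first, then BE, then IDLE */
		if (has_work[c])
			return c;
	return -1;
}

int main(void)
{
	bool w[NR_CLASSES] = { false, true, true };

	printf("%d\n", pick_class(w, false));	/* 1: BE wins over IDLE */
	printf("%d\n", pick_class(w, true));	/* 2: IDLE served to avoid starvation */
	return 0;
}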
11583 |
++ |
11584 |
++/* |
11585 |
++ * Get next queue for service. |
11586 |
++ */ |
11587 |
++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) |
11588 |
++{ |
11589 |
++ struct bfq_entity *entity = NULL; |
11590 |
++ struct bfq_sched_data *sd; |
11591 |
++ struct bfq_queue *bfqq; |
11592 |
++ |
11593 |
++ BUG_ON(bfqd->in_service_queue != NULL); |
11594 |
++ |
11595 |
++ if (bfqd->busy_queues == 0) |
11596 |
++ return NULL; |
11597 |
++ |
11598 |
++ sd = &bfqd->root_group->sched_data; |
11599 |
++ for (; sd != NULL; sd = entity->my_sched_data) { |
11600 |
++ entity = bfq_lookup_next_entity(sd, 1, bfqd); |
11601 |
++ BUG_ON(entity == NULL); |
11602 |
++ entity->service = 0; |
11603 |
++ } |
11604 |
++ |
11605 |
++ bfqq = bfq_entity_to_bfqq(entity); |
11606 |
++ BUG_ON(bfqq == NULL); |
11607 |
++ |
11608 |
++ return bfqq; |
11609 |
++} |
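Queue selection is a top-down descent: at every level the per-level scheduler picks a winner, and the walk continues into that winner's own sched_data until a leaf bfq_queue is reached. A toy two-level hierarchy showing the shape of the loop; the "take the first child" policy below is only a placeholder for the real lookup:

#include <stdio.h>
#include <stddef.h>

struct entity {
	const char *name;
	struct entity **children;	/* NULL/0 for a leaf, i.e. a queue */
	int nr_children;
};

/* Placeholder for the per-level lookup: just take the first child. */
static struct entity *pick_child(struct entity *e)
{
	return e->children[0];
}

int main(void)
{
	struct entity q1 = { "bfqq-1", NULL, 0 };
	struct entity q2 = { "bfqq-2", NULL, 0 };
	struct entity *kids[] = { &q1, &q2 };
	struct entity root = { "root-group", kids, 2 };
	struct entity *e = &root;

	while (e->nr_children > 0)	/* descend until a leaf queue is reached */
		e = pick_child(e);
	printf("next queue: %s\n", e->name);
	return 0;
}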
11610 |
++ |
11611 |
++/* |
11612 |
++ * Forced extraction of the given queue. |
11613 |
++ */ |
11614 |
++static void bfq_get_next_queue_forced(struct bfq_data *bfqd, |
11615 |
++ struct bfq_queue *bfqq) |
11616 |
++{ |
11617 |
++ struct bfq_entity *entity; |
11618 |
++ struct bfq_sched_data *sd; |
11619 |
++ |
11620 |
++ BUG_ON(bfqd->in_service_queue != NULL); |
11621 |
++ |
11622 |
++ entity = &bfqq->entity; |
11623 |
++ /* |
11624 |
++ * Bubble up extraction/update from the leaf to the root. |
11625 |
++ */ |
11626 |
++ for_each_entity(entity) { |
11627 |
++ sd = entity->sched_data; |
11628 |
++ bfq_update_budget(entity); |
11629 |
++ bfq_update_vtime(bfq_entity_service_tree(entity)); |
11630 |
++ bfq_active_extract(bfq_entity_service_tree(entity), entity); |
11631 |
++ sd->active_entity = entity; |
11632 |
++ sd->next_active = NULL; |
11633 |
++ entity->service = 0; |
11634 |
++ } |
11635 |
++ |
11636 |
++ return; |
11637 |
++} |
11638 |
++ |
11639 |
++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) |
11640 |
++{ |
11641 |
++ if (bfqd->in_service_bic != NULL) { |
11642 |
++ put_io_context(bfqd->in_service_bic->icq.ioc); |
11643 |
++ bfqd->in_service_bic = NULL; |
11644 |
++ } |
11645 |
++ |
11646 |
++ bfqd->in_service_queue = NULL; |
11647 |
++ del_timer(&bfqd->idle_slice_timer); |
11648 |
++} |
11649 |
++ |
11650 |
++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
11651 |
++ int requeue) |
11652 |
++{ |
11653 |
++ struct bfq_entity *entity = &bfqq->entity; |
11654 |
++ |
11655 |
++ if (bfqq == bfqd->in_service_queue) |
11656 |
++ __bfq_bfqd_reset_in_service(bfqd); |
11657 |
++ |
11658 |
++ bfq_deactivate_entity(entity, requeue); |
11659 |
++} |
11660 |
++ |
11661 |
++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
11662 |
++{ |
11663 |
++ struct bfq_entity *entity = &bfqq->entity; |
11664 |
++ |
11665 |
++ bfq_activate_entity(entity); |
11666 |
++} |
11667 |
++ |
11668 |
++/* |
11669 |
++ * Called when the bfqq no longer has requests pending, remove it from |
11670 |
++ * the service tree. |
11671 |
++ */ |
11672 |
++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
11673 |
++ int requeue) |
11674 |
++{ |
11675 |
++ BUG_ON(!bfq_bfqq_busy(bfqq)); |
11676 |
++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
11677 |
++ |
11678 |
++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); |
11679 |
++ |
11680 |
++ bfq_clear_bfqq_busy(bfqq); |
11681 |
++ |
11682 |
++ BUG_ON(bfqd->busy_queues == 0); |
11683 |
++ bfqd->busy_queues--; |
11684 |
++ if (bfqq->raising_coeff > 1) |
11685 |
++ bfqd->raised_busy_queues--; |
11686 |
++ |
11687 |
++ bfq_deactivate_bfqq(bfqd, bfqq, requeue); |
11688 |
++} |
11689 |
++ |
11690 |
++/* |
11691 |
++ * Called when an inactive queue receives a new request. |
11692 |
++ */ |
11693 |
++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
11694 |
++{ |
11695 |
++ BUG_ON(bfq_bfqq_busy(bfqq)); |
11696 |
++ BUG_ON(bfqq == bfqd->in_service_queue); |
11697 |
++ |
11698 |
++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); |
11699 |
++ |
11700 |
++ bfq_activate_bfqq(bfqd, bfqq); |
11701 |
++ |
11702 |
++ bfq_mark_bfqq_busy(bfqq); |
11703 |
++ bfqd->busy_queues++; |
11704 |
++ if (bfqq->raising_coeff > 1) |
11705 |
++ bfqd->raised_busy_queues++; |
11706 |
++} |
11707 |
+diff --git a/block/bfq.h b/block/bfq.h |
11708 |
+new file mode 100644 |
11709 |
+index 0000000..f9b5881 |
11710 |
+--- /dev/null |
11711 |
++++ b/block/bfq.h |
11712 |
+@@ -0,0 +1,614 @@ |
11713 |
++/* |
11714 |
++ * BFQ-v7r1 for 3.13.0: data structures and common functions prototypes. |
11715 |
++ * |
11716 |
++ * Based on ideas and code from CFQ: |
11717 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
11718 |
++ * |
11719 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
11720 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
11721 |
++ * |
11722 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
11723 |
++ */ |
11724 |
++ |
11725 |
++#ifndef _BFQ_H |
11726 |
++#define _BFQ_H |
11727 |
++ |
11728 |
++#include <linux/blktrace_api.h> |
11729 |
++#include <linux/hrtimer.h> |
11730 |
++#include <linux/ioprio.h> |
11731 |
++#include <linux/rbtree.h> |
11732 |
++ |
11733 |
++#define BFQ_IOPRIO_CLASSES 3 |
11734 |
++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) |
11735 |
++ |
11736 |
++#define BFQ_MIN_WEIGHT 1 |
11737 |
++#define BFQ_MAX_WEIGHT 1000 |
11738 |
++ |
11739 |
++#define BFQ_DEFAULT_GRP_WEIGHT 10 |
11740 |
++#define BFQ_DEFAULT_GRP_IOPRIO 0 |
11741 |
++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE |
11742 |
++ |
11743 |
++struct bfq_entity; |
11744 |
++ |
11745 |
++/** |
11746 |
++ * struct bfq_service_tree - per ioprio_class service tree. |
11747 |
++ * @active: tree for active entities (i.e., those backlogged). |
11748 |
++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). |
11749 |
++ * @first_idle: idle entity with minimum F_i. |
11750 |
++ * @last_idle: idle entity with maximum F_i. |
11751 |
++ * @vtime: scheduler virtual time. |
11752 |
++ * @wsum: scheduler weight sum; active and idle entities contribute to it. |
11753 |
++ * |
11754 |
++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each |
11755 |
++ * ioprio_class has its own independent scheduler, and so its own |
11756 |
++ * bfq_service_tree. All the fields are protected by the queue lock |
11757 |
++ * of the containing bfqd. |
11758 |
++ */ |
11759 |
++struct bfq_service_tree { |
11760 |
++ struct rb_root active; |
11761 |
++ struct rb_root idle; |
11762 |
++ |
11763 |
++ struct bfq_entity *first_idle; |
11764 |
++ struct bfq_entity *last_idle; |
11765 |
++ |
11766 |
++ u64 vtime; |
11767 |
++ unsigned long wsum; |
11768 |
++}; |
11769 |
++ |
11770 |
++/** |
11771 |
++ * struct bfq_sched_data - multi-class scheduler. |
11772 |
++ * @in_service_entity: entity under service. |
11773 |
++ * @next_in_service: head-of-the-line entity in the scheduler. |
11774 |
++ * @service_tree: array of service trees, one per ioprio_class. |
11775 |
++ * |
11776 |
++ * bfq_sched_data is the basic scheduler queue. It supports three |
11777 |
++ * ioprio_classes, and can be used either as a toplevel queue or as |
11778 |
++ * an intermediate queue on a hierarchical setup. |
11779 |
++ * @next_in_service points to the active entity of the sched_data |
11780 |
++ * service trees that will be scheduled next. |
11781 |
++ * |
11782 |
++ * The supported ioprio_classes are the same as in CFQ, in descending |
11783 |
++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. |
11784 |
++ * Requests from higher priority queues are served before all the |
11785 |
++ * requests from lower priority queues; among requests of the same |
11786 |
++ * queue requests are served according to B-WF2Q+. |
11787 |
++ * All the fields are protected by the queue lock of the containing bfqd. |
11788 |
++ */ |
11789 |
++struct bfq_sched_data { |
11790 |
++ struct bfq_entity *in_service_entity; |
11791 |
++ struct bfq_entity *next_in_service; |
11792 |
++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; |
11793 |
++}; |
11794 |
++ |
11795 |
++/** |
11796 |
++ * struct bfq_entity - schedulable entity. |
11797 |
++ * @rb_node: service_tree member. |
11798 |
++ * @on_st: flag, true if the entity is on a tree (either the active or |
11799 |
++ * the idle one of its service_tree). |
11800 |
++ * @finish: B-WF2Q+ finish timestamp (aka F_i). |
11801 |
++ * @start: B-WF2Q+ start timestamp (aka S_i). |
11802 |
++ * @tree: tree the entity is enqueued into; %NULL if not on a tree. |
11803 |
++ * @min_start: minimum start time of the (active) subtree rooted at |
11804 |
++ * this entity; used for O(log N) lookups into active trees. |
11805 |
++ * @service: service received during the last round of service. |
11806 |
++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. |
11807 |
++ * @weight: weight of the queue |
11808 |
++ * @parent: parent entity, for hierarchical scheduling. |
11809 |
++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the |
11810 |
++ * associated scheduler queue, %NULL on leaf nodes. |
11811 |
++ * @sched_data: the scheduler queue this entity belongs to. |
11812 |
++ * @ioprio: the ioprio in use. |
11813 |
++ * @new_weight: when a weight change is requested, the new weight value. |
11814 |
++ * @orig_weight: original weight, used to implement weight boosting |
11815 |
++ * @new_ioprio: when an ioprio change is requested, the new ioprio value. |
11816 |
++ * @ioprio_class: the ioprio_class in use. |
11817 |
++ * @new_ioprio_class: when an ioprio_class change is requested, the new |
11818 |
++ * ioprio_class value. |
11819 |
++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or |
11820 |
++ * ioprio_class change. |
11821 |
++ * |
11822 |
++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the |
11823 |
++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each |
11824 |
++ * entity belongs to the sched_data of the parent group in the cgroup |
11825 |
++ * hierarchy. Non-leaf entities have also their own sched_data, stored |
11826 |
++ * in @my_sched_data. |
11827 |
++ * |
11828 |
++ * Each entity stores independently its priority values; this would |
11829 |
++ * allow different weights on different devices, but this |
11830 |
++ * functionality is not yet exported to userspace. Priorities and |
11831 |
++ * weights are updated lazily, first storing the new values into the |
11832 |
++ * new_* fields, then setting the @ioprio_changed flag. As soon as |
11833 |
++ * there is a transition in the entity state that allows the priority |
11834 |
++ * update to take place the effective and the requested priority |
11835 |
++ * values are synchronized. |
11836 |
++ * |
11837 |
++ * Unless cgroups are used, the weight value is calculated from the |
11838 |
++ * ioprio to export the same interface as CFQ. When dealing with |
11839 |
++ * ``well-behaved'' queues (i.e., queues that do not spend too much |
11840 |
++ * time consuming their budget and have true sequential behavior, and |
11841 |
++ * when there are no external factors breaking anticipation) the |
11842 |
++ * relative weights at each level of the cgroups hierarchy should be |
11843 |
++ * guaranteed. All the fields are protected by the queue lock of the |
11844 |
++ * containing bfqd. |
11845 |
++ */ |
11846 |
++struct bfq_entity { |
11847 |
++ struct rb_node rb_node; |
11848 |
++ |
11849 |
++ int on_st; |
11850 |
++ |
11851 |
++ u64 finish; |
11852 |
++ u64 start; |
11853 |
++ |
11854 |
++ struct rb_root *tree; |
11855 |
++ |
11856 |
++ u64 min_start; |
11857 |
++ |
11858 |
++ unsigned long service, budget; |
11859 |
++ unsigned short weight, new_weight; |
11860 |
++ unsigned short orig_weight; |
11861 |
++ |
11862 |
++ struct bfq_entity *parent; |
11863 |
++ |
11864 |
++ struct bfq_sched_data *my_sched_data; |
11865 |
++ struct bfq_sched_data *sched_data; |
11866 |
++ |
11867 |
++ unsigned short ioprio, new_ioprio; |
11868 |
++ unsigned short ioprio_class, new_ioprio_class; |
11869 |
++ |
11870 |
++ int ioprio_changed; |
11871 |
++}; |
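The new_weight/new_ioprio/ioprio_changed fields implement the lazy update described in the comment: user requests only park new values, and they are committed at a safe scheduling transition. A self-contained sketch of that pattern, with the commit point simplified to an explicit call:

#include <stdio.h>

struct entity {
	unsigned short weight, new_weight;
	unsigned short ioprio, new_ioprio;
	int ioprio_changed;
};

static void request_weight(struct entity *e, unsigned short w)
{
	e->new_weight = w;
	e->ioprio_changed = 1;		/* picked up lazily, not applied here */
}

static void commit_if_needed(struct entity *e)
{
	if (!e->ioprio_changed)
		return;
	e->weight = e->new_weight;
	e->ioprio = e->new_ioprio;
	e->ioprio_changed = 0;
}

int main(void)
{
	struct entity e = { .weight = 100, .new_weight = 100,
			    .ioprio = 4, .new_ioprio = 4 };

	request_weight(&e, 300);
	printf("before commit: %u\n", e.weight);	/* still 100 */
	commit_if_needed(&e);				/* e.g. on (re)activation */
	printf("after commit:  %u\n", e.weight);	/* 300 */
	return 0;
}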
11872 |
++ |
11873 |
++struct bfq_group; |
11874 |
++ |
11875 |
++/** |
11876 |
++ * struct bfq_queue - leaf schedulable entity. |
11877 |
++ * @ref: reference counter. |
11878 |
++ * @bfqd: parent bfq_data. |
11879 |
++ * @new_bfqq: shared bfq_queue if queue is cooperating with |
11880 |
++ * one or more other queues. |
11881 |
++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). |
11882 |
++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). |
11883 |
++ * @sort_list: sorted list of pending requests. |
11884 |
++ * @next_rq: if fifo isn't expired, next request to serve. |
11885 |
++ * @queued: nr of requests queued in @sort_list. |
11886 |
++ * @allocated: currently allocated requests. |
11887 |
++ * @meta_pending: pending metadata requests. |
11888 |
++ * @fifo: fifo list of requests in sort_list. |
11889 |
++ * @entity: entity representing this queue in the scheduler. |
11890 |
++ * @max_budget: maximum budget allowed from the feedback mechanism. |
11891 |
++ * @budget_timeout: budget expiration (in jiffies). |
11892 |
++ * @dispatched: number of requests on the dispatch list or inside driver. |
11893 |
++ * @org_ioprio: saved ioprio during boosted periods. |
11894 |
++ * @flags: status flags. |
11895 |
++ * @bfqq_list: node for active/idle bfqq list inside our bfqd. |
11896 |
++ * @seek_samples: number of seeks sampled |
11897 |
++ * @seek_total: sum of the distances of the seeks sampled |
11898 |
++ * @seek_mean: mean seek distance |
11899 |
++ * @last_request_pos: position of the last request enqueued |
11900 |
++ * @pid: pid of the process owning the queue, used for logging purposes. |
11901 |
++ * @last_rais_start_time: last (idle -> weight-raised) transition attempt |
11902 |
++ * @raising_cur_max_time: current max raising time for this queue |
11903 |
++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from |
11904 |
++ * idle to backlogged |
11905 |
++ * @service_from_backlogged: cumulative service received from the @bfq_queue |
11906 |
++ * since the last transition from idle to backlogged |
11907 |
++ * |
11908 |
++ * A bfq_queue is a leaf request queue; it can be associated with one io_context |
11909 |
++ * or more (if it is an async one). @cgroup holds a reference to the |
11910 |
++ * cgroup, to be sure that it does not disappear while a bfqq still |
11911 |
++ * references it (mostly to avoid races between request issuing and task |
11912 |
++ * migration followed by cgroup destruction). |
11913 |
++ * All the fields are protected by the queue lock of the containing bfqd. |
11914 |
++ */ |
11915 |
++struct bfq_queue { |
11916 |
++ atomic_t ref; |
11917 |
++ struct bfq_data *bfqd; |
11918 |
++ |
11919 |
++ /* fields for cooperating queues handling */ |
11920 |
++ struct bfq_queue *new_bfqq; |
11921 |
++ struct rb_node pos_node; |
11922 |
++ struct rb_root *pos_root; |
11923 |
++ |
11924 |
++ struct rb_root sort_list; |
11925 |
++ struct request *next_rq; |
11926 |
++ int queued[2]; |
11927 |
++ int allocated[2]; |
11928 |
++ int meta_pending; |
11929 |
++ struct list_head fifo; |
11930 |
++ |
11931 |
++ struct bfq_entity entity; |
11932 |
++ |
11933 |
++ unsigned long max_budget; |
11934 |
++ unsigned long budget_timeout; |
11935 |
++ |
11936 |
++ int dispatched; |
11937 |
++ |
11938 |
++ unsigned short org_ioprio; |
11939 |
++ |
11940 |
++ unsigned int flags; |
11941 |
++ |
11942 |
++ struct list_head bfqq_list; |
11943 |
++ |
11944 |
++ unsigned int seek_samples; |
11945 |
++ u64 seek_total; |
11946 |
++ sector_t seek_mean; |
11947 |
++ sector_t last_request_pos; |
11948 |
++ |
11949 |
++ pid_t pid; |
11950 |
++ |
11951 |
++ /* weight-raising fields */ |
11952 |
++ unsigned long raising_cur_max_time; |
11953 |
++ unsigned long soft_rt_next_start; |
11954 |
++ unsigned long last_rais_start_finish; |
11955 |
++ unsigned int raising_coeff; |
11956 |
++ unsigned long last_idle_bklogged; |
11957 |
++ unsigned long service_from_backlogged; |
11958 |
++}; |
11959 |
++ |
11960 |
++/** |
11961 |
++ * struct bfq_ttime - per process thinktime stats. |
11962 |
++ * @ttime_total: total process thinktime |
11963 |
++ * @ttime_samples: number of thinktime samples |
11964 |
++ * @ttime_mean: average process thinktime |
11965 |
++ */ |
11966 |
++struct bfq_ttime { |
11967 |
++ unsigned long last_end_request; |
11968 |
++ |
11969 |
++ unsigned long ttime_total; |
11970 |
++ unsigned long ttime_samples; |
11971 |
++ unsigned long ttime_mean; |
11972 |
++}; |
11973 |
++ |
11974 |
++/** |
11975 |
++ * struct bfq_io_cq - per (request_queue, io_context) structure. |
11976 |
++ * @icq: associated io_cq structure |
11977 |
++ * @bfqq: array of two process queues, the sync and the async |
11978 |
++ * @ttime: associated @bfq_ttime struct |
11979 |
++ */ |
11980 |
++struct bfq_io_cq { |
11981 |
++ struct io_cq icq; /* must be the first member */ |
11982 |
++ struct bfq_queue *bfqq[2]; |
11983 |
++ struct bfq_ttime ttime; |
11984 |
++ int ioprio; |
11985 |
++}; |
11986 |
++ |
11987 |
++/** |
11988 |
++ * struct bfq_data - per device data structure. |
11989 |
++ * @queue: request queue for the managed device. |
11990 |
++ * @root_group: root bfq_group for the device. |
11991 |
++ * @rq_pos_tree: rbtree sorted by next_request position, |
11992 |
++ * used when determining if two or more queues |
11993 |
++ * have interleaving requests (see bfq_close_cooperator). |
11994 |
++ * @busy_queues: number of bfq_queues containing requests (including the |
11995 |
++ * queue under service, even if it is idling). |
11996 |
++ * @raised_busy_queues: number of weight-raised busy bfq_queues. |
11997 |
++ * @queued: number of queued requests. |
11998 |
++ * @rq_in_driver: number of requests dispatched and waiting for completion. |
11999 |
++ * @sync_flight: number of sync requests in the driver. |
12000 |
++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples |
12001 |
++ * completed requests. |
12002 |
++ * @hw_tag_samples: nr of samples used to calculate hw_tag. |
12003 |
++ * @hw_tag: flag set to one if the driver is showing a queueing behavior. |
12004 |
++ * @budgets_assigned: number of budgets assigned. |
12005 |
++ * @idle_slice_timer: timer set when idling for the next sequential request |
12006 |
++ * from the queue under service. |
12007 |
++ * @unplug_work: delayed work to restart dispatching on the request queue. |
12008 |
++ * @in_service_queue: bfq_queue under service. |
12009 |
++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. |
12010 |
++ * @last_position: on-disk position of the last served request. |
12011 |
++ * @last_budget_start: beginning of the last budget. |
12012 |
++ * @last_idling_start: beginning of the last idle slice. |
12013 |
++ * @peak_rate: peak transfer rate observed for a budget. |
12014 |
++ * @peak_rate_samples: number of samples used to calculate @peak_rate. |
12015 |
++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. |
12016 |
++ * @group_list: list of all the bfq_groups active on the device. |
12017 |
++ * @active_list: list of all the bfq_queues active on the device. |
12018 |
++ * @idle_list: list of all the bfq_queues idle on the device. |
12019 |
++ * @bfq_quantum: max number of requests dispatched per dispatch round. |
12020 |
++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires |
12021 |
++ * requests are served in fifo order. |
12022 |
++ * @bfq_back_penalty: weight of backward seeks wrt forward ones. |
12023 |
++ * @bfq_back_max: maximum allowed backward seek. |
12024 |
++ * @bfq_slice_idle: maximum idling time. |
12025 |
++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). |
12026 |
++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to |
12027 |
++ * async queues. |
12028 |
++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to |
12029 |
++ * prevent seeky queues from imposing long latencies on well |
12030 |
++ * behaved ones (this also implies that seeky queues cannot |
12031 |
++ * receive guarantees in the service domain; after a timeout |
12032 |
++ * they are charged for the whole allocated budget, to try |
12033 |
++ * to preserve a behavior reasonably fair among them, but |
12034 |
++ * without service-domain guarantees). |
12035 |
++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted |
12036 |
++ * queue is multiplied |
12037 |
++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) |
12038 |
++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes |
12039 |
++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising |
12040 |
++ * may be reactivated for a queue (in jiffies) |
12041 |
++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals |
12042 |
++ * after which weight-raising may be |
12043 |
++ * reactivated for an already busy queue |
12044 |
++ * (in jiffies) |
12045 |
++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, |
12046 |
++ * in sectors per second |
12047 |
++ * @RT_prod: cached value of the product R*T used for computing the maximum |
12048 |
++ * duration of the weight raising automatically |
12049 |
++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions |
12050 |
++ * |
12051 |
++ * All the fields are protected by the @queue lock. |
12052 |
++ */ |
12053 |
++struct bfq_data { |
12054 |
++ struct request_queue *queue; |
12055 |
++ |
12056 |
++ struct bfq_group *root_group; |
12057 |
++ |
12058 |
++ struct rb_root rq_pos_tree; |
12059 |
++ |
12060 |
++ int busy_queues; |
12061 |
++ int raised_busy_queues; |
12062 |
++ int queued; |
12063 |
++ int rq_in_driver; |
12064 |
++ int sync_flight; |
12065 |
++ |
12066 |
++ int max_rq_in_driver; |
12067 |
++ int hw_tag_samples; |
12068 |
++ int hw_tag; |
12069 |
++ |
12070 |
++ int budgets_assigned; |
12071 |
++ |
12072 |
++ struct timer_list idle_slice_timer; |
12073 |
++ struct work_struct unplug_work; |
12074 |
++ |
12075 |
++ struct bfq_queue *in_service_queue; |
12076 |
++ struct bfq_io_cq *in_service_bic; |
12077 |
++ |
12078 |
++ sector_t last_position; |
12079 |
++ |
12080 |
++ ktime_t last_budget_start; |
12081 |
++ ktime_t last_idling_start; |
12082 |
++ int peak_rate_samples; |
12083 |
++ u64 peak_rate; |
12084 |
++ unsigned long bfq_max_budget; |
12085 |
++ |
12086 |
++ struct hlist_head group_list; |
12087 |
++ struct list_head active_list; |
12088 |
++ struct list_head idle_list; |
12089 |
++ |
12090 |
++ unsigned int bfq_quantum; |
12091 |
++ unsigned int bfq_fifo_expire[2]; |
12092 |
++ unsigned int bfq_back_penalty; |
12093 |
++ unsigned int bfq_back_max; |
12094 |
++ unsigned int bfq_slice_idle; |
12095 |
++ u64 bfq_class_idle_last_service; |
12096 |
++ |
12097 |
++ unsigned int bfq_user_max_budget; |
12098 |
++ unsigned int bfq_max_budget_async_rq; |
12099 |
++ unsigned int bfq_timeout[2]; |
12100 |
++ |
12101 |
++ bool low_latency; |
12102 |
++ |
12103 |
++ /* parameters of the low_latency heuristics */ |
12104 |
++ unsigned int bfq_raising_coeff; |
12105 |
++ unsigned int bfq_raising_max_time; |
12106 |
++ unsigned int bfq_raising_rt_max_time; |
12107 |
++ unsigned int bfq_raising_min_idle_time; |
12108 |
++ unsigned long bfq_raising_min_inter_arr_async; |
12109 |
++ unsigned int bfq_raising_max_softrt_rate; |
12110 |
++ u64 RT_prod; |
12111 |
++ |
12112 |
++ struct bfq_queue oom_bfqq; |
12113 |
++}; |
12114 |
++ |
12115 |
++enum bfqq_state_flags { |
12116 |
++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ |
12117 |
++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ |
12118 |
++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ |
12119 |
++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ |
12120 |
++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ |
12121 |
++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ |
12122 |
++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ |
12123 |
++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ |
12124 |
++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ |
12125 |
++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ |
12126 |
++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */ |
12127 |
++}; |
12128 |
++ |
12129 |
++#define BFQ_BFQQ_FNS(name) \ |
12130 |
++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ |
12131 |
++{ \ |
12132 |
++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ |
12133 |
++} \ |
12134 |
++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ |
12135 |
++{ \ |
12136 |
++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ |
12137 |
++} \ |
12138 |
++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ |
12139 |
++{ \ |
12140 |
++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ |
12141 |
++} |
12142 |
++ |
12143 |
++BFQ_BFQQ_FNS(busy); |
12144 |
++BFQ_BFQQ_FNS(wait_request); |
12145 |
++BFQ_BFQQ_FNS(must_alloc); |
12146 |
++BFQ_BFQQ_FNS(fifo_expire); |
12147 |
++BFQ_BFQQ_FNS(idle_window); |
12148 |
++BFQ_BFQQ_FNS(prio_changed); |
12149 |
++BFQ_BFQQ_FNS(sync); |
12150 |
++BFQ_BFQQ_FNS(budget_new); |
12151 |
++BFQ_BFQQ_FNS(coop); |
12152 |
++BFQ_BFQQ_FNS(split_coop); |
12153 |
++BFQ_BFQQ_FNS(softrt_update); |
12154 |
++#undef BFQ_BFQQ_FNS |
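BFQ_BFQQ_FNS() stamps out a mark/clear/test helper triple per flag bit. The same pattern, compiled on its own outside the kernel, looks like the sketch below; the struct and flag names are invented for the demo:

#include <stdio.h>

struct q { unsigned int flags; };

enum q_flags { Q_FLAG_busy = 0, Q_FLAG_sync };

#define Q_FNS(name)							\
static inline void q_mark_##name(struct q *q)				\
{ q->flags |= (1U << Q_FLAG_##name); }					\
static inline void q_clear_##name(struct q *q)				\
{ q->flags &= ~(1U << Q_FLAG_##name); }					\
static inline int q_is_##name(const struct q *q)			\
{ return (q->flags & (1U << Q_FLAG_##name)) != 0; }

Q_FNS(busy)
Q_FNS(sync)
#undef Q_FNS

int main(void)
{
	struct q q = { 0 };

	q_mark_busy(&q);
	printf("busy=%d sync=%d\n", q_is_busy(&q), q_is_sync(&q));	/* 1 0 */
	q_clear_busy(&q);
	printf("busy=%d\n", q_is_busy(&q));				/* 0 */
	return 0;
}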
12155 |
++ |
12156 |
++/* Logging facilities. */ |
12157 |
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ |
12158 |
++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) |
12159 |
++ |
12160 |
++#define bfq_log(bfqd, fmt, args...) \ |
12161 |
++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) |
12162 |
++ |
12163 |
++/* Expiration reasons. */ |
12164 |
++enum bfqq_expiration { |
12165 |
++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ |
12166 |
++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ |
12167 |
++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ |
12168 |
++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ |
12169 |
++}; |
12170 |
++ |
12171 |
++#ifdef CONFIG_CGROUP_BFQIO |
12172 |
++/** |
12173 |
++ * struct bfq_group - per (device, cgroup) data structure. |
12174 |
++ * @entity: schedulable entity to insert into the parent group sched_data. |
12175 |
++ * @sched_data: own sched_data, to contain child entities (they may be |
12176 |
++ * both bfq_queues and bfq_groups). |
12177 |
++ * @group_node: node to be inserted into the bfqio_cgroup->group_data |
12178 |
++ * list of the containing cgroup's bfqio_cgroup. |
12179 |
++ * @bfqd_node: node to be inserted into the @bfqd->group_list list |
12180 |
++ * of the groups active on the same device; used for cleanup. |
12181 |
++ * @bfqd: the bfq_data for the device this group acts upon. |
12182 |
++ * @async_bfqq: array of async queues for all the tasks belonging to |
12183 |
++ * the group, one queue per ioprio value per ioprio_class, |
12184 |
++ * except for the idle class that has only one queue. |
12185 |
++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). |
12186 |
++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used |
12187 |
++ * to avoid too many special cases during group creation/migration. |
12188 |
++ * |
12189 |
++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup |
12190 |
++ * there is a set of bfq_groups, each one collecting the lower-level |
12191 |
++ * entities belonging to the group that are acting on the same device. |
12192 |
++ * |
12193 |
++ * Locking works as follows: |
12194 |
++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed |
12195 |
++ * via RCU from its readers. |
12196 |
++ * o @bfqd is protected by the queue lock, RCU is used to access it |
12197 |
++ * from the readers. |
12198 |
++ * o All the other fields are protected by the @bfqd queue lock. |
12199 |
++ */ |
12200 |
++struct bfq_group { |
12201 |
++ struct bfq_entity entity; |
12202 |
++ struct bfq_sched_data sched_data; |
12203 |
++ |
12204 |
++ struct hlist_node group_node; |
12205 |
++ struct hlist_node bfqd_node; |
12206 |
++ |
12207 |
++ void *bfqd; |
12208 |
++ |
12209 |
++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
12210 |
++ struct bfq_queue *async_idle_bfqq; |
12211 |
++ |
12212 |
++ struct bfq_entity *my_entity; |
12213 |
++}; |
12214 |
++ |
12215 |
++/** |
12216 |
++ * struct bfqio_cgroup - bfq cgroup data structure. |
12217 |
++ * @css: subsystem state for bfq in the containing cgroup. |
12218 |
++ * @online: flag marked when the subsystem is inserted. |
12219 |
++ * @weight: cgroup weight. |
12220 |
++ * @ioprio: cgroup ioprio. |
12221 |
++ * @ioprio_class: cgroup ioprio_class. |
12222 |
++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. |
12223 |
++ * @group_data: list containing the bfq_group belonging to this cgroup. |
12224 |
++ * |
12225 |
++ * @group_data is accessed using RCU, with @lock protecting the updates; |
12226 |
++ * @ioprio and @ioprio_class are protected by @lock. |
12227 |
++ */ |
12228 |
++struct bfqio_cgroup { |
12229 |
++ struct cgroup_subsys_state css; |
12230 |
++ bool online; |
12231 |
++ |
12232 |
++ unsigned short weight, ioprio, ioprio_class; |
12233 |
++ |
12234 |
++ spinlock_t lock; |
12235 |
++ struct hlist_head group_data; |
12236 |
++}; |
12237 |
++#else |
12238 |
++struct bfq_group { |
12239 |
++ struct bfq_sched_data sched_data; |
12240 |
++ |
12241 |
++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
12242 |
++ struct bfq_queue *async_idle_bfqq; |
12243 |
++}; |
12244 |
++#endif |
12245 |
++ |
12246 |
++static inline struct bfq_service_tree * |
12247 |
++bfq_entity_service_tree(struct bfq_entity *entity) |
12248 |
++{ |
12249 |
++ struct bfq_sched_data *sched_data = entity->sched_data; |
12250 |
++ unsigned int idx = entity->ioprio_class - 1; |
12251 |
++ |
12252 |
++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); |
12253 |
++ BUG_ON(sched_data == NULL); |
12254 |
++ |
12255 |
++ return sched_data->service_tree + idx; |
12256 |
++} |
12257 |
++ |
12258 |
++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, |
12259 |
++ int is_sync) |
12260 |
++{ |
12261 |
++ return bic->bfqq[!!is_sync]; |
12262 |
++} |
12263 |
++ |
12264 |
++static inline void bic_set_bfqq(struct bfq_io_cq *bic, |
12265 |
++ struct bfq_queue *bfqq, int is_sync) |
12266 |
++{ |
12267 |
++ bic->bfqq[!!is_sync] = bfqq; |
12268 |
++} |
12269 |
++ |
12270 |
++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) |
12271 |
++{ |
12272 |
++ return bic->icq.q->elevator->elevator_data; |
12273 |
++} |
12274 |
++ |
12275 |
++/** |
12276 |
++ * bfq_get_bfqd_locked - lock a bfqd using an RCU-protected pointer. |
12277 |
++ * @ptr: a pointer to a bfqd. |
12278 |
++ * @flags: storage for the flags to be saved. |
12279 |
++ * |
12280 |
++ * This function allows bfqg->bfqd to be protected by the |
12281 |
++ * queue lock of the bfqd they reference; the pointer is dereferenced |
12282 |
++ * under RCU, so the storage for bfqd is assured to be safe as long |
12283 |
++ * as the RCU read side critical section does not end. After the |
12284 |
++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be |
12285 |
++ * sure that no other writer accessed it. If we raced with a writer, |
12286 |
++ * the function returns NULL, with the queue unlocked, otherwise it |
12287 |
++ * returns the dereferenced pointer, with the queue locked. |
12288 |
++ */ |
12289 |
++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, |
12290 |
++ unsigned long *flags) |
12291 |
++{ |
12292 |
++ struct bfq_data *bfqd; |
12293 |
++ |
12294 |
++ rcu_read_lock(); |
12295 |
++ bfqd = rcu_dereference(*(struct bfq_data **)ptr); |
12296 |
++ |
12297 |
++ if (bfqd != NULL) { |
12298 |
++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); |
12299 |
++ if (*ptr == bfqd) |
12300 |
++ goto out; |
12301 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
12302 |
++ } |
12303 |
++ |
12304 |
++ bfqd = NULL; |
12305 |
++out: |
12306 |
++ rcu_read_unlock(); |
12307 |
++ return bfqd; |
12308 |
++} |
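The function above follows a dereference, lock, re-check pattern: snapshot the pointer under RCU, take the queue lock of the object it points to, and bail out if a writer swapped the pointer in the meantime. A much-simplified userspace analogue using a plain mutex; it omits the RCU read-side section that, in the kernel version, keeps the object alive between the snapshot and the lock (build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct obj { pthread_mutex_t lock; int data; };

/* *slot may be swapped to another object (or NULL) by a concurrent writer. */
static struct obj *get_locked(struct obj * volatile *slot)
{
	struct obj *o = *slot;			/* unlocked snapshot */

	if (o == NULL)
		return NULL;
	pthread_mutex_lock(&o->lock);
	if (*slot == o)				/* nobody swapped it meanwhile */
		return o;			/* returned with the lock held */
	pthread_mutex_unlock(&o->lock);		/* raced with a writer: give up */
	return NULL;
}

int main(void)
{
	struct obj a = { PTHREAD_MUTEX_INITIALIZER, 42 };
	struct obj * volatile slot = &a;
	struct obj *o = get_locked(&slot);

	if (o != NULL) {
		printf("data=%d\n", o->data);
		pthread_mutex_unlock(&o->lock);
	}
	return 0;
}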
12309 |
++ |
12310 |
++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, |
12311 |
++ unsigned long *flags) |
12312 |
++{ |
12313 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
12314 |
++} |
12315 |
++ |
12316 |
++static void bfq_changed_ioprio(struct bfq_io_cq *bic); |
12317 |
++static void bfq_put_queue(struct bfq_queue *bfqq); |
12318 |
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); |
12319 |
++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
12320 |
++ struct bfq_group *bfqg, int is_sync, |
12321 |
++ struct bfq_io_cq *bic, gfp_t gfp_mask); |
12322 |
++static void bfq_end_raising_async_queues(struct bfq_data *bfqd, |
12323 |
++ struct bfq_group *bfqg); |
12324 |
++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); |
12325 |
++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
12326 |
++#endif |
12327 |
+-- |
12328 |
+1.8.5.2 |
12329 |
+ |
12330 |
|
12331 |
Deleted: genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch |
12332 |
=================================================================== |
12333 |
--- genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-02-07 14:46:59 UTC (rev 2665) |
12334 |
+++ genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666) |
12335 |
@@ -1,1034 +0,0 @@ |
12336 |
-From 3cd9e2ea29c3ba9e420556e8ecf161d166186b63 Mon Sep 17 00:00:00 2001 |
12337 |
-From: Mauro Andreolini <mauro.andreolini@×××××××.it> |
12338 |
-Date: Thu, 23 Jan 2014 16:54:44 +0100 |
12339 |
-Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7 for |
12340 |
- 3.13.0 |
12341 |
- |
12342 |
-A set of processes may happen to perform interleaved reads, i.e., requests |
12343 |
-whose union would give rise to a sequential read pattern. There are two |
12344 |
-typical cases: in the first case, processes read fixed-size chunks of |
12345 |
-data at a fixed distance from each other, while in the second case processes |
12346 |
-may read variable-size chunks at variable distances. The latter case occurs |
12347 |
-for example with KVM, which splits the I/O generated by the guest into |
12348 |
-multiple chunks, and lets these chunks be served by a pool of cooperating |
12349 |
-processes, iteratively assigning the next chunk of I/O to the first |
12350 |
-available process. CFQ uses actual queue merging for the first type of |
12351 |
-processes, whereas it uses preemption to get a sequential read pattern out |
12352 |
-of the read requests performed by the second type of processes. In the end |
12353 |
-it uses two different mechanisms to achieve the same goal: boosting the |
12354 |
-throughput with interleaved I/O. |
12355 |
- |
12356 |
-This patch introduces Early Queue Merge (EQM), a unified mechanism to get a |
12357 |
-sequential read pattern with both types of processes. The main idea is |
12358 |
-checking newly arrived requests against the next request of the active queue |
12359 |
-both in case of actual request insert and in case of request merge. By doing |
12360 |
-so, both types of processes can be handled by just merging their queues. |
12361 |
-EQM is then simpler and more compact than the pair of mechanisms used in |
12362 |
-CFQ. |
12363 |
- |
12364 |
-Finally, EQM also preserves the typical low-latency properties of BFQ, by |
12365 |
-properly restoring the weight-raising state of a queue when it gets back to |
12366 |
-a non-merged state. |
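For the first case described above, fixed-size chunks at a fixed distance, the effect EQM exploits is easy to see numerically: the union of the two request streams has no holes, so a single merged queue sees a purely sequential pattern. A tiny illustration; the chunk size and layout are made up:

#include <stdio.h>

#define CHUNK 4		/* sectors per request, illustrative */

int main(void)
{
	int i;

	/* Reader A takes even chunks, reader B odd chunks. */
	for (i = 0; i < 4; i++) {
		printf("A reads [%d..%d)\n", 2 * i * CHUNK, (2 * i + 1) * CHUNK);
		printf("B reads [%d..%d)\n", (2 * i + 1) * CHUNK, (2 * i + 2) * CHUNK);
	}
	/* Interleaved in arrival order the two streams already cover 0..32
	 * with no gaps: served from one merged queue, the pattern the disk
	 * sees is purely sequential. */
	return 0;
}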
12367 |
- |
12368 |
-Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it> |
12369 |
-Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
12370 |
-Reviewed-by: Paolo Valente <paolo.valente@×××××××.it> |
12371 |
---- |
12372 |
- block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++---------------- |
12373 |
- block/bfq-sched.c | 28 --- |
12374 |
- block/bfq.h | 16 ++ |
12375 |
- 3 files changed, 474 insertions(+), 227 deletions(-) |
12376 |
- |
12377 |
-diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
12378 |
-index 7670400..295236e 100644 |
12379 |
---- a/block/bfq-iosched.c |
12380 |
-+++ b/block/bfq-iosched.c |
12381 |
-@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) |
12382 |
- return dur; |
12383 |
- } |
12384 |
- |
12385 |
-+static inline void |
12386 |
-+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
12387 |
-+{ |
12388 |
-+ if (bic->saved_idle_window) |
12389 |
-+ bfq_mark_bfqq_idle_window(bfqq); |
12390 |
-+ else |
12391 |
-+ bfq_clear_bfqq_idle_window(bfqq); |
12392 |
-+ if (bic->raising_time_left && bfqq->bfqd->low_latency) { |
12393 |
-+ /* |
12394 |
-+ * Start a weight raising period with the duration given by |
12395 |
-+ * the raising_time_left snapshot. |
12396 |
-+ */ |
12397 |
-+ if (bfq_bfqq_busy(bfqq)) |
12398 |
-+ bfqq->bfqd->raised_busy_queues++; |
12399 |
-+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff; |
12400 |
-+ bfqq->raising_cur_max_time = bic->raising_time_left; |
12401 |
-+ bfqq->last_rais_start_finish = jiffies; |
12402 |
-+ bfqq->entity.ioprio_changed = 1; |
12403 |
-+ } |
12404 |
-+ /* |
12405 |
-+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from |
12406 |
-+ * getting confused about the queue's need of a weight-raising |
12407 |
-+ * period. |
12408 |
-+ */ |
12409 |
-+ bic->raising_time_left = 0; |
12410 |
-+} |
12411 |
-+ |
12412 |
-+/* |
12413 |
-+ * Must be called with the queue_lock held. |
12414 |
-+ */ |
12415 |
-+static int bfqq_process_refs(struct bfq_queue *bfqq) |
12416 |
-+{ |
12417 |
-+ int process_refs, io_refs; |
12418 |
-+ |
12419 |
-+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
12420 |
-+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
12421 |
-+ BUG_ON(process_refs < 0); |
12422 |
-+ return process_refs; |
12423 |
-+} |
12424 |
-+ |
12425 |
- static void bfq_add_rq_rb(struct request *rq) |
12426 |
- { |
12427 |
- struct bfq_queue *bfqq = RQ_BFQQ(rq); |
12428 |
-@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq) |
12429 |
- if (!bfqd->low_latency) |
12430 |
- goto add_bfqq_busy; |
12431 |
- |
12432 |
-+ if (bfq_bfqq_just_split(bfqq)) |
12433 |
-+ goto set_ioprio_changed; |
12434 |
-+ |
12435 |
- /* |
12436 |
-- * If the queue is not being boosted and has been idle |
12437 |
-- * for enough time, start a weight-raising period |
12438 |
-+ * If the queue: |
12439 |
-+ * - is not being boosted, |
12440 |
-+ * - has been idle for enough time, |
12441 |
-+ * - is not a sync queue or is linked to a bfq_io_cq (it is |
12442 |
-+ * shared "for its nature" or it is not shared and its |
12443 |
-+ * requests have not been redirected to a shared queue) |
12444 |
-+ * start a weight-raising period. |
12445 |
- */ |
12446 |
-- if (old_raising_coeff == 1 && |
12447 |
-- (idle_for_long_time || soft_rt)) { |
12448 |
-+ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) && |
12449 |
-+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { |
12450 |
- bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
12451 |
- if (idle_for_long_time) |
12452 |
- bfqq->raising_cur_max_time = |
12453 |
-@@ -572,6 +620,7 @@ static void bfq_add_rq_rb(struct request *rq) |
12454 |
- bfqd->bfq_raising_rt_max_time; |
12455 |
- } |
12456 |
- } |
12457 |
-+set_ioprio_changed: |
12458 |
- if (old_raising_coeff != bfqq->raising_coeff) |
12459 |
- entity->ioprio_changed = 1; |
12460 |
- add_bfqq_busy: |
12461 |
-@@ -754,90 +803,35 @@ static void bfq_end_raising(struct bfq_data *bfqd) |
12462 |
- spin_unlock_irq(bfqd->queue->queue_lock); |
12463 |
- } |
12464 |
- |
12465 |
--static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
12466 |
-- struct bio *bio) |
12467 |
--{ |
12468 |
-- struct bfq_data *bfqd = q->elevator->elevator_data; |
12469 |
-- struct bfq_io_cq *bic; |
12470 |
-- struct bfq_queue *bfqq; |
12471 |
-- |
12472 |
-- /* |
12473 |
-- * Disallow merge of a sync bio into an async request. |
12474 |
-- */ |
12475 |
-- if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
12476 |
-- return 0; |
12477 |
-- |
12478 |
-- /* |
12479 |
-- * Lookup the bfqq that this bio will be queued with. Allow |
12480 |
-- * merge only if rq is queued there. |
12481 |
-- * Queue lock is held here. |
12482 |
-- */ |
12483 |
-- bic = bfq_bic_lookup(bfqd, current->io_context); |
12484 |
-- if (bic == NULL) |
12485 |
-- return 0; |
12486 |
-- |
12487 |
-- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
12488 |
-- return bfqq == RQ_BFQQ(rq); |
12489 |
--} |
12490 |
-- |
12491 |
--static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
12492 |
-- struct bfq_queue *bfqq) |
12493 |
--{ |
12494 |
-- if (bfqq != NULL) { |
12495 |
-- bfq_mark_bfqq_must_alloc(bfqq); |
12496 |
-- bfq_mark_bfqq_budget_new(bfqq); |
12497 |
-- bfq_clear_bfqq_fifo_expire(bfqq); |
12498 |
-- |
12499 |
-- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
12500 |
-- |
12501 |
-- bfq_log_bfqq(bfqd, bfqq, |
12502 |
-- "set_in_service_queue, cur-budget = %lu", |
12503 |
-- bfqq->entity.budget); |
12504 |
-- } |
12505 |
-- |
12506 |
-- bfqd->in_service_queue = bfqq; |
12507 |
--} |
12508 |
-- |
12509 |
--/* |
12510 |
-- * Get and set a new queue for service. |
12511 |
-- */ |
12512 |
--static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd, |
12513 |
-- struct bfq_queue *bfqq) |
12514 |
-+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) |
12515 |
- { |
12516 |
-- if (!bfqq) |
12517 |
-- bfqq = bfq_get_next_queue(bfqd); |
12518 |
-+ if (request) |
12519 |
-+ return blk_rq_pos(io_struct); |
12520 |
- else |
12521 |
-- bfq_get_next_queue_forced(bfqd, bfqq); |
12522 |
-- |
12523 |
-- __bfq_set_in_service_queue(bfqd, bfqq); |
12524 |
-- return bfqq; |
12525 |
-+ return ((struct bio *)io_struct)->bi_sector; |
12526 |
- } |
12527 |
- |
12528 |
--static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, |
12529 |
-- struct request *rq) |
12530 |
-+static inline sector_t bfq_dist_from(sector_t pos1, |
12531 |
-+ sector_t pos2) |
12532 |
- { |
12533 |
-- if (blk_rq_pos(rq) >= bfqd->last_position) |
12534 |
-- return blk_rq_pos(rq) - bfqd->last_position; |
12535 |
-+ if (pos1 >= pos2) |
12536 |
-+ return pos1 - pos2; |
12537 |
- else |
12538 |
-- return bfqd->last_position - blk_rq_pos(rq); |
12539 |
-+ return pos2 - pos1; |
12540 |
- } |
12541 |
- |
12542 |
--/* |
12543 |
-- * Return true if bfqq has no request pending and rq is close enough to |
12544 |
-- * bfqd->last_position, or if rq is closer to bfqd->last_position than |
12545 |
-- * bfqq->next_rq |
12546 |
-- */ |
12547 |
--static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) |
12548 |
-+static inline int bfq_rq_close_to_sector(void *io_struct, bool request, |
12549 |
-+ sector_t sector) |
12550 |
- { |
12551 |
-- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; |
12552 |
-+ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= |
12553 |
-+ BFQQ_SEEK_THR; |
12554 |
- } |
12555 |
- |
12556 |
--static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
12557 |
-+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) |
12558 |
- { |
12559 |
- struct rb_root *root = &bfqd->rq_pos_tree; |
12560 |
- struct rb_node *parent, *node; |
12561 |
- struct bfq_queue *__bfqq; |
12562 |
-- sector_t sector = bfqd->last_position; |
12563 |
- |
12564 |
- if (RB_EMPTY_ROOT(root)) |
12565 |
- return NULL; |
12566 |
-@@ -856,7 +850,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
12567 |
- * position). |
12568 |
- */ |
12569 |
- __bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
12570 |
-- if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
12571 |
-+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) |
12572 |
- return __bfqq; |
12573 |
- |
12574 |
- if (blk_rq_pos(__bfqq->next_rq) < sector) |
12575 |
-@@ -867,7 +861,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
12576 |
- return NULL; |
12577 |
- |
12578 |
- __bfqq = rb_entry(node, struct bfq_queue, pos_node); |
12579 |
-- if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
12580 |
-+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) |
12581 |
- return __bfqq; |
12582 |
- |
12583 |
- return NULL; |
12584 |
-@@ -876,14 +870,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
12585 |
- /* |
12586 |
- * bfqd - obvious |
12587 |
- * cur_bfqq - passed in so that we don't decide that the current queue |
12588 |
-- * is closely cooperating with itself. |
12589 |
-- * |
12590 |
-- * We are assuming that cur_bfqq has dispatched at least one request, |
12591 |
-- * and that bfqd->last_position reflects a position on the disk associated |
12592 |
-- * with the I/O issued by cur_bfqq. |
12593 |
-+ * is closely cooperating with itself |
12594 |
-+ * sector - used as a reference point to search for a close queue |
12595 |
- */ |
12596 |
- static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
12597 |
-- struct bfq_queue *cur_bfqq) |
12598 |
-+ struct bfq_queue *cur_bfqq, |
12599 |
-+ sector_t sector) |
12600 |
- { |
12601 |
- struct bfq_queue *bfqq; |
12602 |
- |
12603 |
-@@ -903,7 +895,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
12604 |
- * working closely on the same area of the disk. In that case, |
12605 |
- * we can group them together and don't waste time idling. |
12606 |
- */ |
12607 |
-- bfqq = bfqq_close(bfqd); |
12608 |
-+ bfqq = bfqq_close(bfqd, sector); |
12609 |
- if (bfqq == NULL || bfqq == cur_bfqq) |
12610 |
- return NULL; |
12611 |
- |
12612 |
-@@ -930,6 +922,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
12613 |
- return bfqq; |
12614 |
- } |
12615 |
- |
12616 |
-+static struct bfq_queue * |
12617 |
-+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
12618 |
-+{ |
12619 |
-+ int process_refs, new_process_refs; |
12620 |
-+ struct bfq_queue *__bfqq; |
12621 |
-+ |
12622 |
-+ /* |
12623 |
-+ * If there are no process references on the new_bfqq, then it is |
12624 |
-+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
12625 |
-+ * may have dropped their last reference (not just their last process |
12626 |
-+ * reference). |
12627 |
-+ */ |
12628 |
-+ if (!bfqq_process_refs(new_bfqq)) |
12629 |
-+ return NULL; |
12630 |
-+ |
12631 |
-+ /* Avoid a circular list and skip interim queue merges. */ |
12632 |
-+ while ((__bfqq = new_bfqq->new_bfqq)) { |
12633 |
-+ if (__bfqq == bfqq) |
12634 |
-+ return NULL; |
12635 |
-+ new_bfqq = __bfqq; |
12636 |
-+ } |
12637 |
-+ |
12638 |
-+ process_refs = bfqq_process_refs(bfqq); |
12639 |
-+ new_process_refs = bfqq_process_refs(new_bfqq); |
12640 |
-+ /* |
12641 |
-+ * If the process for the bfqq has gone away, there is no |
12642 |
-+ * sense in merging the queues. |
12643 |
-+ */ |
12644 |
-+ if (process_refs == 0 || new_process_refs == 0) |
12645 |
-+ return NULL; |
12646 |
-+ |
12647 |
-+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
12648 |
-+ new_bfqq->pid); |
12649 |
-+ |
12650 |
-+ /* |
12651 |
-+ * Merging is just a redirection: the requests of the process owning |
12652 |
-+ * one of the two queues are redirected to the other queue. The latter |
12653 |
-+ * queue, in its turn, is set as shared if this is the first time that |
12654 |
-+ * the requests of some process are redirected to it. |
12655 |
-+ * |
12656 |
-+ * We redirect bfqq to new_bfqq and not the opposite, because we |
12657 |
-+ * are in the context of the process owning bfqq, hence we have the |
12658 |
-+ * io_cq of this process. So we can immediately configure this io_cq |
12659 |
-+ * to redirect the requests of the process to new_bfqq. |
12660 |
-+ * |
12661 |
-+ * NOTE, even if new_bfqq coincides with the in-service queue, the |
12662 |
-+ * io_cq of new_bfqq is not available, because, if the in-service queue |
12663 |
-+ * is shared, bfqd->in_service_bic may not point to the io_cq of the |
12664 |
-+ * in-service queue. |
12665 |
-+ * Redirecting the requests of the process owning bfqq to the currently |
12666 |
-+ * in-service queue is in any case the best option, as we feed the |
12667 |
-+ * in-service queue with new requests close to the last request served |
12668 |
-+ * and, by doing so, hopefully increase the throughput. |
12669 |
-+ */ |
12670 |
-+ bfqq->new_bfqq = new_bfqq; |
12671 |
-+ atomic_add(process_refs, &new_bfqq->ref); |
12672 |
-+ return new_bfqq; |
12673 |
-+} |
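The chain walk at the top of the function, skip interim merges but never follow the chain back to the starting queue, can be isolated as below; the reference counting and the process_refs checks of the real function are left out:

#include <stdio.h>
#include <stddef.h>

struct queue { const char *name; struct queue *new_queue; };

/* Follow ->new_queue redirections; refuse to create a cycle back to q. */
static struct queue *resolve_merge_target(struct queue *q, struct queue *target)
{
	struct queue *t;

	while ((t = target->new_queue) != NULL) {
		if (t == q)
			return NULL;
		target = t;
	}
	return target;
}

int main(void)
{
	struct queue c = { "C", NULL };
	struct queue b = { "B", &c };
	struct queue a = { "A", NULL };
	struct queue *t = resolve_merge_target(&a, &b);

	printf("A merges into %s\n", t ? t->name : "(none)");	/* C */
	return 0;
}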
12674 |
-+ |
12675 |
-+/* |
12676 |
-+ * Attempt to schedule a merge of bfqq with the currently in-service queue or |
12677 |
-+ * with a close queue among the scheduled queues. |
12678 |
-+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue |
12679 |
-+ * structure otherwise. |
12680 |
-+ */ |
12681 |
-+static struct bfq_queue * |
12682 |
-+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
12683 |
-+ void *io_struct, bool request) |
12684 |
-+{ |
12685 |
-+ struct bfq_queue *in_service_bfqq, *new_bfqq; |
12686 |
-+ |
12687 |
-+ if (bfqq->new_bfqq) |
12688 |
-+ return bfqq->new_bfqq; |
12689 |
-+ |
12690 |
-+ if (!io_struct) |
12691 |
-+ return NULL; |
12692 |
-+ |
12693 |
-+ in_service_bfqq = bfqd->in_service_queue; |
12694 |
-+ |
12695 |
-+ if (in_service_bfqq == NULL || in_service_bfqq == bfqq || |
12696 |
-+ !bfqd->in_service_bic) |
12697 |
-+ goto check_scheduled; |
12698 |
-+ |
12699 |
-+ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) |
12700 |
-+ goto check_scheduled; |
12701 |
-+ |
12702 |
-+ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) |
12703 |
-+ goto check_scheduled; |
12704 |
-+ |
12705 |
-+ if (in_service_bfqq->entity.parent != bfqq->entity.parent) |
12706 |
-+ goto check_scheduled; |
12707 |
-+ |
12708 |
-+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && |
12709 |
-+ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { |
12710 |
-+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); |
12711 |
-+ if (new_bfqq != NULL) |
12712 |
-+ return new_bfqq; /* Merge with the in-service queue */ |
12713 |
-+ } |
12714 |
-+ |
12715 |
-+ /* |
12716 |
-+ * Check whether there is a cooperator among currently scheduled |
12717 |
-+ * queues. The only thing we need is that the bio/request is not |
12718 |
-+ * NULL, as we need it to establish whether a cooperator exists. |
12719 |
-+ */ |
12720 |
-+check_scheduled: |
12721 |
-+ new_bfqq = bfq_close_cooperator(bfqd, bfqq, |
12722 |
-+ bfq_io_struct_pos(io_struct, request)); |
12723 |
-+ if (new_bfqq) |
12724 |
-+ return bfq_setup_merge(bfqq, new_bfqq); |
12725 |
-+ |
12726 |
-+ return NULL; |
12727 |
-+} |
12728 |
-+ |
12729 |
-+static inline void |
12730 |
-+bfq_bfqq_save_state(struct bfq_queue *bfqq) |
12731 |
-+{ |
12732 |
-+ /* |
12733 |
-+ * If bfqq->bic == NULL, the queue is already shared or its requests |
12734 |
-+ * have already been redirected to a shared queue; both idle window |
12735 |
-+ * and weight raising state have already been saved. Do nothing. |
12736 |
-+ */ |
12737 |
-+ if (bfqq->bic == NULL) |
12738 |
-+ return; |
12739 |
-+ if (bfqq->bic->raising_time_left) |
12740 |
-+ /* |
12741 |
-+ * This is the queue of a just-started process, and would |
12742 |
-+ * deserve weight raising: we set raising_time_left to the full |
12743 |
-+ * weight-raising duration to trigger weight-raising when and |
12744 |
-+ * if the queue is split and the first request of the queue |
12745 |
-+ * is enqueued. |
12746 |
-+ */ |
12747 |
-+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd); |
12748 |
-+ else if (bfqq->raising_coeff > 1) { |
12749 |
-+ unsigned long wrais_duration = |
12750 |
-+ jiffies - bfqq->last_rais_start_finish; |
12751 |
-+ /* |
12752 |
-+ * It may happen that a queue's weight raising period lasts |
12753 |
-+ * longer than its raising_cur_max_time, as weight raising is |
12754 |
-+ * handled only when a request is enqueued or dispatched (it |
12755 |
-+ * does not use any timer). If the weight raising period is |
12756 |
-+ * about to end, don't save it. |
12757 |
-+ */ |
12758 |
-+ if (bfqq->raising_cur_max_time <= wrais_duration) |
12759 |
-+ bfqq->bic->raising_time_left = 0; |
12760 |
-+ else |
12761 |
-+ bfqq->bic->raising_time_left = |
12762 |
-+ bfqq->raising_cur_max_time - wrais_duration; |
12763 |
-+ /* |
12764 |
-+ * The bfq_queue is becoming shared or the requests of the |
12765 |
-+ * process owning the queue are being redirected to a shared |
12766 |
-+ * queue. Stop the weight raising period of the queue, as in |
12767 |
-+ * both cases it should not be owned by an interactive or soft |
12768 |
-+ * real-time application. |
12769 |
-+ */ |
12770 |
-+ bfq_bfqq_end_raising(bfqq); |
12771 |
-+ } else |
12772 |
-+ bfqq->bic->raising_time_left = 0; |
12773 |
-+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); |
12774 |
-+} |
12775 |
-+ |
12776 |
-+static inline void |
12777 |
-+bfq_get_bic_reference(struct bfq_queue *bfqq) |
12778 |
-+{ |
12779 |
-+ /* |
12780 |
-+ * If bfqq->bic has a non-NULL value, the bic to which it belongs |
12781 |
-+ * is about to begin using a shared bfq_queue. |
12782 |
-+ */ |
12783 |
-+ if (bfqq->bic) |
12784 |
-+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); |
12785 |
-+} |
12786 |
-+ |
12787 |
-+static void |
12788 |
-+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
12789 |
-+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
12790 |
-+{ |
12791 |
-+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", |
12792 |
-+ (long unsigned)new_bfqq->pid); |
12793 |
-+ /* Save weight raising and idle window of the merged queues */ |
12794 |
-+ bfq_bfqq_save_state(bfqq); |
12795 |
-+ bfq_bfqq_save_state(new_bfqq); |
12796 |
-+ /* |
12797 |
-+ * Grab a reference to the bic, to prevent it from being destroyed |
12798 |
-+ * before being possibly touched by a bfq_split_bfqq(). |
12799 |
-+ */ |
12800 |
-+ bfq_get_bic_reference(bfqq); |
12801 |
-+ bfq_get_bic_reference(new_bfqq); |
12802 |
-+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */ |
12803 |
-+ bic_set_bfqq(bic, new_bfqq, 1); |
12804 |
-+ bfq_mark_bfqq_coop(new_bfqq); |
12805 |
-+ /* |
12806 |
-+ * new_bfqq now belongs to at least two bics (it is a shared queue): set |
12807 |
-+ * new_bfqq->bic to NULL. bfqq either: |
12808 |
-+ * - does not belong to any bic any more, and hence bfqq->bic must |
12809 |
-+ * be set to NULL, or |
12810 |
-+ * - is a queue whose owning bics have already been redirected to a |
12811 |
-+ * different queue, hence the queue is destined to not belong to any |
12812 |
-+ * bic soon and bfqq->bic is already NULL (therefore the next |
12813 |
-+ * assignment causes no harm). |
12814 |
-+ */ |
12815 |
-+ new_bfqq->bic = NULL; |
12816 |
-+ bfqq->bic = NULL; |
12817 |
-+ bfq_put_queue(bfqq); |
12818 |
-+} |
12819 |
-+ |
12820 |
-+static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
12821 |
-+ struct bio *bio) |
12822 |
-+{ |
12823 |
-+ struct bfq_data *bfqd = q->elevator->elevator_data; |
12824 |
-+ struct bfq_io_cq *bic; |
12825 |
-+ struct bfq_queue *bfqq, *new_bfqq; |
12826 |
-+ |
12827 |
-+ /* |
12828 |
-+ * Disallow merge of a sync bio into an async request. |
12829 |
-+ */ |
12830 |
-+ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
12831 |
-+ return 0; |
12832 |
-+ |
12833 |
-+ /* |
12834 |
-+ * Lookup the bfqq that this bio will be queued with. Allow |
12835 |
-+ * merge only if rq is queued there. |
12836 |
-+ * Queue lock is held here. |
12837 |
-+ */ |
12838 |
-+ bic = bfq_bic_lookup(bfqd, current->io_context); |
12839 |
-+ if (bic == NULL) |
12840 |
-+ return 0; |
12841 |
-+ |
12842 |
-+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
12843 |
-+ /* |
12844 |
-+ * We take advantage of this function to perform an early merge |
12845 |
-+ * of the queues of possible cooperating processes. |
12846 |
-+ */ |
12847 |
-+ if (bfqq != NULL) { |
12848 |
-+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); |
12849 |
-+ if (new_bfqq != NULL) { |
12850 |
-+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); |
12851 |
-+ /* |
12852 |
-+ * If we get here, the bio will be queued in the shared queue, |
12853 |
-+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and |
12854 |
-+ * rq can be merged. |
12855 |
-+ */ |
12856 |
-+ bfqq = new_bfqq; |
12857 |
-+ } |
12858 |
-+ } |
12859 |
-+ |
12860 |
-+ return bfqq == RQ_BFQQ(rq); |
12861 |
-+} |
12862 |
-+ |
12863 |
-+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
12864 |
-+ struct bfq_queue *bfqq) |
12865 |
-+{ |
12866 |
-+ if (bfqq != NULL) { |
12867 |
-+ bfq_mark_bfqq_must_alloc(bfqq); |
12868 |
-+ bfq_mark_bfqq_budget_new(bfqq); |
12869 |
-+ bfq_clear_bfqq_fifo_expire(bfqq); |
12870 |
-+ |
12871 |
-+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
12872 |
-+ |
12873 |
-+ bfq_log_bfqq(bfqd, bfqq, |
12874 |
-+ "set_in_service_queue, cur-budget = %lu", |
12875 |
-+ bfqq->entity.budget); |
12876 |
-+ } |
12877 |
-+ |
12878 |
-+ bfqd->in_service_queue = bfqq; |
12879 |
-+} |
12880 |
-+ |
12881 |
-+/* |
12882 |
-+ * Get and set a new queue for service. |
12883 |
-+ */ |
12884 |
-+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) |
12885 |
-+{ |
12886 |
-+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); |
12887 |
-+ |
12888 |
-+ __bfq_set_in_service_queue(bfqd, bfqq); |
12889 |
-+ return bfqq; |
12890 |
-+} |
12891 |
-+ |
12892 |
- /* |
12893 |
- * If enough samples have been computed, return the current max budget |
12894 |
- * stored in bfqd, which is dynamically updated according to the |
12895 |
-@@ -1077,63 +1345,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) |
12896 |
- return rq; |
12897 |
- } |
12898 |
- |
12899 |
--/* |
12900 |
-- * Must be called with the queue_lock held. |
12901 |
-- */ |
12902 |
--static int bfqq_process_refs(struct bfq_queue *bfqq) |
12903 |
--{ |
12904 |
-- int process_refs, io_refs; |
12905 |
-- |
12906 |
-- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
12907 |
-- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
12908 |
-- BUG_ON(process_refs < 0); |
12909 |
-- return process_refs; |
12910 |
--} |
12911 |
-- |
12912 |
--static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
12913 |
--{ |
12914 |
-- int process_refs, new_process_refs; |
12915 |
-- struct bfq_queue *__bfqq; |
12916 |
-- |
12917 |
-- /* |
12918 |
-- * If there are no process references on the new_bfqq, then it is |
12919 |
-- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
12920 |
-- * may have dropped their last reference (not just their last process |
12921 |
-- * reference). |
12922 |
-- */ |
12923 |
-- if (!bfqq_process_refs(new_bfqq)) |
12924 |
-- return; |
12925 |
-- |
12926 |
-- /* Avoid a circular list and skip interim queue merges. */ |
12927 |
-- while ((__bfqq = new_bfqq->new_bfqq)) { |
12928 |
-- if (__bfqq == bfqq) |
12929 |
-- return; |
12930 |
-- new_bfqq = __bfqq; |
12931 |
-- } |
12932 |
-- |
12933 |
-- process_refs = bfqq_process_refs(bfqq); |
12934 |
-- new_process_refs = bfqq_process_refs(new_bfqq); |
12935 |
-- /* |
12936 |
-- * If the process for the bfqq has gone away, there is no |
12937 |
-- * sense in merging the queues. |
12938 |
-- */ |
12939 |
-- if (process_refs == 0 || new_process_refs == 0) |
12940 |
-- return; |
12941 |
-- |
12942 |
-- /* |
12943 |
-- * Merge in the direction of the lesser amount of work. |
12944 |
-- */ |
12945 |
-- if (new_process_refs >= process_refs) { |
12946 |
-- bfqq->new_bfqq = new_bfqq; |
12947 |
-- atomic_add(process_refs, &new_bfqq->ref); |
12948 |
-- } else { |
12949 |
-- new_bfqq->new_bfqq = bfqq; |
12950 |
-- atomic_add(new_process_refs, &bfqq->ref); |
12951 |
-- } |
12952 |
-- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
12953 |
-- new_bfqq->pid); |
12954 |
--} |
12955 |
-- |
12956 |
- static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
12957 |
- { |
12958 |
- struct bfq_entity *entity = &bfqq->entity; |
12959 |
-@@ -1703,7 +1914,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) |
12960 |
- */ |
12961 |
- static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
12962 |
- { |
12963 |
-- struct bfq_queue *bfqq, *new_bfqq = NULL; |
12964 |
-+ struct bfq_queue *bfqq; |
12965 |
- struct request *next_rq; |
12966 |
- enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
12967 |
- |
12968 |
-@@ -1713,17 +1924,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
12969 |
- |
12970 |
- bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); |
12971 |
- |
12972 |
-- /* |
12973 |
-- * If another queue has a request waiting within our mean seek |
12974 |
-- * distance, let it run. The expire code will check for close |
12975 |
-- * cooperators and put the close queue at the front of the |
12976 |
-- * service tree. If possible, merge the expiring queue with the |
12977 |
-- * new bfqq. |
12978 |
-- */ |
12979 |
-- new_bfqq = bfq_close_cooperator(bfqd, bfqq); |
12980 |
-- if (new_bfqq != NULL && bfqq->new_bfqq == NULL) |
12981 |
-- bfq_setup_merge(bfqq, new_bfqq); |
12982 |
-- |
12983 |
- if (bfq_may_expire_for_budg_timeout(bfqq) && |
12984 |
- !timer_pending(&bfqd->idle_slice_timer) && |
12985 |
- !bfq_bfqq_must_idle(bfqq)) |
12986 |
-@@ -1760,36 +1960,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
12987 |
- bfq_clear_bfqq_wait_request(bfqq); |
12988 |
- del_timer(&bfqd->idle_slice_timer); |
12989 |
- } |
12990 |
-- if (new_bfqq == NULL) |
12991 |
-- goto keep_queue; |
12992 |
-- else |
12993 |
-- goto expire; |
12994 |
-+ goto keep_queue; |
12995 |
- } |
12996 |
- } |
12997 |
- |
12998 |
- /* |
12999 |
-- * No requests pending. If the in-service queue has no cooperator and |
13000 |
-- * still has requests in flight (possibly waiting for a completion) |
13001 |
-- * or is idling for a new request, then keep it. |
13002 |
-+ * No requests pending. If the in-service queue still has requests in |
13003 |
-+ * flight (possibly waiting for a completion) or is idling for a new |
13004 |
-+ * request, then keep it. |
13005 |
- */ |
13006 |
-- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || |
13007 |
-- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) { |
13008 |
-+ if (timer_pending(&bfqd->idle_slice_timer) || |
13009 |
-+ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { |
13010 |
- bfqq = NULL; |
13011 |
- goto keep_queue; |
13012 |
-- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { |
13013 |
-- /* |
13014 |
-- * Expiring the queue because there is a close cooperator, |
13015 |
-- * cancel timer. |
13016 |
-- */ |
13017 |
-- bfq_clear_bfqq_wait_request(bfqq); |
13018 |
-- del_timer(&bfqd->idle_slice_timer); |
13019 |
- } |
13020 |
- |
13021 |
- reason = BFQ_BFQQ_NO_MORE_REQUESTS; |
13022 |
- expire: |
13023 |
- bfq_bfqq_expire(bfqd, bfqq, 0, reason); |
13024 |
- new_queue: |
13025 |
-- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq); |
13026 |
-+ bfqq = bfq_set_in_service_queue(bfqd); |
13027 |
- bfq_log(bfqd, "select_queue: new queue %d returned", |
13028 |
- bfqq != NULL ? bfqq->pid : 0); |
13029 |
- keep_queue: |
13030 |
-@@ -1799,9 +1989,8 @@ keep_queue: |
13031 |
- static void bfq_update_raising_data(struct bfq_data *bfqd, |
13032 |
- struct bfq_queue *bfqq) |
13033 |
- { |
13034 |
-+ struct bfq_entity *entity = &bfqq->entity; |
13035 |
- if (bfqq->raising_coeff > 1) { /* queue is being boosted */ |
13036 |
-- struct bfq_entity *entity = &bfqq->entity; |
13037 |
-- |
13038 |
- bfq_log_bfqq(bfqd, bfqq, |
13039 |
- "raising period dur %u/%u msec, " |
13040 |
- "old raising coeff %u, w %d(%d)", |
13041 |
-@@ -1818,7 +2007,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd, |
13042 |
- "WARN: pending prio change"); |
13043 |
- /* |
13044 |
- * If too much time has elapsed from the beginning |
13045 |
-- * of this weight-raising, stop it. |
13046 |
-+ * of this weight-raising period, stop it. |
13047 |
- */ |
13048 |
- if (jiffies - bfqq->last_rais_start_finish > |
13049 |
- bfqq->raising_cur_max_time) { |
13050 |
-@@ -1830,11 +2019,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd, |
13051 |
- jiffies_to_msecs(bfqq-> |
13052 |
- raising_cur_max_time)); |
13053 |
- bfq_bfqq_end_raising(bfqq); |
13054 |
-- __bfq_entity_update_weight_prio( |
13055 |
-- bfq_entity_service_tree(entity), |
13056 |
-- entity); |
13057 |
- } |
13058 |
- } |
13059 |
-+ /* Update weight both if it must be raised and if it must be lowered */ |
13060 |
-+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1)) |
13061 |
-+ __bfq_entity_update_weight_prio( |
13062 |
-+ bfq_entity_service_tree(entity), |
13063 |
-+ entity); |
13064 |
- } |
13065 |
- |
13066 |
- /* |
13067 |
-@@ -2075,6 +2266,25 @@ static void bfq_init_icq(struct io_cq *icq) |
13068 |
- struct bfq_io_cq *bic = icq_to_bic(icq); |
13069 |
- |
13070 |
- bic->ttime.last_end_request = jiffies; |
13071 |
-+ /* |
13072 |
-+ * A newly created bic indicates that the process has just |
13073 |
-+ * started doing I/O, and is probably mapping into memory its |
13074 |
-+ * executable and libraries: it definitely needs weight raising. |
13075 |
-+ * There is however the possibility that the process performs, |
13076 |
-+ * for a while, I/O close to some other process. EQM intercepts |
13077 |
-+ * this behavior and may merge the queue corresponding to the |
13078 |
-+ * process with some other queue, BEFORE the weight of the queue |
13079 |
-+ * is raised. Merged queues are not weight-raised (they are assumed |
13080 |
-+ * to belong to processes that benefit only from high throughput). |
13081 |
-+ * If the merge is basically the consequence of an accident, then |
13082 |
-+ * the queue will be split soon and will get back its old weight. |
13083 |
-+ * It is then important to write down somewhere that this queue |
13084 |
-+ * does need weight raising, even if it did not make it to get its |
13085 |
-+ * weight raised before being merged. To this purpose, we overload |
13086 |
-+ * the field raising_time_left and assign 1 to it, to mark the queue |
13087 |
-+ * as needing weight raising. |
13088 |
-+ */ |
13089 |
-+ bic->raising_time_left = 1; |
13090 |
- } |
13091 |
- |
13092 |
- static void bfq_exit_icq(struct io_cq *icq) |
13093 |
-@@ -2088,6 +2298,13 @@ static void bfq_exit_icq(struct io_cq *icq) |
13094 |
- } |
13095 |
- |
13096 |
- if (bic->bfqq[BLK_RW_SYNC]) { |
13097 |
-+ /* |
13098 |
-+ * If the bic is using a shared queue, put the reference |
13099 |
-+ * taken on the io_context when the bic started using a |
13100 |
-+ * shared bfq_queue. |
13101 |
-+ */ |
13102 |
-+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) |
13103 |
-+ put_io_context(icq->ioc); |
13104 |
- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); |
13105 |
- bic->bfqq[BLK_RW_SYNC] = NULL; |
13106 |
- } |
13107 |
-@@ -2375,6 +2592,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, |
13108 |
- if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) |
13109 |
- return; |
13110 |
- |
13111 |
-+ /* Idle window just restored, statistics are meaningless. */ |
13112 |
-+ if (bfq_bfqq_just_split(bfqq)) |
13113 |
-+ return; |
13114 |
-+ |
13115 |
- enable_idle = bfq_bfqq_idle_window(bfqq); |
13116 |
- |
13117 |
- if (atomic_read(&bic->icq.ioc->active_ref) == 0 || |
13118 |
-@@ -2415,6 +2636,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
13119 |
- if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
13120 |
- !BFQQ_SEEKY(bfqq)) |
13121 |
- bfq_update_idle_window(bfqd, bfqq, bic); |
13122 |
-+ bfq_clear_bfqq_just_split(bfqq); |
13123 |
- |
13124 |
- bfq_log_bfqq(bfqd, bfqq, |
13125 |
- "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", |
13126 |
-@@ -2475,13 +2697,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
13127 |
- static void bfq_insert_request(struct request_queue *q, struct request *rq) |
13128 |
- { |
13129 |
- struct bfq_data *bfqd = q->elevator->elevator_data; |
13130 |
-- struct bfq_queue *bfqq = RQ_BFQQ(rq); |
13131 |
-+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; |
13132 |
- |
13133 |
- assert_spin_locked(bfqd->queue->queue_lock); |
13134 |
-+ |
13135 |
-+ /* |
13136 |
-+ * An unplug may trigger a requeue of a request from the device |
13137 |
-+ * driver: make sure we are in process context while trying to |
13138 |
-+ * merge two bfq_queues. |
13139 |
-+ */ |
13140 |
-+ if (!in_interrupt()) { |
13141 |
-+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); |
13142 |
-+ if (new_bfqq != NULL) { |
13143 |
-+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) |
13144 |
-+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); |
13145 |
-+ /* |
13146 |
-+ * Release the request's reference to the old bfqq |
13147 |
-+ * and make sure one is taken to the shared queue. |
13148 |
-+ */ |
13149 |
-+ new_bfqq->allocated[rq_data_dir(rq)]++; |
13150 |
-+ bfqq->allocated[rq_data_dir(rq)]--; |
13151 |
-+ atomic_inc(&new_bfqq->ref); |
13152 |
-+ bfq_put_queue(bfqq); |
13153 |
-+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) |
13154 |
-+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), |
13155 |
-+ bfqq, new_bfqq); |
13156 |
-+ rq->elv.priv[1] = new_bfqq; |
13157 |
-+ bfqq = new_bfqq; |
13158 |
-+ } |
13159 |
-+ } |
13160 |
-+ |
13161 |
- bfq_init_prio_data(bfqq, RQ_BIC(rq)); |
13162 |
- |
13163 |
- bfq_add_rq_rb(rq); |
13164 |
- |
13165 |
-+ /* |
13166 |
-+ * Here a newly-created bfq_queue has already started a weight-raising |
13167 |
-+ * period: clear raising_time_left to prevent bfq_bfqq_save_state() |
13168 |
-+ * from assigning it a full weight-raising period. See the detailed |
13169 |
-+ * comments about this field in bfq_init_icq(). |
13170 |
-+ */ |
13171 |
-+ if (bfqq->bic != NULL) |
13172 |
-+ bfqq->bic->raising_time_left = 0; |
13173 |
- rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); |
13174 |
- list_add_tail(&rq->queuelist, &bfqq->fifo); |
13175 |
- |
13176 |
-@@ -2629,18 +2886,6 @@ static void bfq_put_request(struct request *rq) |
13177 |
- } |
13178 |
- } |
13179 |
- |
13180 |
--static struct bfq_queue * |
13181 |
--bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
13182 |
-- struct bfq_queue *bfqq) |
13183 |
--{ |
13184 |
-- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", |
13185 |
-- (long unsigned)bfqq->new_bfqq->pid); |
13186 |
-- bic_set_bfqq(bic, bfqq->new_bfqq, 1); |
13187 |
-- bfq_mark_bfqq_coop(bfqq->new_bfqq); |
13188 |
-- bfq_put_queue(bfqq); |
13189 |
-- return bic_to_bfqq(bic, 1); |
13190 |
--} |
13191 |
-- |
13192 |
- /* |
13193 |
- * Returns NULL if a new bfqq should be allocated, or the old bfqq if this |
13194 |
- * was the last process referring to said bfqq. |
13195 |
-@@ -2649,6 +2894,9 @@ static struct bfq_queue * |
13196 |
- bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) |
13197 |
- { |
13198 |
- bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); |
13199 |
-+ |
13200 |
-+ put_io_context(bic->icq.ioc); |
13201 |
-+ |
13202 |
- if (bfqq_process_refs(bfqq) == 1) { |
13203 |
- bfqq->pid = current->pid; |
13204 |
- bfq_clear_bfqq_coop(bfqq); |
13205 |
-@@ -2677,6 +2925,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, |
13206 |
- struct bfq_queue *bfqq; |
13207 |
- struct bfq_group *bfqg; |
13208 |
- unsigned long flags; |
13209 |
-+ bool split = false; |
13210 |
- |
13211 |
- might_sleep_if(gfp_mask & __GFP_WAIT); |
13212 |
- |
13213 |
-@@ -2695,24 +2944,14 @@ new_queue: |
13214 |
- bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); |
13215 |
- bic_set_bfqq(bic, bfqq, is_sync); |
13216 |
- } else { |
13217 |
-- /* |
13218 |
-- * If the queue was seeky for too long, break it apart. |
13219 |
-- */ |
13220 |
-+ /* If the queue was seeky for too long, break it apart. */ |
13221 |
- if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { |
13222 |
- bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); |
13223 |
- bfqq = bfq_split_bfqq(bic, bfqq); |
13224 |
-+ split = true; |
13225 |
- if (!bfqq) |
13226 |
- goto new_queue; |
13227 |
- } |
13228 |
-- |
13229 |
-- /* |
13230 |
-- * Check to see if this queue is scheduled to merge with |
13231 |
-- * another closely cooperating queue. The merging of queues |
13232 |
-- * happens here as it must be done in process context. |
13233 |
-- * The reference on new_bfqq was taken in merge_bfqqs. |
13234 |
-- */ |
13235 |
-- if (bfqq->new_bfqq != NULL) |
13236 |
-- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq); |
13237 |
- } |
13238 |
- |
13239 |
- bfqq->allocated[rw]++; |
13240 |
-@@ -2723,6 +2962,26 @@ new_queue: |
13241 |
- rq->elv.priv[0] = bic; |
13242 |
- rq->elv.priv[1] = bfqq; |
13243 |
- |
13244 |
-+ /* |
13245 |
-+ * If a bfq_queue has only one process reference, it is owned |
13246 |
-+ * by only one bfq_io_cq: we can set the bic field of the |
13247 |
-+ * bfq_queue to the address of that structure. Also, if the |
13248 |
-+ * queue has just been split, mark a flag so that the |
13249 |
-+ * information is available to the other scheduler hooks. |
13250 |
-+ */ |
13251 |
-+ if (bfqq_process_refs(bfqq) == 1) { |
13252 |
-+ bfqq->bic = bic; |
13253 |
-+ if (split) { |
13254 |
-+ bfq_mark_bfqq_just_split(bfqq); |
13255 |
-+ /* |
13256 |
-+ * If the queue has just been split from a shared queue, |
13257 |
-+ * restore the idle window and the possible weight |
13258 |
-+ * raising period. |
13259 |
-+ */ |
13260 |
-+ bfq_bfqq_resume_state(bfqq, bic); |
13261 |
-+ } |
13262 |
-+ } |
13263 |
-+ |
13264 |
- spin_unlock_irqrestore(q->queue_lock, flags); |
13265 |
- |
13266 |
- return 0; |
13267 |
-diff --git a/block/bfq-sched.c b/block/bfq-sched.c |
13268 |
-index 30df81c..47e66a8 100644 |
13269 |
---- a/block/bfq-sched.c |
13270 |
-+++ b/block/bfq-sched.c |
13271 |
-@@ -979,34 +979,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) |
13272 |
- return bfqq; |
13273 |
- } |
13274 |
- |
13275 |
--/* |
13276 |
-- * Forced extraction of the given queue. |
13277 |
-- */ |
13278 |
--static void bfq_get_next_queue_forced(struct bfq_data *bfqd, |
13279 |
-- struct bfq_queue *bfqq) |
13280 |
--{ |
13281 |
-- struct bfq_entity *entity; |
13282 |
-- struct bfq_sched_data *sd; |
13283 |
-- |
13284 |
-- BUG_ON(bfqd->in_service_queue != NULL); |
13285 |
-- |
13286 |
-- entity = &bfqq->entity; |
13287 |
-- /* |
13288 |
-- * Bubble up extraction/update from the leaf to the root. |
13289 |
-- */ |
13290 |
-- for_each_entity(entity) { |
13291 |
-- sd = entity->sched_data; |
13292 |
-- bfq_update_budget(entity); |
13293 |
-- bfq_update_vtime(bfq_entity_service_tree(entity)); |
13294 |
-- bfq_active_extract(bfq_entity_service_tree(entity), entity); |
13295 |
-- sd->active_entity = entity; |
13296 |
-- sd->next_active = NULL; |
13297 |
-- entity->service = 0; |
13298 |
-- } |
13299 |
-- |
13300 |
-- return; |
13301 |
--} |
13302 |
-- |
13303 |
- static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) |
13304 |
- { |
13305 |
- if (bfqd->in_service_bic != NULL) { |
13306 |
-diff --git a/block/bfq.h b/block/bfq.h |
13307 |
-index 68b28e3..438f560 100644 |
13308 |
---- a/block/bfq.h |
13309 |
-+++ b/block/bfq.h |
13310 |
-@@ -192,6 +192,8 @@ struct bfq_group; |
13311 |
- * idle to backlogged |
13312 |
- * @service_from_backlogged: cumulative service received from the @bfq_queue |
13313 |
- * since the last transition from idle to backlogged |
13314 |
-+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the |
13315 |
-+ * queue is shared |
13316 |
- * |
13317 |
- * A bfq_queue is a leaf request queue; it can be associated to an io_context |
13318 |
- * or more (if it is an async one). @cgroup holds a reference to the |
13319 |
-@@ -235,6 +237,7 @@ struct bfq_queue { |
13320 |
- sector_t last_request_pos; |
13321 |
- |
13322 |
- pid_t pid; |
13323 |
-+ struct bfq_io_cq *bic; |
13324 |
- |
13325 |
- /* weight-raising fields */ |
13326 |
- unsigned int raising_cur_max_time; |
13327 |
-@@ -264,12 +267,23 @@ struct bfq_ttime { |
13328 |
- * @icq: associated io_cq structure |
13329 |
- * @bfqq: array of two process queues, the sync and the async |
13330 |
- * @ttime: associated @bfq_ttime struct |
13331 |
-+ * @raising_time_left: snapshot of the time left before weight raising ends |
13332 |
-+ * for the sync queue associated to this process; this |
13333 |
-+ * snapshot is taken to remember this value while the weight |
13334 |
-+ * raising is suspended because the queue is merged with a |
13335 |
-+ * shared queue, and is used to set @raising_cur_max_time |
13336 |
-+ * when the queue is split from the shared queue and its |
13337 |
-+ * weight is raised again |
13338 |
-+ * @saved_idle_window: same purpose as the previous field for the idle window |
13339 |
- */ |
13340 |
- struct bfq_io_cq { |
13341 |
- struct io_cq icq; /* must be the first member */ |
13342 |
- struct bfq_queue *bfqq[2]; |
13343 |
- struct bfq_ttime ttime; |
13344 |
- int ioprio; |
13345 |
-+ |
13346 |
-+ unsigned int raising_time_left; |
13347 |
-+ unsigned int saved_idle_window; |
13348 |
- }; |
13349 |
- |
13350 |
- /** |
13351 |
-@@ -411,6 +425,7 @@ enum bfqq_state_flags { |
13352 |
- BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ |
13353 |
- BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ |
13354 |
- BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ |
13355 |
-+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ |
13356 |
- BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */ |
13357 |
- }; |
13358 |
- |
13359 |
-@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync); |
13360 |
- BFQ_BFQQ_FNS(budget_new); |
13361 |
- BFQ_BFQQ_FNS(coop); |
13362 |
- BFQ_BFQQ_FNS(split_coop); |
13363 |
-+BFQ_BFQQ_FNS(just_split); |
13364 |
- BFQ_BFQQ_FNS(softrt_update); |
13365 |
- #undef BFQ_BFQQ_FNS |
13366 |
- |
13367 |
--- |
13368 |
-1.8.5.2 |
13369 |
- |
13370 |
|
13371 |
Added: genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch |
13372 |
=================================================================== |
13373 |
--- genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch (rev 0) |
13374 |
+++ genpatches-2.6/trunk/3.13/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666) |
13375 |
@@ -0,0 +1,1034 @@ |
13376 |
+From 39b1dba58b2562ba0d93a33a4f9af662d3c790c5 Mon Sep 17 00:00:00 2001 |
13377 |
+From: Mauro Andreolini <mauro.andreolini@×××××××.it> |
13378 |
+Date: Thu, 23 Jan 2014 16:54:44 +0100 |
13379 |
+Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r1 for |
13380 |
+ 3.13.0 |
13381 |
+ |
13382 |
+A set of processes may happen to perform interleaved reads, i.e., requests |
13383 |
+whose union would give rise to a sequential read pattern. There are two |
13384 |
+typical cases: in the first case, processes read fixed-size chunks of |
13385 |
+data at a fixed distance from each other, while in the second case processes |
13386 |
+may read variable-size chunks at variable distances. The latter case occurs |
13387 |
+for example with KVM, which splits the I/O generated by the guest into |
13388 |
+multiple chunks, and lets these chunks be served by a pool of cooperating |
13389 |
+processes, iteratively assigning the next chunk of I/O to the first |
13390 |
+available process. CFQ uses actual queue merging for the first type of |
13391 |
+processes, whereas it uses preemption to get a sequential read pattern out |
13392 |
+of the read requests performed by the second type of processes. In the end |
13393 |
+it uses two different mechanisms to achieve the same goal: boosting the |
13394 |
+throughput with interleaved I/O. |
13395 |
+ |
13396 |
+This patch introduces Early Queue Merge (EQM), a unified mechanism to get a |
13397 |
+sequential read pattern with both types of processes. The main idea is |
13398 |
+checking newly arrived requests against the next request of the active queue |
13399 |
+both in case of actual request insert and in case of request merge. By doing |
13400 |
+so, both the types of processes can be handled by just merging their queues. |
13401 |
+EQM is then simpler and more compact than the pair of mechanisms used in |
13402 |
+CFQ. |
13403 |
+ |
13404 |
+Finally, EQM also preserves the typical low-latency properties of BFQ, by |
13405 |
+properly restoring the weight-raising state of a queue when it gets back to |
13406 |
+a non-merged state. |
13407 |
+ |
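+As a rough userspace illustration of the first type of interleaved readers |
+described above (fixed-size chunks at a fixed distance), consider the minimal |
+sketch below. It is not part of the patch; the file name, chunk size and chunk |
+count are arbitrary illustration values. Two processes each read every other |
+64 KiB chunk, so the union of their requests is one sequential stream that a |
+single merged queue can serve sequentially. |
+ |
+/* |
+ * Minimal sketch (not part of this patch): two cooperating processes |
+ * read fixed-size chunks at a fixed distance from each other, so the |
+ * union of their requests forms a single sequential read pattern. |
+ * "testfile", CHUNK and NCHUNKS are arbitrary illustration values. |
+ */ |
+#include <fcntl.h> |
+#include <stdio.h> |
+#include <stdlib.h> |
+#include <sys/types.h> |
+#include <sys/wait.h> |
+#include <unistd.h> |
+ |
+#define CHUNK   (64 * 1024)    /* size of each read */ |
+#define NCHUNKS 128            /* chunks read by each process */ |
+ |
+int main(void) |
+{ |
+	int fd = open("testfile", O_RDONLY); |
+	if (fd < 0) { |
+		perror("open"); |
+		return 1; |
+	} |
+ |
+	pid_t pid = fork(); |
+	if (pid < 0) { |
+		perror("fork"); |
+		return 1; |
+	} |
+ |
+	/* Child reads even-numbered chunks, parent reads odd-numbered ones. */ |
+	int first = (pid == 0) ? 0 : 1; |
+	char *buf = malloc(CHUNK); |
+ |
+	for (int i = first; i < 2 * NCHUNKS; i += 2) |
+		if (pread(fd, buf, CHUNK, (off_t)i * CHUNK) < 0) |
+			perror("pread"); |
+ |
+	free(buf); |
+	close(fd); |
+	if (pid > 0) |
+		wait(NULL); |
+	return 0; |
+} |
+ |
+The second type of pattern (variable-size chunks at variable distances, as |
+with KVM) differs only in how each iteration computes its offset and length. |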
13408 |
+Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it> |
13409 |
+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
13410 |
+Reviewed-by: Paolo Valente <paolo.valente@×××××××.it> |
13411 |
+--- |
13412 |
+ block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++---------------- |
13413 |
+ block/bfq-sched.c | 28 --- |
13414 |
+ block/bfq.h | 16 ++ |
13415 |
+ 3 files changed, 474 insertions(+), 227 deletions(-) |
13416 |
+ |
13417 |
+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
13418 |
+index eb760de..06ee844 100644 |
13419 |
+--- a/block/bfq-iosched.c |
13420 |
++++ b/block/bfq-iosched.c |
13421 |
+@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) |
13422 |
+ return dur; |
13423 |
+ } |
13424 |
+ |
13425 |
++static inline void |
13426 |
++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
13427 |
++{ |
13428 |
++ if (bic->saved_idle_window) |
13429 |
++ bfq_mark_bfqq_idle_window(bfqq); |
13430 |
++ else |
13431 |
++ bfq_clear_bfqq_idle_window(bfqq); |
13432 |
++ if (bic->raising_time_left && bfqq->bfqd->low_latency) { |
13433 |
++ /* |
13434 |
++ * Start a weight raising period with the duration given by |
13435 |
++ * the raising_time_left snapshot. |
13436 |
++ */ |
13437 |
++ if (bfq_bfqq_busy(bfqq)) |
13438 |
++ bfqq->bfqd->raised_busy_queues++; |
13439 |
++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff; |
13440 |
++ bfqq->raising_cur_max_time = bic->raising_time_left; |
13441 |
++ bfqq->last_rais_start_finish = jiffies; |
13442 |
++ bfqq->entity.ioprio_changed = 1; |
13443 |
++ } |
13444 |
++ /* |
13445 |
++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from |
13446 |
++ * getting confused about the queue's need of a weight-raising |
13447 |
++ * period. |
13448 |
++ */ |
13449 |
++ bic->raising_time_left = 0; |
13450 |
++} |
13451 |
++ |
13452 |
++/* |
13453 |
++ * Must be called with the queue_lock held. |
13454 |
++ */ |
13455 |
++static int bfqq_process_refs(struct bfq_queue *bfqq) |
13456 |
++{ |
13457 |
++ int process_refs, io_refs; |
13458 |
++ |
13459 |
++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
13460 |
++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
13461 |
++ BUG_ON(process_refs < 0); |
13462 |
++ return process_refs; |
13463 |
++} |
13464 |
++ |
13465 |
+ static void bfq_add_rq_rb(struct request *rq) |
13466 |
+ { |
13467 |
+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
13468 |
+@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq) |
13469 |
+ if (!bfqd->low_latency) |
13470 |
+ goto add_bfqq_busy; |
13471 |
+ |
13472 |
++ if (bfq_bfqq_just_split(bfqq)) |
13473 |
++ goto set_ioprio_changed; |
13474 |
++ |
13475 |
+ /* |
13476 |
+- * If the queue is not being boosted and has been idle |
13477 |
+- * for enough time, start a weight-raising period |
13478 |
++ * If the queue: |
13479 |
++ * - is not being boosted, |
13480 |
++ * - has been idle for enough time, |
13481 |
++ * - is not a sync queue or is linked to a bfq_io_cq (it is |
13482 |
++ * shared "for its nature" or it is not shared and its |
13483 |
++ * requests have not been redirected to a shared queue) |
13484 |
++ * start a weight-raising period. |
13485 |
+ */ |
13486 |
+- if (old_raising_coeff == 1 && |
13487 |
+- (idle_for_long_time || soft_rt)) { |
13488 |
++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) && |
13489 |
++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { |
13490 |
+ bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
13491 |
+ if (idle_for_long_time) |
13492 |
+ bfqq->raising_cur_max_time = |
13493 |
+@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq) |
13494 |
+ bfqd->bfq_raising_rt_max_time; |
13495 |
+ } |
13496 |
+ } |
13497 |
++set_ioprio_changed: |
13498 |
+ if (old_raising_coeff != bfqq->raising_coeff) |
13499 |
+ entity->ioprio_changed = 1; |
13500 |
+ add_bfqq_busy: |
13501 |
+@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd) |
13502 |
+ spin_unlock_irq(bfqd->queue->queue_lock); |
13503 |
+ } |
13504 |
+ |
13505 |
+-static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
13506 |
+- struct bio *bio) |
13507 |
+-{ |
13508 |
+- struct bfq_data *bfqd = q->elevator->elevator_data; |
13509 |
+- struct bfq_io_cq *bic; |
13510 |
+- struct bfq_queue *bfqq; |
13511 |
+- |
13512 |
+- /* |
13513 |
+- * Disallow merge of a sync bio into an async request. |
13514 |
+- */ |
13515 |
+- if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
13516 |
+- return 0; |
13517 |
+- |
13518 |
+- /* |
13519 |
+- * Lookup the bfqq that this bio will be queued with. Allow |
13520 |
+- * merge only if rq is queued there. |
13521 |
+- * Queue lock is held here. |
13522 |
+- */ |
13523 |
+- bic = bfq_bic_lookup(bfqd, current->io_context); |
13524 |
+- if (bic == NULL) |
13525 |
+- return 0; |
13526 |
+- |
13527 |
+- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
13528 |
+- return bfqq == RQ_BFQQ(rq); |
13529 |
+-} |
13530 |
+- |
13531 |
+-static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
13532 |
+- struct bfq_queue *bfqq) |
13533 |
+-{ |
13534 |
+- if (bfqq != NULL) { |
13535 |
+- bfq_mark_bfqq_must_alloc(bfqq); |
13536 |
+- bfq_mark_bfqq_budget_new(bfqq); |
13537 |
+- bfq_clear_bfqq_fifo_expire(bfqq); |
13538 |
+- |
13539 |
+- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
13540 |
+- |
13541 |
+- bfq_log_bfqq(bfqd, bfqq, |
13542 |
+- "set_in_service_queue, cur-budget = %lu", |
13543 |
+- bfqq->entity.budget); |
13544 |
+- } |
13545 |
+- |
13546 |
+- bfqd->in_service_queue = bfqq; |
13547 |
+-} |
13548 |
+- |
13549 |
+-/* |
13550 |
+- * Get and set a new queue for service. |
13551 |
+- */ |
13552 |
+-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd, |
13553 |
+- struct bfq_queue *bfqq) |
13554 |
++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) |
13555 |
+ { |
13556 |
+- if (!bfqq) |
13557 |
+- bfqq = bfq_get_next_queue(bfqd); |
13558 |
++ if (request) |
13559 |
++ return blk_rq_pos(io_struct); |
13560 |
+ else |
13561 |
+- bfq_get_next_queue_forced(bfqd, bfqq); |
13562 |
+- |
13563 |
+- __bfq_set_in_service_queue(bfqd, bfqq); |
13564 |
+- return bfqq; |
13565 |
++ return ((struct bio *)io_struct)->bi_sector; |
13566 |
+ } |
13567 |
+ |
13568 |
+-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, |
13569 |
+- struct request *rq) |
13570 |
++static inline sector_t bfq_dist_from(sector_t pos1, |
13571 |
++ sector_t pos2) |
13572 |
+ { |
13573 |
+- if (blk_rq_pos(rq) >= bfqd->last_position) |
13574 |
+- return blk_rq_pos(rq) - bfqd->last_position; |
13575 |
++ if (pos1 >= pos2) |
13576 |
++ return pos1 - pos2; |
13577 |
+ else |
13578 |
+- return bfqd->last_position - blk_rq_pos(rq); |
13579 |
++ return pos2 - pos1; |
13580 |
+ } |
13581 |
+ |
13582 |
+-/* |
13583 |
+- * Return true if bfqq has no request pending and rq is close enough to |
13584 |
+- * bfqd->last_position, or if rq is closer to bfqd->last_position than |
13585 |
+- * bfqq->next_rq |
13586 |
+- */ |
13587 |
+-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) |
13588 |
++static inline int bfq_rq_close_to_sector(void *io_struct, bool request, |
13589 |
++ sector_t sector) |
13590 |
+ { |
13591 |
+- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; |
13592 |
++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= |
13593 |
++ BFQQ_SEEK_THR; |
13594 |
+ } |
13595 |
+ |
13596 |
+-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
13597 |
++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) |
13598 |
+ { |
13599 |
+ struct rb_root *root = &bfqd->rq_pos_tree; |
13600 |
+ struct rb_node *parent, *node; |
13601 |
+ struct bfq_queue *__bfqq; |
13602 |
+- sector_t sector = bfqd->last_position; |
13603 |
+ |
13604 |
+ if (RB_EMPTY_ROOT(root)) |
13605 |
+ return NULL; |
13606 |
+@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
13607 |
+ * position). |
13608 |
+ */ |
13609 |
+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
13610 |
+- if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
13611 |
++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) |
13612 |
+ return __bfqq; |
13613 |
+ |
13614 |
+ if (blk_rq_pos(__bfqq->next_rq) < sector) |
13615 |
+@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
13616 |
+ return NULL; |
13617 |
+ |
13618 |
+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); |
13619 |
+- if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
13620 |
++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) |
13621 |
+ return __bfqq; |
13622 |
+ |
13623 |
+ return NULL; |
13624 |
+@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
13625 |
+ /* |
13626 |
+ * bfqd - obvious |
13627 |
+ * cur_bfqq - passed in so that we don't decide that the current queue |
13628 |
+- * is closely cooperating with itself. |
13629 |
+- * |
13630 |
+- * We are assuming that cur_bfqq has dispatched at least one request, |
13631 |
+- * and that bfqd->last_position reflects a position on the disk associated |
13632 |
+- * with the I/O issued by cur_bfqq. |
13633 |
++ * is closely cooperating with itself |
13634 |
++ * sector - used as a reference point to search for a close queue |
13635 |
+ */ |
13636 |
+ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
13637 |
+- struct bfq_queue *cur_bfqq) |
13638 |
++ struct bfq_queue *cur_bfqq, |
13639 |
++ sector_t sector) |
13640 |
+ { |
13641 |
+ struct bfq_queue *bfqq; |
13642 |
+ |
13643 |
+@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
13644 |
+ * working closely on the same area of the disk. In that case, |
13645 |
+ * we can group them together and don't waste time idling. |
13646 |
+ */ |
13647 |
+- bfqq = bfqq_close(bfqd); |
13648 |
++ bfqq = bfqq_close(bfqd, sector); |
13649 |
+ if (bfqq == NULL || bfqq == cur_bfqq) |
13650 |
+ return NULL; |
13651 |
+ |
13652 |
+@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
13653 |
+ return bfqq; |
13654 |
+ } |
13655 |
+ |
13656 |
++static struct bfq_queue * |
13657 |
++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
13658 |
++{ |
13659 |
++ int process_refs, new_process_refs; |
13660 |
++ struct bfq_queue *__bfqq; |
13661 |
++ |
13662 |
++ /* |
13663 |
++ * If there are no process references on the new_bfqq, then it is |
13664 |
++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
13665 |
++ * may have dropped their last reference (not just their last process |
13666 |
++ * reference). |
13667 |
++ */ |
13668 |
++ if (!bfqq_process_refs(new_bfqq)) |
13669 |
++ return NULL; |
13670 |
++ |
13671 |
++ /* Avoid a circular list and skip interim queue merges. */ |
13672 |
++ while ((__bfqq = new_bfqq->new_bfqq)) { |
13673 |
++ if (__bfqq == bfqq) |
13674 |
++ return NULL; |
13675 |
++ new_bfqq = __bfqq; |
13676 |
++ } |
13677 |
++ |
13678 |
++ process_refs = bfqq_process_refs(bfqq); |
13679 |
++ new_process_refs = bfqq_process_refs(new_bfqq); |
13680 |
++ /* |
13681 |
++ * If the process for the bfqq has gone away, there is no |
13682 |
++ * sense in merging the queues. |
13683 |
++ */ |
13684 |
++ if (process_refs == 0 || new_process_refs == 0) |
13685 |
++ return NULL; |
13686 |
++ |
13687 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
13688 |
++ new_bfqq->pid); |
13689 |
++ |
13690 |
++ /* |
13691 |
++ * Merging is just a redirection: the requests of the process owning |
13692 |
++ * one of the two queues are redirected to the other queue. The latter |
13693 |
++ * queue, in its turn, is set as shared if this is the first time that |
13694 |
++ * the requests of some process are redirected to it. |
13695 |
++ * |
13696 |
++ * We redirect bfqq to new_bfqq and not the opposite, because we |
13697 |
++ * are in the context of the process owning bfqq, hence we have the |
13698 |
++ * io_cq of this process. So we can immediately configure this io_cq |
13699 |
++ * to redirect the requests of the process to new_bfqq. |
13700 |
++ * |
13701 |
++ * NOTE, even if new_bfqq coincides with the in-service queue, the |
13702 |
++ * io_cq of new_bfqq is not available, because, if the in-service queue |
13703 |
++ * is shared, bfqd->in_service_bic may not point to the io_cq of the |
13704 |
++ * in-service queue. |
13705 |
++ * Redirecting the requests of the process owning bfqq to the currently |
13706 |
++ * in-service queue is in any case the best option, as we feed the |
13707 |
++ * in-service queue with new requests close to the last request served |
13708 |
++ * and, by doing so, hopefully increase the throughput. |
13709 |
++ */ |
13710 |
++ bfqq->new_bfqq = new_bfqq; |
13711 |
++ atomic_add(process_refs, &new_bfqq->ref); |
13712 |
++ return new_bfqq; |
13713 |
++} |
13714 |
++ |
13715 |
++/* |
13716 |
++ * Attempt to schedule a merge of bfqq with the currently in-service queue or |
13717 |
++ * with a close queue among the scheduled queues. |
13718 |
++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue |
13719 |
++ * structure otherwise. |
13720 |
++ */ |
13721 |
++static struct bfq_queue * |
13722 |
++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
13723 |
++ void *io_struct, bool request) |
13724 |
++{ |
13725 |
++ struct bfq_queue *in_service_bfqq, *new_bfqq; |
13726 |
++ |
13727 |
++ if (bfqq->new_bfqq) |
13728 |
++ return bfqq->new_bfqq; |
13729 |
++ |
13730 |
++ if (!io_struct) |
13731 |
++ return NULL; |
13732 |
++ |
13733 |
++ in_service_bfqq = bfqd->in_service_queue; |
13734 |
++ |
13735 |
++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq || |
13736 |
++ !bfqd->in_service_bic) |
13737 |
++ goto check_scheduled; |
13738 |
++ |
13739 |
++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) |
13740 |
++ goto check_scheduled; |
13741 |
++ |
13742 |
++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) |
13743 |
++ goto check_scheduled; |
13744 |
++ |
13745 |
++ if (in_service_bfqq->entity.parent != bfqq->entity.parent) |
13746 |
++ goto check_scheduled; |
13747 |
++ |
13748 |
++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && |
13749 |
++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { |
13750 |
++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); |
13751 |
++ if (new_bfqq != NULL) |
13752 |
++ return new_bfqq; /* Merge with the in-service queue */ |
13753 |
++ } |
13754 |
++ |
13755 |
++ /* |
13756 |
++ * Check whether there is a cooperator among currently scheduled |
13757 |
++ * queues. The only thing we need is that the bio/request is not |
13758 |
++ * NULL, as we need it to establish whether a cooperator exists. |
13759 |
++ */ |
13760 |
++check_scheduled: |
13761 |
++ new_bfqq = bfq_close_cooperator(bfqd, bfqq, |
13762 |
++ bfq_io_struct_pos(io_struct, request)); |
13763 |
++ if (new_bfqq) |
13764 |
++ return bfq_setup_merge(bfqq, new_bfqq); |
13765 |
++ |
13766 |
++ return NULL; |
13767 |
++} |
13768 |
++ |
13769 |
++static inline void |
13770 |
++bfq_bfqq_save_state(struct bfq_queue *bfqq) |
13771 |
++{ |
13772 |
++ /* |
13773 |
++ * If bfqq->bic == NULL, the queue is already shared or its requests |
13774 |
++ * have already been redirected to a shared queue; both idle window |
13775 |
++ * and weight raising state have already been saved. Do nothing. |
13776 |
++ */ |
13777 |
++ if (bfqq->bic == NULL) |
13778 |
++ return; |
13779 |
++ if (bfqq->bic->raising_time_left) |
13780 |
++ /* |
13781 |
++ * This is the queue of a just-started process, and would |
13782 |
++ * deserve weight raising: we set raising_time_left to the full |
13783 |
++ * weight-raising duration to trigger weight-raising when and |
13784 |
++ * if the queue is split and the first request of the queue |
13785 |
++ * is enqueued. |
13786 |
++ */ |
13787 |
++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd); |
13788 |
++ else if (bfqq->raising_coeff > 1) { |
13789 |
++ unsigned long wrais_duration = |
13790 |
++ jiffies - bfqq->last_rais_start_finish; |
13791 |
++ /* |
13792 |
++ * It may happen that a queue's weight raising period lasts |
13793 |
++ * longer than its raising_cur_max_time, as weight raising is |
13794 |
++ * handled only when a request is enqueued or dispatched (it |
13795 |
++ * does not use any timer). If the weight raising period is |
13796 |
++ * about to end, don't save it. |
13797 |
++ */ |
13798 |
++ if (bfqq->raising_cur_max_time <= wrais_duration) |
13799 |
++ bfqq->bic->raising_time_left = 0; |
13800 |
++ else |
13801 |
++ bfqq->bic->raising_time_left = |
13802 |
++ bfqq->raising_cur_max_time - wrais_duration; |
13803 |
++ /* |
13804 |
++ * The bfq_queue is becoming shared or the requests of the |
13805 |
++ * process owning the queue are being redirected to a shared |
13806 |
++ * queue. Stop the weight raising period of the queue, as in |
13807 |
++ * both cases it should not be owned by an interactive or soft |
13808 |
++ * real-time application. |
13809 |
++ */ |
13810 |
++ bfq_bfqq_end_raising(bfqq); |
13811 |
++ } else |
13812 |
++ bfqq->bic->raising_time_left = 0; |
13813 |
++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); |
13814 |
++} |
13815 |
++ |
13816 |
++static inline void |
13817 |
++bfq_get_bic_reference(struct bfq_queue *bfqq) |
13818 |
++{ |
13819 |
++ /* |
13820 |
++ * If bfqq->bic has a non-NULL value, the bic to which it belongs |
13821 |
++ * is about to begin using a shared bfq_queue. |
13822 |
++ */ |
13823 |
++ if (bfqq->bic) |
13824 |
++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); |
13825 |
++} |
13826 |
++ |
13827 |
++static void |
13828 |
++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
13829 |
++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
13830 |
++{ |
13831 |
++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", |
13832 |
++ (long unsigned)new_bfqq->pid); |
13833 |
++ /* Save weight raising and idle window of the merged queues */ |
13834 |
++ bfq_bfqq_save_state(bfqq); |
13835 |
++ bfq_bfqq_save_state(new_bfqq); |
13836 |
++ /* |
13837 |
++ * Grab a reference to the bic, to prevent it from being destroyed |
13838 |
++ * before being possibly touched by a bfq_split_bfqq(). |
13839 |
++ */ |
13840 |
++ bfq_get_bic_reference(bfqq); |
13841 |
++ bfq_get_bic_reference(new_bfqq); |
13842 |
++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */ |
13843 |
++ bic_set_bfqq(bic, new_bfqq, 1); |
13844 |
++ bfq_mark_bfqq_coop(new_bfqq); |
13845 |
++ /* |
13846 |
++ * new_bfqq now belongs to at least two bics (it is a shared queue): set |
13847 |
++ * new_bfqq->bic to NULL. bfqq either: |
13848 |
++ * - does not belong to any bic any more, and hence bfqq->bic must |
13849 |
++ * be set to NULL, or |
13850 |
++ * - is a queue whose owning bics have already been redirected to a |
13851 |
++ * different queue, hence the queue is destined to not belong to any |
13852 |
++ * bic soon and bfqq->bic is already NULL (therefore the next |
13853 |
++ * assignment causes no harm). |
13854 |
++ */ |
13855 |
++ new_bfqq->bic = NULL; |
13856 |
++ bfqq->bic = NULL; |
13857 |
++ bfq_put_queue(bfqq); |
13858 |
++} |
13859 |
++ |
13860 |
++static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
13861 |
++ struct bio *bio) |
13862 |
++{ |
13863 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
13864 |
++ struct bfq_io_cq *bic; |
13865 |
++ struct bfq_queue *bfqq, *new_bfqq; |
13866 |
++ |
13867 |
++ /* |
13868 |
++ * Disallow merge of a sync bio into an async request. |
13869 |
++ */ |
13870 |
++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
13871 |
++ return 0; |
13872 |
++ |
13873 |
++ /* |
13874 |
++ * Lookup the bfqq that this bio will be queued with. Allow |
13875 |
++ * merge only if rq is queued there. |
13876 |
++ * Queue lock is held here. |
13877 |
++ */ |
13878 |
++ bic = bfq_bic_lookup(bfqd, current->io_context); |
13879 |
++ if (bic == NULL) |
13880 |
++ return 0; |
13881 |
++ |
13882 |
++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
13883 |
++ /* |
13884 |
++ * We take advantage of this function to perform an early merge |
13885 |
++ * of the queues of possible cooperating processes. |
13886 |
++ */ |
13887 |
++ if (bfqq != NULL) { |
13888 |
++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); |
13889 |
++ if (new_bfqq != NULL) { |
13890 |
++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); |
13891 |
++ /* |
13892 |
++ * If we get here, the bio will be queued in the shared queue, |
13893 |
++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and |
13894 |
++ * rq can be merged. |
13895 |
++ */ |
13896 |
++ bfqq = new_bfqq; |
13897 |
++ } |
13898 |
++ } |
13899 |
++ |
13900 |
++ return bfqq == RQ_BFQQ(rq); |
13901 |
++} |
13902 |
++ |
13903 |
++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
13904 |
++ struct bfq_queue *bfqq) |
13905 |
++{ |
13906 |
++ if (bfqq != NULL) { |
13907 |
++ bfq_mark_bfqq_must_alloc(bfqq); |
13908 |
++ bfq_mark_bfqq_budget_new(bfqq); |
13909 |
++ bfq_clear_bfqq_fifo_expire(bfqq); |
13910 |
++ |
13911 |
++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
13912 |
++ |
13913 |
++ bfq_log_bfqq(bfqd, bfqq, |
13914 |
++ "set_in_service_queue, cur-budget = %lu", |
13915 |
++ bfqq->entity.budget); |
13916 |
++ } |
13917 |
++ |
13918 |
++ bfqd->in_service_queue = bfqq; |
13919 |
++} |
13920 |
++ |
13921 |
++/* |
13922 |
++ * Get and set a new queue for service. |
13923 |
++ */ |
13924 |
++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) |
13925 |
++{ |
13926 |
++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); |
13927 |
++ |
13928 |
++ __bfq_set_in_service_queue(bfqd, bfqq); |
13929 |
++ return bfqq; |
13930 |
++} |
13931 |
++ |
13932 |
+ /* |
13933 |
+ * If enough samples have been computed, return the current max budget |
13934 |
+ * stored in bfqd, which is dynamically updated according to the |
13935 |
+@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) |
13936 |
+ return rq; |
13937 |
+ } |
13938 |
+ |
13939 |
+-/* |
13940 |
+- * Must be called with the queue_lock held. |
13941 |
+- */ |
13942 |
+-static int bfqq_process_refs(struct bfq_queue *bfqq) |
13943 |
+-{ |
13944 |
+- int process_refs, io_refs; |
13945 |
+- |
13946 |
+- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
13947 |
+- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
13948 |
+- BUG_ON(process_refs < 0); |
13949 |
+- return process_refs; |
13950 |
+-} |
13951 |
+- |
13952 |
+-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
13953 |
+-{ |
13954 |
+- int process_refs, new_process_refs; |
13955 |
+- struct bfq_queue *__bfqq; |
13956 |
+- |
13957 |
+- /* |
13958 |
+- * If there are no process references on the new_bfqq, then it is |
13959 |
+- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
13960 |
+- * may have dropped their last reference (not just their last process |
13961 |
+- * reference). |
13962 |
+- */ |
13963 |
+- if (!bfqq_process_refs(new_bfqq)) |
13964 |
+- return; |
13965 |
+- |
13966 |
+- /* Avoid a circular list and skip interim queue merges. */ |
13967 |
+- while ((__bfqq = new_bfqq->new_bfqq)) { |
13968 |
+- if (__bfqq == bfqq) |
13969 |
+- return; |
13970 |
+- new_bfqq = __bfqq; |
13971 |
+- } |
13972 |
+- |
13973 |
+- process_refs = bfqq_process_refs(bfqq); |
13974 |
+- new_process_refs = bfqq_process_refs(new_bfqq); |
13975 |
+- /* |
13976 |
+- * If the process for the bfqq has gone away, there is no |
13977 |
+- * sense in merging the queues. |
13978 |
+- */ |
13979 |
+- if (process_refs == 0 || new_process_refs == 0) |
13980 |
+- return; |
13981 |
+- |
13982 |
+- /* |
13983 |
+- * Merge in the direction of the lesser amount of work. |
13984 |
+- */ |
13985 |
+- if (new_process_refs >= process_refs) { |
13986 |
+- bfqq->new_bfqq = new_bfqq; |
13987 |
+- atomic_add(process_refs, &new_bfqq->ref); |
13988 |
+- } else { |
13989 |
+- new_bfqq->new_bfqq = bfqq; |
13990 |
+- atomic_add(new_process_refs, &bfqq->ref); |
13991 |
+- } |
13992 |
+- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
13993 |
+- new_bfqq->pid); |
13994 |
+-} |
13995 |
+- |
13996 |
+ static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
13997 |
+ { |
13998 |
+ struct bfq_entity *entity = &bfqq->entity; |
13999 |
+@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) |
14000 |
+ */ |
14001 |
+ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
14002 |
+ { |
14003 |
+- struct bfq_queue *bfqq, *new_bfqq = NULL; |
14004 |
++ struct bfq_queue *bfqq; |
14005 |
+ struct request *next_rq; |
14006 |
+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
14007 |
+ |
14008 |
+@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
14009 |
+ |
14010 |
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); |
14011 |
+ |
14012 |
+- /* |
14013 |
+- * If another queue has a request waiting within our mean seek |
14014 |
+- * distance, let it run. The expire code will check for close |
14015 |
+- * cooperators and put the close queue at the front of the |
14016 |
+- * service tree. If possible, merge the expiring queue with the |
14017 |
+- * new bfqq. |
14018 |
+- */ |
14019 |
+- new_bfqq = bfq_close_cooperator(bfqd, bfqq); |
14020 |
+- if (new_bfqq != NULL && bfqq->new_bfqq == NULL) |
14021 |
+- bfq_setup_merge(bfqq, new_bfqq); |
14022 |
+- |
14023 |
+ if (bfq_may_expire_for_budg_timeout(bfqq) && |
14024 |
+ !timer_pending(&bfqd->idle_slice_timer) && |
14025 |
+ !bfq_bfqq_must_idle(bfqq)) |
14026 |
+@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
14027 |
+ bfq_clear_bfqq_wait_request(bfqq); |
14028 |
+ del_timer(&bfqd->idle_slice_timer); |
14029 |
+ } |
14030 |
+- if (new_bfqq == NULL) |
14031 |
+- goto keep_queue; |
14032 |
+- else |
14033 |
+- goto expire; |
14034 |
++ goto keep_queue; |
14035 |
+ } |
14036 |
+ } |
14037 |
+ |
14038 |
+ /* |
14039 |
+- * No requests pending. If the in-service queue has no cooperator and |
14040 |
+- * still has requests in flight (possibly waiting for a completion) |
14041 |
+- * or is idling for a new request, then keep it. |
14042 |
++ * No requests pending. If the in-service queue still has requests in |
14043 |
++ * flight (possibly waiting for a completion) or is idling for a new |
14044 |
++ * request, then keep it. |
14045 |
+ */ |
14046 |
+- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || |
14047 |
+- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) { |
14048 |
++ if (timer_pending(&bfqd->idle_slice_timer) || |
14049 |
++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { |
14050 |
+ bfqq = NULL; |
14051 |
+ goto keep_queue; |
14052 |
+- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { |
14053 |
+- /* |
14054 |
+- * Expiring the queue because there is a close cooperator, |
14055 |
+- * cancel timer. |
14056 |
+- */ |
14057 |
+- bfq_clear_bfqq_wait_request(bfqq); |
14058 |
+- del_timer(&bfqd->idle_slice_timer); |
14059 |
+ } |
14060 |
+ |
14061 |
+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; |
14062 |
+ expire: |
14063 |
+ bfq_bfqq_expire(bfqd, bfqq, 0, reason); |
14064 |
+ new_queue: |
14065 |
+- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq); |
14066 |
++ bfqq = bfq_set_in_service_queue(bfqd); |
14067 |
+ bfq_log(bfqd, "select_queue: new queue %d returned", |
14068 |
+ bfqq != NULL ? bfqq->pid : 0); |
14069 |
+ keep_queue: |
14070 |
+@@ -1825,9 +2015,8 @@ keep_queue: |
14071 |
+ static void bfq_update_raising_data(struct bfq_data *bfqd, |
14072 |
+ struct bfq_queue *bfqq) |
14073 |
+ { |
14074 |
++ struct bfq_entity *entity = &bfqq->entity; |
14075 |
+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ |
14076 |
+- struct bfq_entity *entity = &bfqq->entity; |
14077 |
+- |
14078 |
+ bfq_log_bfqq(bfqd, bfqq, |
14079 |
+ "raising period dur %u/%u msec, " |
14080 |
+ "old raising coeff %u, w %d(%d)", |
14081 |
+@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd, |
14082 |
+ "WARN: pending prio change"); |
14083 |
+ /* |
14084 |
+ * If too much time has elapsed from the beginning |
14085 |
+- * of this weight-raising, stop it. |
14086 |
++ * of this weight-raising period, stop it. |
14087 |
+ */ |
14088 |
+ if (time_is_before_jiffies(bfqq->last_rais_start_finish + |
14089 |
+ bfqq->raising_cur_max_time)) { |
14090 |
+@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd, |
14091 |
+ jiffies_to_msecs(bfqq-> |
14092 |
+ raising_cur_max_time)); |
14093 |
+ bfq_bfqq_end_raising(bfqq); |
14094 |
+- __bfq_entity_update_weight_prio( |
14095 |
+- bfq_entity_service_tree(entity), |
14096 |
+- entity); |
14097 |
+ } |
14098 |
+ } |
14099 |
++ /* Update weight both if it must be raised and if it must be lowered */ |
14100 |
++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1)) |
14101 |
++ __bfq_entity_update_weight_prio( |
14102 |
++ bfq_entity_service_tree(entity), |
14103 |
++ entity); |
14104 |
+ } |
14105 |
+ |
14106 |
+ /* |
14107 |
+@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq) |
14108 |
+ struct bfq_io_cq *bic = icq_to_bic(icq); |
14109 |
+ |
14110 |
+ bic->ttime.last_end_request = jiffies; |
14111 |
++ /* |
14112 |
++ * A newly created bic indicates that the process has just |
14113 |
++ * started doing I/O, and is probably mapping into memory its |
14114 |
++ * executable and libraries: it definitely needs weight raising. |
14115 |
++ * There is however the possibility that the process performs, |
14116 |
++ * for a while, I/O close to some other process. EQM intercepts |
14117 |
++ * this behavior and may merge the queue corresponding to the |
14118 |
++ * process with some other queue, BEFORE the weight of the queue |
14119 |
++ * is raised. Merged queues are not weight-raised (they are assumed |
14120 |
++ * to belong to processes that benefit only from high throughput). |
14121 |
++ * If the merge is basically the consequence of an accident, then |
14122 |
++ * the queue will be split soon and will get back its old weight. |
14123 |
++ * It is then important to write down somewhere that this queue |
14124 |
++ * does need weight raising, even if it did not make it to get its |
14125 |
++ * weight raised before being merged. To this purpose, we overload |
14126 |
++ * the field raising_time_left and assign 1 to it, to mark the queue |
14127 |
++ * as needing weight raising. |
14128 |
++ */ |
14129 |
++ bic->raising_time_left = 1; |
14130 |
+ } |
14131 |
+ |
14132 |
+ static void bfq_exit_icq(struct io_cq *icq) |
14133 |
+@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq) |
14134 |
+ } |
14135 |
+ |
14136 |
+ if (bic->bfqq[BLK_RW_SYNC]) { |
14137 |
++ /* |
14138 |
++ * If the bic is using a shared queue, put the reference |
14139 |
++ * taken on the io_context when the bic started using a |
14140 |
++ * shared bfq_queue. |
14141 |
++ */ |
14142 |
++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) |
14143 |
++ put_io_context(icq->ioc); |
14144 |
+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); |
14145 |
+ bic->bfqq[BLK_RW_SYNC] = NULL; |
14146 |
+ } |
14147 |
+@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, |
14148 |
+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) |
14149 |
+ return; |
14150 |
+ |
14151 |
++ /* Idle window just restored, statistics are meaningless. */ |
14152 |
++ if (bfq_bfqq_just_split(bfqq)) |
14153 |
++ return; |
14154 |
++ |
14155 |
+ enable_idle = bfq_bfqq_idle_window(bfqq); |
14156 |
+ |
14157 |
+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || |
14158 |
+@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
14159 |
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
14160 |
+ !BFQQ_SEEKY(bfqq)) |
14161 |
+ bfq_update_idle_window(bfqd, bfqq, bic); |
14162 |
++ bfq_clear_bfqq_just_split(bfqq); |
14163 |
+ |
14164 |
+ bfq_log_bfqq(bfqd, bfqq, |
14165 |
+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", |
14166 |
+@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
14167 |
+ static void bfq_insert_request(struct request_queue *q, struct request *rq) |
14168 |
+ { |
14169 |
+ struct bfq_data *bfqd = q->elevator->elevator_data; |
14170 |
+- struct bfq_queue *bfqq = RQ_BFQQ(rq); |
14171 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; |
14172 |
+ |
14173 |
+ assert_spin_locked(bfqd->queue->queue_lock); |
14174 |
++ |
14175 |
++ /* |
14176 |
++ * An unplug may trigger a requeue of a request from the device |
14177 |
++ * driver: make sure we are in process context while trying to |
14178 |
++ * merge two bfq_queues. |
14179 |
++ */ |
14180 |
++ if (!in_interrupt()) { |
14181 |
++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); |
14182 |
++ if (new_bfqq != NULL) { |
14183 |
++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) |
14184 |
++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); |
14185 |
++ /* |
14186 |
++ * Release the request's reference to the old bfqq |
14187 |
++ * and make sure one is taken to the shared queue. |
14188 |
++ */ |
14189 |
++ new_bfqq->allocated[rq_data_dir(rq)]++; |
14190 |
++ bfqq->allocated[rq_data_dir(rq)]--; |
14191 |
++ atomic_inc(&new_bfqq->ref); |
14192 |
++ bfq_put_queue(bfqq); |
14193 |
++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) |
14194 |
++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), |
14195 |
++ bfqq, new_bfqq); |
14196 |
++ rq->elv.priv[1] = new_bfqq; |
14197 |
++ bfqq = new_bfqq; |
14198 |
++ } |
14199 |
++ } |
14200 |
++ |
14201 |
+ bfq_init_prio_data(bfqq, RQ_BIC(rq)); |
14202 |
+ |
14203 |
+ bfq_add_rq_rb(rq); |
14204 |
+ |
14205 |
++ /* |
14206 |
++ * Here a newly-created bfq_queue has already started a weight-raising |
14207 |
++ * period: clear raising_time_left to prevent bfq_bfqq_save_state() |
14208 |
++ * from assigning it a full weight-raising period. See the detailed |
14209 |
++ * comments about this field in bfq_init_icq(). |
14210 |
++ */ |
14211 |
++ if (bfqq->bic != NULL) |
14212 |
++ bfqq->bic->raising_time_left = 0; |
14213 |
+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); |
14214 |
+ list_add_tail(&rq->queuelist, &bfqq->fifo); |
14215 |
+ |
14216 |
+@@ -2659,18 +2916,6 @@ static void bfq_put_request(struct request *rq) |
14217 |
+ } |
14218 |
+ } |
14219 |
+ |
14220 |
+-static struct bfq_queue * |
14221 |
+-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
14222 |
+- struct bfq_queue *bfqq) |
14223 |
+-{ |
14224 |
+- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", |
14225 |
+- (long unsigned)bfqq->new_bfqq->pid); |
14226 |
+- bic_set_bfqq(bic, bfqq->new_bfqq, 1); |
14227 |
+- bfq_mark_bfqq_coop(bfqq->new_bfqq); |
14228 |
+- bfq_put_queue(bfqq); |
14229 |
+- return bic_to_bfqq(bic, 1); |
14230 |
+-} |
14231 |
+- |
14232 |
+ /* |
14233 |
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this |
14234 |
+ * was the last process referring to said bfqq. |
14235 |
+@@ -2679,6 +2924,9 @@ static struct bfq_queue * |
14236 |
+ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) |
14237 |
+ { |
14238 |
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); |
14239 |
++ |
14240 |
++ put_io_context(bic->icq.ioc); |
14241 |
++ |
14242 |
+ if (bfqq_process_refs(bfqq) == 1) { |
14243 |
+ bfqq->pid = current->pid; |
14244 |
+ bfq_clear_bfqq_coop(bfqq); |
14245 |
+@@ -2707,6 +2955,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, |
14246 |
+ struct bfq_queue *bfqq; |
14247 |
+ struct bfq_group *bfqg; |
14248 |
+ unsigned long flags; |
14249 |
++ bool split = false; |
14250 |
+ |
14251 |
+ might_sleep_if(gfp_mask & __GFP_WAIT); |
14252 |
+ |
14253 |
+@@ -2725,24 +2974,14 @@ new_queue: |
14254 |
+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); |
14255 |
+ bic_set_bfqq(bic, bfqq, is_sync); |
14256 |
+ } else { |
14257 |
+- /* |
14258 |
+- * If the queue was seeky for too long, break it apart. |
14259 |
+- */ |
14260 |
++ /* If the queue was seeky for too long, break it apart. */ |
14261 |
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { |
14262 |
+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); |
14263 |
+ bfqq = bfq_split_bfqq(bic, bfqq); |
14264 |
++ split = true; |
14265 |
+ if (!bfqq) |
14266 |
+ goto new_queue; |
14267 |
+ } |
14268 |
+- |
14269 |
+- /* |
14270 |
+- * Check to see if this queue is scheduled to merge with |
14271 |
+- * another closely cooperating queue. The merging of queues |
14272 |
+- * happens here as it must be done in process context. |
14273 |
+- * The reference on new_bfqq was taken in merge_bfqqs. |
14274 |
+- */ |
14275 |
+- if (bfqq->new_bfqq != NULL) |
14276 |
+- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq); |
14277 |
+ } |
14278 |
+ |
14279 |
+ bfqq->allocated[rw]++; |
14280 |
+@@ -2753,6 +2992,26 @@ new_queue: |
14281 |
+ rq->elv.priv[0] = bic; |
14282 |
+ rq->elv.priv[1] = bfqq; |
14283 |
+ |
14284 |
++ /* |
14285 |
++ * If a bfq_queue has only one process reference, it is owned |
14286 |
++ * by only one bfq_io_cq: we can set the bic field of the |
14287 |
++ * bfq_queue to the address of that structure. Also, if the |
14288 |
++ * queue has just been split, mark a flag so that the |
14289 |
++ * information is available to the other scheduler hooks. |
14290 |
++ */ |
14291 |
++ if (bfqq_process_refs(bfqq) == 1) { |
14292 |
++ bfqq->bic = bic; |
14293 |
++ if (split) { |
14294 |
++ bfq_mark_bfqq_just_split(bfqq); |
14295 |
++ /* |
14296 |
++ * If the queue has just been split from a shared queue, |
14297 |
++ * restore the idle window and the possible weight |
14298 |
++ * raising period. |
14299 |
++ */ |
14300 |
++ bfq_bfqq_resume_state(bfqq, bic); |
14301 |
++ } |
14302 |
++ } |
14303 |
++ |
14304 |
+ spin_unlock_irqrestore(q->queue_lock, flags); |
14305 |
+ |
14306 |
+ return 0; |
14307 |
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c |
14308 |
+index 999b475..e54ea33 100644 |
14309 |
+--- a/block/bfq-sched.c |
14310 |
++++ b/block/bfq-sched.c |
14311 |
+@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) |
14312 |
+ return bfqq; |
14313 |
+ } |
14314 |
+ |
14315 |
+-/* |
14316 |
+- * Forced extraction of the given queue. |
14317 |
+- */ |
14318 |
+-static void bfq_get_next_queue_forced(struct bfq_data *bfqd, |
14319 |
+- struct bfq_queue *bfqq) |
14320 |
+-{ |
14321 |
+- struct bfq_entity *entity; |
14322 |
+- struct bfq_sched_data *sd; |
14323 |
+- |
14324 |
+- BUG_ON(bfqd->in_service_queue != NULL); |
14325 |
+- |
14326 |
+- entity = &bfqq->entity; |
14327 |
+- /* |
14328 |
+- * Bubble up extraction/update from the leaf to the root. |
14329 |
+- */ |
14330 |
+- for_each_entity(entity) { |
14331 |
+- sd = entity->sched_data; |
14332 |
+- bfq_update_budget(entity); |
14333 |
+- bfq_update_vtime(bfq_entity_service_tree(entity)); |
14334 |
+- bfq_active_extract(bfq_entity_service_tree(entity), entity); |
14335 |
+- sd->active_entity = entity; |
14336 |
+- sd->next_active = NULL; |
14337 |
+- entity->service = 0; |
14338 |
+- } |
14339 |
+- |
14340 |
+- return; |
14341 |
+-} |
14342 |
+- |
14343 |
+ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) |
14344 |
+ { |
14345 |
+ if (bfqd->in_service_bic != NULL) { |
14346 |
+diff --git a/block/bfq.h b/block/bfq.h |
14347 |
+index f9b5881..0bfad40 100644 |
14348 |
+--- a/block/bfq.h |
14349 |
++++ b/block/bfq.h |
14350 |
+@@ -192,6 +192,8 @@ struct bfq_group; |
14351 |
+ * idle to backlogged |
14352 |
+ * @service_from_backlogged: cumulative service received from the @bfq_queue |
14353 |
+ * since the last transition from idle to backlogged |
14354 |
++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the |
14355 |
++ * queue is shared |
14356 |
+ * |
14357 |
+ * A bfq_queue is a leaf request queue; it can be associated to an io_context |
14358 |
+ * or more (if it is an async one). @cgroup holds a reference to the |
14359 |
+@@ -235,6 +237,7 @@ struct bfq_queue { |
14360 |
+ sector_t last_request_pos; |
14361 |
+ |
14362 |
+ pid_t pid; |
14363 |
++ struct bfq_io_cq *bic; |
14364 |
+ |
14365 |
+ /* weight-raising fields */ |
14366 |
+ unsigned long raising_cur_max_time; |
14367 |
+@@ -264,12 +267,23 @@ struct bfq_ttime { |
14368 |
+ * @icq: associated io_cq structure |
14369 |
+ * @bfqq: array of two process queues, the sync and the async |
14370 |
+ * @ttime: associated @bfq_ttime struct |
14371 |
++ * @raising_time_left: snapshot of the time left before weight raising ends |
14372 |
++ * for the sync queue associated to this process; this |
14373 |
++ * snapshot is taken to remember this value while the weight |
14374 |
++ * raising is suspended because the queue is merged with a |
14375 |
++ * shared queue, and is used to set @raising_cur_max_time |
14376 |
++ * when the queue is split from the shared queue and its |
14377 |
++ * weight is raised again |
14378 |
++ * @saved_idle_window: same purpose as the previous field for the idle window |
14379 |
+ */ |
14380 |
+ struct bfq_io_cq { |
14381 |
+ struct io_cq icq; /* must be the first member */ |
14382 |
+ struct bfq_queue *bfqq[2]; |
14383 |
+ struct bfq_ttime ttime; |
14384 |
+ int ioprio; |
14385 |
++ |
14386 |
++ unsigned int raising_time_left; |
14387 |
++ unsigned int saved_idle_window; |
14388 |
+ }; |
14389 |
+ |
14390 |
+ /** |
14391 |
+@@ -411,6 +425,7 @@ enum bfqq_state_flags { |
14392 |
+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ |
14393 |
+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ |
14394 |
+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ |
14395 |
++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ |
14396 |
+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */ |
14397 |
+ }; |
14398 |
+ |
14399 |
+@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync); |
14400 |
+ BFQ_BFQQ_FNS(budget_new); |
14401 |
+ BFQ_BFQQ_FNS(coop); |
14402 |
+ BFQ_BFQQ_FNS(split_coop); |
14403 |
++BFQ_BFQQ_FNS(just_split); |
14404 |
+ BFQ_BFQQ_FNS(softrt_update); |
14405 |
+ #undef BFQ_BFQQ_FNS |
14406 |
+ |
14407 |
+-- |
14408 |
+1.8.5.2 |
14409 |
+ |
14410 |
|
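Note on the EQM hunks above: a merged (shared) bfq_queue is not
weight-raised, so the patch has the per-process bfq_io_cq remember the
remaining weight-raising time (raising_time_left, with 1 used to mark a
queue that still needs raising) and the idle-window state
(saved_idle_window), and hands both back through bfq_bfqq_resume_state()
when the queue is split again. The stand-alone C sketch below only
illustrates that save/restore idea; the struct and function names in it
are hypothetical and are not taken from the patch.

    /* Illustrative sketch, not part of the patch. */
    struct io_ctx {
            unsigned int raising_time_left;  /* 0 = nothing saved, >0 = resume */
            unsigned int saved_idle_window;
    };

    /* Queue merged into a shared one: stash its per-process state. */
    static void save_state(struct io_ctx *ctx, unsigned int time_left,
                           unsigned int idle_window)
    {
            /* Keep 1 as a marker even if no raising time was left. */
            ctx->raising_time_left = time_left ? time_left : 1;
            ctx->saved_idle_window = idle_window;
    }

    /* Queue split back out of the shared one: restore what was saved. */
    static void resume_state(struct io_ctx *ctx, unsigned int *time_left,
                             unsigned int *idle_window)
    {
            *time_left = ctx->raising_time_left;
            *idle_window = ctx->saved_idle_window;
            ctx->raising_time_left = 0;
    }
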
14411 |
Modified: genpatches-2.6/trunk/3.14/0000_README |
14412 |
=================================================================== |
14413 |
--- genpatches-2.6/trunk/3.14/0000_README 2014-02-07 14:46:59 UTC (rev 2665) |
14414 |
+++ genpatches-2.6/trunk/3.14/0000_README 2014-02-07 15:42:35 UTC (rev 2666) |
14415 |
@@ -83,17 +83,17 @@ |
14416 |
From: Tom Wijsman <TomWij@g.o> |
14417 |
Desc: Add Gentoo Linux support config settings and defaults. |
14418 |
|
14419 |
-Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7-3.13.patch |
14420 |
+Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r1-3.13.patch |
14421 |
From: http://algo.ing.unimo.it/people/paolo/disk_sched/ |
14422 |
-Desc: BFQ v7 patch 1 for 3.13: Build, cgroups and kconfig bits |
14423 |
+Desc: BFQ v7r1 patch 1 for 3.13: Build, cgroups and kconfig bits |
14424 |
|
14425 |
-Patch: 5000_BFQ-2-block-introduce-the-v7-I-O-sched-for-3.13.patch1 |
14426 |
+Patch: 5000_BFQ-2-block-introduce-the-v7r1-I-O-sched-for-3.13.patch1 |
14427 |
From: http://algo.ing.unimo.it/people/paolo/disk_sched/ |
14428 |
-Desc: BFQ v7 patch 2 for 3.13: BFQ Scheduler |
14429 |
+Desc: BFQ v7r1 patch 2 for 3.13: BFQ Scheduler |
14430 |
|
14431 |
-Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7-for-3.13.0.patch |
14432 |
+Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r1-for-3.13.0.patch |
14433 |
From: http://algo.ing.unimo.it/people/paolo/disk_sched/ |
14434 |
-Desc: BFQ v7 patch 3 for 3.13: Early Queue Merge (EQM) |
14435 |
+Desc: BFQ v7r1 patch 3 for 3.13: Early Queue Merge (EQM) |
14436 |
|
14437 |
Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch |
14438 |
From: https://github.com/graysky2/kernel_gcc_patch/ |
14439 |
|
14440 |
Added: genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch |
14441 |
=================================================================== |
14442 |
--- genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch (rev 0) |
14443 |
+++ genpatches-2.6/trunk/3.14/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-BFQ-v7r1-3.13.patch 2014-02-07 15:42:35 UTC (rev 2666) |
14444 |
@@ -0,0 +1,104 @@ |
14445 |
+From ae1b820a5286601aa9d5426459f8f3de658342b4 Mon Sep 17 00:00:00 2001 |
14446 |
+From: Paolo Valente <paolo.valente@×××××××.it> |
14447 |
+Date: Tue, 3 Sep 2013 16:50:42 +0200 |
14448 |
+Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r1-3.13 |
14449 |
+ |
14450 |
+Update Kconfig.iosched and do the related Makefile changes to include |
14451 |
+kernel configuration options for BFQ. Also add the bfqio controller |
14452 |
+to the cgroups subsystem. |
14453 |
+ |
14454 |
+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
14455 |
+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
14456 |
+--- |
14457 |
+ block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++ |
14458 |
+ block/Makefile | 1 + |
14459 |
+ include/linux/cgroup_subsys.h | 4 ++++ |
14460 |
+ 3 files changed, 37 insertions(+) |
14461 |
+ |
14462 |
+diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched |
14463 |
+index 421bef9..8f552ba 100644 |
14464 |
+--- a/block/Kconfig.iosched |
14465 |
++++ b/block/Kconfig.iosched |
14466 |
+@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED |
14467 |
+ ---help--- |
14468 |
+ Enable group IO scheduling in CFQ. |
14469 |
+ |
14470 |
++config IOSCHED_BFQ |
14471 |
++ tristate "BFQ I/O scheduler" |
14472 |
++ default n |
14473 |
++ ---help--- |
14474 |
++ The BFQ I/O scheduler tries to distribute bandwidth among |
14475 |
++ all processes according to their weights. |
14476 |
++ It aims at distributing the bandwidth as desired, independently of |
14477 |
++ the disk parameters and with any workload. It also tries to |
14478 |
++ guarantee low latency to interactive and soft real-time |
14479 |
++ applications. If compiled built-in (saying Y here), BFQ can |
14480 |
++ be configured to support hierarchical scheduling. |
14481 |
++ |
14482 |
++config CGROUP_BFQIO |
14483 |
++ bool "BFQ hierarchical scheduling support" |
14484 |
++ depends on CGROUPS && IOSCHED_BFQ=y |
14485 |
++ default n |
14486 |
++ ---help--- |
14487 |
++ Enable hierarchical scheduling in BFQ, using the cgroups |
14488 |
++ filesystem interface. The name of the subsystem will be |
14489 |
++ bfqio. |
14490 |
++ |
14491 |
+ choice |
14492 |
+ prompt "Default I/O scheduler" |
14493 |
+ default DEFAULT_CFQ |
14494 |
+@@ -52,6 +73,16 @@ choice |
14495 |
+ config DEFAULT_CFQ |
14496 |
+ bool "CFQ" if IOSCHED_CFQ=y |
14497 |
+ |
14498 |
++ config DEFAULT_BFQ |
14499 |
++ bool "BFQ" if IOSCHED_BFQ=y |
14500 |
++ help |
14501 |
++ Selects BFQ as the default I/O scheduler which will be |
14502 |
++ used by default for all block devices. |
14503 |
++ The BFQ I/O scheduler aims at distributing the bandwidth |
14504 |
++ as desired, independently of the disk parameters and with |
14505 |
++ any workload. It also tries to guarantee low latency to |
14506 |
++ interactive and soft real-time applications. |
14507 |
++ |
14508 |
+ config DEFAULT_NOOP |
14509 |
+ bool "No-op" |
14510 |
+ |
14511 |
+@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED |
14512 |
+ string |
14513 |
+ default "deadline" if DEFAULT_DEADLINE |
14514 |
+ default "cfq" if DEFAULT_CFQ |
14515 |
++ default "bfq" if DEFAULT_BFQ |
14516 |
+ default "noop" if DEFAULT_NOOP |
14517 |
+ |
14518 |
+ endmenu |
14519 |
+diff --git a/block/Makefile b/block/Makefile |
14520 |
+index 20645e8..cbd83fb 100644 |
14521 |
+--- a/block/Makefile |
14522 |
++++ b/block/Makefile |
14523 |
+@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o |
14524 |
+ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o |
14525 |
+ obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o |
14526 |
+ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o |
14527 |
++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o |
14528 |
+ |
14529 |
+ obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o |
14530 |
+ obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o |
14531 |
+diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h |
14532 |
+index b613ffd..43c5dc9 100644 |
14533 |
+--- a/include/linux/cgroup_subsys.h |
14534 |
++++ b/include/linux/cgroup_subsys.h |
14535 |
+@@ -39,6 +39,10 @@ SUBSYS(net_cls) |
14536 |
+ SUBSYS(blkio) |
14537 |
+ #endif |
14538 |
+ |
14539 |
++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO) |
14540 |
++SUBSYS(bfqio) |
14541 |
++#endif |
14542 |
++ |
14543 |
+ #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF) |
14544 |
+ SUBSYS(perf) |
14545 |
+ #endif |
14546 |
+-- |
14547 |
+1.8.5.2 |
14548 |
+ |
14549 |
|
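The commit message of the scheduler patch added below notes that, when the
cgroups interface is not used, per-process weights are derived from I/O
priorities as weight = IOPRIO_BE_NR - ioprio. A quick stand-alone
illustration of that relation (assuming the usual IOPRIO_BE_NR value of 8;
this snippet is not part of the patch):

    #include <stdio.h>

    #define IOPRIO_BE_NR 8  /* number of best-effort I/O priority levels */

    int main(void)
    {
            /* Lower ioprio value = higher priority = larger BFQ weight. */
            for (int ioprio = 0; ioprio < IOPRIO_BE_NR; ioprio++)
                    printf("ioprio %d -> weight %d\n",
                           ioprio, IOPRIO_BE_NR - ioprio);
            return 0;
    }

The cgroup bits in the patch use a bfq_ioprio_to_weight() helper to
initialize a group's weight from its ioprio when no explicit weight has
been set, presumably applying the same relation.
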
14550 |
Added: genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 |
14551 |
=================================================================== |
14552 |
--- genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 (rev 0) |
14553 |
+++ genpatches-2.6/trunk/3.14/5000_BFQ-2-block-introduce-the-BFQ-v7r1-I-O-sched-for-3.13.patch1 2014-02-07 15:42:35 UTC (rev 2666) |
14554 |
@@ -0,0 +1,6040 @@ |
14555 |
+From be5107dc591f7ae692ca7cceecbba72e4c174c37 Mon Sep 17 00:00:00 2001 |
14556 |
+From: Paolo Valente <paolo.valente@×××××××.it> |
14557 |
+Date: Thu, 9 May 2013 19:10:02 +0200 |
14558 |
+Subject: [PATCH 2/3] block: introduce the BFQ-v7r1 I/O sched for 3.13 |
14559 |
+ |
14560 |
+Add the BFQ-v7r1 I/O scheduler to 3.13. |
14561 |
+The general structure is borrowed from CFQ, as much of the code for |
14562 |
+handling I/O contexts. Over time, several useful features have been |
14563 |
+ported from CFQ as well (details in the changelog in README.BFQ). A |
14564 |
+(bfq_)queue is associated to each task doing I/O on a device, and each |
14565 |
+time a scheduling decision has to be made a queue is selected and served |
14566 |
+until it expires. |
14567 |
+ |
14568 |
+ - Slices are given in the service domain: tasks are assigned |
14569 |
+ budgets, measured in number of sectors. Once it gets the disk, a task |
14570 |
+ must however consume its assigned budget within a configurable |
14571 |
+ maximum time (by default, the maximum possible value of the |
14572 |
+ budgets is automatically computed to comply with this timeout). |
14573 |
+ This allows the desired latency vs "throughput boosting" tradeoff |
14574 |
+ to be set. |
14575 |
+ |
14576 |
+ - Budgets are scheduled according to a variant of WF2Q+, implemented |
14577 |
+ using an augmented rb-tree to take eligibility into account while |
14578 |
+ preserving an O(log N) overall complexity. |
14579 |
+ |
14580 |
+ - A low-latency tunable is provided; if enabled, both interactive |
14581 |
+ and soft real-time applications are guaranteed a very low latency. |
14582 |
+ |
14583 |
+ - Latency guarantees are preserved also in the presence of NCQ. |
14584 |
+ |
14585 |
+ - Also with flash-based devices, a high throughput is achieved |
14586 |
+ while still preserving latency guarantees. |
14587 |
+ |
14588 |
+ - BFQ features Early Queue Merge (EQM), a sort of fusion of the |
14589 |
+ cooperating-queue-merging and the preemption mechanisms present |
14590 |
+ in CFQ. EQM is in fact a unified mechanism that tries to get a |
14591 |
+ sequential read pattern, and hence a high throughput, with any |
14592 |
+ set of processes performing interleaved I/O over a contiguous |
14593 |
+ sequence of sectors. |
14594 |
+ |
14595 |
+ - BFQ supports full hierarchical scheduling, exporting a cgroups |
14596 |
+ interface. Since each node has a full scheduler, each group can |
14597 |
+ be assigned its own weight. |
14598 |
+ |
14599 |
+ - If the cgroups interface is not used, only I/O priorities can be |
14600 |
+ assigned to processes, with ioprio values mapped to weights |
14601 |
+ with the relation weight = IOPRIO_BE_NR - ioprio. |
14602 |
+ |
14603 |
+ - ioprio classes are served in strict priority order, i.e., lower |
14604 |
+ priority queues are not served as long as there are higher |
14605 |
+ priority queues. Among queues in the same class the bandwidth is |
14606 |
+ distributed in proportion to the weight of each queue. A very |
14607 |
+ thin extra bandwidth is however guaranteed to the Idle class, to |
14608 |
+ prevent it from starving. |
14609 |
+ |
14610 |
+Signed-off-by: Paolo Valente <paolo.valente@×××××××.it> |
14611 |
+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
14612 |
+--- |
14613 |
+ block/bfq-cgroup.c | 911 ++++++++++++++ |
14614 |
+ block/bfq-ioc.c | 36 + |
14615 |
+ block/bfq-iosched.c | 3298 +++++++++++++++++++++++++++++++++++++++++++++++++++ |
14616 |
+ block/bfq-sched.c | 1078 +++++++++++++++++ |
14617 |
+ block/bfq.h | 614 ++++++++++ |
14618 |
+ 5 files changed, 5937 insertions(+) |
14619 |
+ create mode 100644 block/bfq-cgroup.c |
14620 |
+ create mode 100644 block/bfq-ioc.c |
14621 |
+ create mode 100644 block/bfq-iosched.c |
14622 |
+ create mode 100644 block/bfq-sched.c |
14623 |
+ create mode 100644 block/bfq.h |
14624 |
+ |
14625 |
+diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c |
14626 |
+new file mode 100644 |
14627 |
+index 0000000..79a288a |
14628 |
+--- /dev/null |
14629 |
++++ b/block/bfq-cgroup.c |
14630 |
+@@ -0,0 +1,911 @@ |
14631 |
++/* |
14632 |
++ * BFQ: CGROUPS support. |
14633 |
++ * |
14634 |
++ * Based on ideas and code from CFQ: |
14635 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
14636 |
++ * |
14637 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
14638 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
14639 |
++ * |
14640 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
14641 |
++ * |
14642 |
++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. |
14643 |
++ */ |
14644 |
++ |
14645 |
++#ifdef CONFIG_CGROUP_BFQIO |
14646 |
++ |
14647 |
++static DEFINE_MUTEX(bfqio_mutex); |
14648 |
++ |
14649 |
++static bool bfqio_is_removed(struct bfqio_cgroup *bgrp) |
14650 |
++{ |
14651 |
++ return bgrp ? !bgrp->online : false; |
14652 |
++} |
14653 |
++ |
14654 |
++static struct bfqio_cgroup bfqio_root_cgroup = { |
14655 |
++ .weight = BFQ_DEFAULT_GRP_WEIGHT, |
14656 |
++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO, |
14657 |
++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS, |
14658 |
++}; |
14659 |
++ |
14660 |
++static inline void bfq_init_entity(struct bfq_entity *entity, |
14661 |
++ struct bfq_group *bfqg) |
14662 |
++{ |
14663 |
++ entity->weight = entity->new_weight; |
14664 |
++ entity->orig_weight = entity->new_weight; |
14665 |
++ entity->ioprio = entity->new_ioprio; |
14666 |
++ entity->ioprio_class = entity->new_ioprio_class; |
14667 |
++ entity->parent = bfqg->my_entity; |
14668 |
++ entity->sched_data = &bfqg->sched_data; |
14669 |
++} |
14670 |
++ |
14671 |
++static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css) |
14672 |
++{ |
14673 |
++ return css ? container_of(css, struct bfqio_cgroup, css) : NULL; |
14674 |
++} |
14675 |
++ |
14676 |
++/* |
14677 |
++ * Search the bfq_group for bfqd in the hash table (by now only a list) |
14678 |
++ * of bgrp. Must be called under rcu_read_lock(). |
14679 |
++ */ |
14680 |
++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, |
14681 |
++ struct bfq_data *bfqd) |
14682 |
++{ |
14683 |
++ struct bfq_group *bfqg; |
14684 |
++ void *key; |
14685 |
++ |
14686 |
++ hlist_for_each_entry_rcu(bfqg, &bgrp->group_data, group_node) { |
14687 |
++ key = rcu_dereference(bfqg->bfqd); |
14688 |
++ if (key == bfqd) |
14689 |
++ return bfqg; |
14690 |
++ } |
14691 |
++ |
14692 |
++ return NULL; |
14693 |
++} |
14694 |
++ |
14695 |
++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, |
14696 |
++ struct bfq_group *bfqg) |
14697 |
++{ |
14698 |
++ struct bfq_entity *entity = &bfqg->entity; |
14699 |
++ |
14700 |
++ /* |
14701 |
++ * If the weight of the entity has never been set via the sysfs |
14702 |
++ * interface, then bgrp->weight == 0. In this case we initialize |
14703 |
++ * the weight from the current ioprio value. Otherwise, the group |
14704 |
++ * weight, if set, has priority over the ioprio value. |
14705 |
++ */ |
14706 |
++ if (bgrp->weight == 0) { |
14707 |
++ entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); |
14708 |
++ entity->new_ioprio = bgrp->ioprio; |
14709 |
++ } else { |
14710 |
++ entity->new_weight = bgrp->weight; |
14711 |
++ entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); |
14712 |
++ } |
14713 |
++ entity->orig_weight = entity->weight = entity->new_weight; |
14714 |
++ entity->ioprio = entity->new_ioprio; |
14715 |
++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; |
14716 |
++ entity->my_sched_data = &bfqg->sched_data; |
14717 |
++} |
14718 |
++ |
14719 |
++static inline void bfq_group_set_parent(struct bfq_group *bfqg, |
14720 |
++ struct bfq_group *parent) |
14721 |
++{ |
14722 |
++ struct bfq_entity *entity; |
14723 |
++ |
14724 |
++ BUG_ON(parent == NULL); |
14725 |
++ BUG_ON(bfqg == NULL); |
14726 |
++ |
14727 |
++ entity = &bfqg->entity; |
14728 |
++ entity->parent = parent->my_entity; |
14729 |
++ entity->sched_data = &parent->sched_data; |
14730 |
++} |
14731 |
++ |
14732 |
++/** |
14733 |
++ * bfq_group_chain_alloc - allocate a chain of groups. |
14734 |
++ * @bfqd: queue descriptor. |
14735 |
++ * @css: the leaf cgroup_subsys_state this chain starts from. |
14736 |
++ * |
14737 |
++ * Allocate a chain of groups starting from the one belonging to |
14738 |
++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain |
14739 |
++ * to the root has already an allocated group on @bfqd. |
14740 |
++ */ |
14741 |
++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, |
14742 |
++ struct cgroup_subsys_state *css) |
14743 |
++{ |
14744 |
++ struct bfqio_cgroup *bgrp; |
14745 |
++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; |
14746 |
++ |
14747 |
++ for (; css != NULL; css = css->parent) { |
14748 |
++ bgrp = css_to_bfqio(css); |
14749 |
++ |
14750 |
++ bfqg = bfqio_lookup_group(bgrp, bfqd); |
14751 |
++ if (bfqg != NULL) { |
14752 |
++ /* |
14753 |
++ * All the cgroups in the path from there to the |
14754 |
++ * root must have a bfq_group for bfqd, so we don't |
14755 |
++ * need any more allocations. |
14756 |
++ */ |
14757 |
++ break; |
14758 |
++ } |
14759 |
++ |
14760 |
++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); |
14761 |
++ if (bfqg == NULL) |
14762 |
++ goto cleanup; |
14763 |
++ |
14764 |
++ bfq_group_init_entity(bgrp, bfqg); |
14765 |
++ bfqg->my_entity = &bfqg->entity; |
14766 |
++ |
14767 |
++ if (leaf == NULL) { |
14768 |
++ leaf = bfqg; |
14769 |
++ prev = leaf; |
14770 |
++ } else { |
14771 |
++ bfq_group_set_parent(prev, bfqg); |
14772 |
++ /* |
14773 |
++ * Build a list of allocated nodes using the bfqd |
14774 |
++ * field, that is still unused and will be initialized |
14775 |
++ * only after the node will be connected. |
14776 |
++ */ |
14777 |
++ prev->bfqd = bfqg; |
14778 |
++ prev = bfqg; |
14779 |
++ } |
14780 |
++ } |
14781 |
++ |
14782 |
++ return leaf; |
14783 |
++ |
14784 |
++cleanup: |
14785 |
++ while (leaf != NULL) { |
14786 |
++ prev = leaf; |
14787 |
++ leaf = leaf->bfqd; |
14788 |
++ kfree(prev); |
14789 |
++ } |
14790 |
++ |
14791 |
++ return NULL; |
14792 |
++} |
14793 |
++ |
14794 |
++/** |
14795 |
++ * bfq_group_chain_link - link an allocated group chain to a cgroup hierarchy. |
14796 |
++ * @bfqd: the queue descriptor. |
14797 |
++ * @css: the leaf cgroup_subsys_state to start from. |
14798 |
++ * @leaf: the leaf group (to be associated to @cgroup). |
14799 |
++ * |
14800 |
++ * Try to link a chain of groups to a cgroup hierarchy, connecting the |
14801 |
++ * nodes bottom-up, so we can be sure that when we find a cgroup in the |
14802 |
++ * hierarchy that already has a group associated to @bfqd all the nodes |
14803 |
++ * in the path to the root cgroup have one too. |
14804 |
++ * |
14805 |
++ * On locking: the queue lock protects the hierarchy (there is a hierarchy |
14806 |
++ * per device) while the bfqio_cgroup lock protects the list of groups |
14807 |
++ * belonging to the same cgroup. |
14808 |
++ */ |
14809 |
++static void bfq_group_chain_link(struct bfq_data *bfqd, |
14810 |
++ struct cgroup_subsys_state *css, |
14811 |
++ struct bfq_group *leaf) |
14812 |
++{ |
14813 |
++ struct bfqio_cgroup *bgrp; |
14814 |
++ struct bfq_group *bfqg, *next, *prev = NULL; |
14815 |
++ unsigned long flags; |
14816 |
++ |
14817 |
++ assert_spin_locked(bfqd->queue->queue_lock); |
14818 |
++ |
14819 |
++ for (; css != NULL && leaf != NULL; css = css->parent) { |
14820 |
++ bgrp = css_to_bfqio(css); |
14821 |
++ next = leaf->bfqd; |
14822 |
++ |
14823 |
++ bfqg = bfqio_lookup_group(bgrp, bfqd); |
14824 |
++ BUG_ON(bfqg != NULL); |
14825 |
++ |
14826 |
++ spin_lock_irqsave(&bgrp->lock, flags); |
14827 |
++ |
14828 |
++ rcu_assign_pointer(leaf->bfqd, bfqd); |
14829 |
++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); |
14830 |
++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); |
14831 |
++ |
14832 |
++ spin_unlock_irqrestore(&bgrp->lock, flags); |
14833 |
++ |
14834 |
++ prev = leaf; |
14835 |
++ leaf = next; |
14836 |
++ } |
14837 |
++ |
14838 |
++ BUG_ON(css == NULL && leaf != NULL); |
14839 |
++ if (css != NULL && prev != NULL) { |
14840 |
++ bgrp = css_to_bfqio(css); |
14841 |
++ bfqg = bfqio_lookup_group(bgrp, bfqd); |
14842 |
++ bfq_group_set_parent(prev, bfqg); |
14843 |
++ } |
14844 |
++} |
14845 |
++ |
14846 |
++/** |
14847 |
++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. |
14848 |
++ * @bfqd: queue descriptor. |
14849 |
++ * @cgroup: cgroup being searched for. |
14850 |
++ * |
14851 |
++ * Return a group associated to @bfqd in @cgroup, allocating one if |
14852 |
++ * necessary. When a group is returned all the cgroups in the path |
14853 |
++ * to the root have a group associated to @bfqd. |
14854 |
++ * |
14855 |
++ * If the allocation fails, return the root group: this breaks guarantees |
14856 |
++ * but is a safe fallback. If this loss becomes a problem it can be |
14857 |
++ * mitigated using the equivalent weight (given by the product of the |
14858 |
++ * weights of the groups in the path from @group to the root) in the |
14859 |
++ * root scheduler. |
14860 |
++ * |
14861 |
++ * We allocate all the missing nodes in the path from the leaf cgroup |
14862 |
++ * to the root and we connect the nodes only after all the allocations |
14863 |
++ * have been successful. |
14864 |
++ */ |
14865 |
++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, |
14866 |
++ struct cgroup_subsys_state *css) |
14867 |
++{ |
14868 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
14869 |
++ struct bfq_group *bfqg; |
14870 |
++ |
14871 |
++ bfqg = bfqio_lookup_group(bgrp, bfqd); |
14872 |
++ if (bfqg != NULL) |
14873 |
++ return bfqg; |
14874 |
++ |
14875 |
++ bfqg = bfq_group_chain_alloc(bfqd, css); |
14876 |
++ if (bfqg != NULL) |
14877 |
++ bfq_group_chain_link(bfqd, css, bfqg); |
14878 |
++ else |
14879 |
++ bfqg = bfqd->root_group; |
14880 |
++ |
14881 |
++ return bfqg; |
14882 |
++} |
14883 |
++ |
14884 |
++/** |
14885 |
++ * bfq_bfqq_move - migrate @bfqq to @bfqg. |
14886 |
++ * @bfqd: queue descriptor. |
14887 |
++ * @bfqq: the queue to move. |
14888 |
++ * @entity: @bfqq's entity. |
14889 |
++ * @bfqg: the group to move to. |
14890 |
++ * |
14891 |
++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating |
14892 |
++ * it on the new one. Avoid putting the entity on the old group idle tree. |
14893 |
++ * |
14894 |
++ * Must be called under the queue lock; the cgroup owning @bfqg must |
14895 |
++ * not disappear (by now this just means that we are called under |
14896 |
++ * rcu_read_lock()). |
14897 |
++ */ |
14898 |
++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
14899 |
++ struct bfq_entity *entity, struct bfq_group *bfqg) |
14900 |
++{ |
14901 |
++ int busy, resume; |
14902 |
++ |
14903 |
++ busy = bfq_bfqq_busy(bfqq); |
14904 |
++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); |
14905 |
++ |
14906 |
++ BUG_ON(resume && !entity->on_st); |
14907 |
++ BUG_ON(busy && !resume && entity->on_st && |
14908 |
++ bfqq != bfqd->in_service_queue); |
14909 |
++ |
14910 |
++ if (busy) { |
14911 |
++ BUG_ON(atomic_read(&bfqq->ref) < 2); |
14912 |
++ |
14913 |
++ if (!resume) |
14914 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 0); |
14915 |
++ else |
14916 |
++ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
14917 |
++ } else if (entity->on_st) |
14918 |
++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); |
14919 |
++ |
14920 |
++ /* |
14921 |
++ * Here we use a reference to bfqg. We don't need a refcounter |
14922 |
++ * as the cgroup reference will not be dropped, so that its |
14923 |
++ * destroy() callback will not be invoked. |
14924 |
++ */ |
14925 |
++ entity->parent = bfqg->my_entity; |
14926 |
++ entity->sched_data = &bfqg->sched_data; |
14927 |
++ |
14928 |
++ if (busy && resume) |
14929 |
++ bfq_activate_bfqq(bfqd, bfqq); |
14930 |
++ |
14931 |
++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver) |
14932 |
++ bfq_schedule_dispatch(bfqd); |
14933 |
++} |
14934 |
++ |
14935 |
++/** |
14936 |
++ * __bfq_bic_change_cgroup - move @bic to @cgroup. |
14937 |
++ * @bfqd: the queue descriptor. |
14938 |
++ * @bic: the bic to move. |
14939 |
++ * @cgroup: the cgroup to move to. |
14940 |
++ * |
14941 |
++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller |
14942 |
++ * has to make sure that the reference to cgroup is valid across the call. |
14943 |
++ * |
14944 |
++ * NOTE: an alternative approach might have been to store the current |
14945 |
++ * cgroup in bfqq and getting a reference to it, reducing the lookup |
14946 |
++ * time here, at the price of slightly more complex code. |
14947 |
++ */ |
14948 |
++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, |
14949 |
++ struct bfq_io_cq *bic, |
14950 |
++ struct cgroup_subsys_state *css) |
14951 |
++{ |
14952 |
++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); |
14953 |
++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); |
14954 |
++ struct bfq_entity *entity; |
14955 |
++ struct bfq_group *bfqg; |
14956 |
++ struct bfqio_cgroup *bgrp; |
14957 |
++ |
14958 |
++ bgrp = css_to_bfqio(css); |
14959 |
++ |
14960 |
++ bfqg = bfq_find_alloc_group(bfqd, css); |
14961 |
++ if (async_bfqq != NULL) { |
14962 |
++ entity = &async_bfqq->entity; |
14963 |
++ |
14964 |
++ if (entity->sched_data != &bfqg->sched_data) { |
14965 |
++ bic_set_bfqq(bic, NULL, 0); |
14966 |
++ bfq_log_bfqq(bfqd, async_bfqq, |
14967 |
++ "bic_change_group: %p %d", |
14968 |
++ async_bfqq, atomic_read(&async_bfqq->ref)); |
14969 |
++ bfq_put_queue(async_bfqq); |
14970 |
++ } |
14971 |
++ } |
14972 |
++ |
14973 |
++ if (sync_bfqq != NULL) { |
14974 |
++ entity = &sync_bfqq->entity; |
14975 |
++ if (entity->sched_data != &bfqg->sched_data) |
14976 |
++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); |
14977 |
++ } |
14978 |
++ |
14979 |
++ return bfqg; |
14980 |
++} |
14981 |
++ |
14982 |
++/** |
14983 |
++ * bfq_bic_change_cgroup - move @bic to @cgroup. |
14984 |
++ * @bic: the bic being migrated. |
14985 |
++ * @cgroup: the destination cgroup. |
14986 |
++ * |
14987 |
++ * When the task owning @bic is moved to @cgroup, @bic is immediately |
14988 |
++ * moved into its new parent group. |
14989 |
++ */ |
14990 |
++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, |
14991 |
++ struct cgroup_subsys_state *css) |
14992 |
++{ |
14993 |
++ struct bfq_data *bfqd; |
14994 |
++ unsigned long uninitialized_var(flags); |
14995 |
++ |
14996 |
++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), |
14997 |
++ &flags); |
14998 |
++ if (bfqd != NULL) { |
14999 |
++ __bfq_bic_change_cgroup(bfqd, bic, css); |
15000 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
15001 |
++ } |
15002 |
++} |
15003 |
++ |
15004 |
++/** |
15005 |
++ * bfq_bic_update_cgroup - update the cgroup of @bic. |
15006 |
++ * @bic: the @bic to update. |
15007 |
++ * |
15008 |
++ * Make sure that @bic is enqueued in the cgroup of the current task. |
15009 |
++ * We need this in addition to moving bics during the cgroup attach |
15010 |
++ * phase because the task owning @bic could be at its first disk |
15011 |
++ * access or we may end up in the root cgroup as the result of a |
15012 |
++ * memory allocation failure and here we try to move to the right |
15013 |
++ * group. |
15014 |
++ * |
15015 |
++ * Must be called under the queue lock. It is safe to use the returned |
15016 |
++ * value even after the rcu_read_unlock() as the migration/destruction |
15017 |
++ * paths act under the queue lock too. IOW it is impossible to race with |
15018 |
++ * group migration/destruction and end up with an invalid group as: |
15019 |
++ * a) here cgroup has not yet been destroyed, nor its destroy callback |
15020 |
++ * has started execution, as current holds a reference to it, |
15021 |
++ * b) if it is destroyed after rcu_read_unlock() [after current is |
15022 |
++ * migrated to a different cgroup] its attach() callback will have |
15023 |
++ * taken care of removing all the references to the old cgroup data. |
15024 |
++ */ |
15025 |
++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) |
15026 |
++{ |
15027 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
15028 |
++ struct bfq_group *bfqg; |
15029 |
++ struct cgroup_subsys_state *css; |
15030 |
++ |
15031 |
++ BUG_ON(bfqd == NULL); |
15032 |
++ |
15033 |
++ rcu_read_lock(); |
15034 |
++ css = task_css(current, bfqio_subsys_id); |
15035 |
++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, css); |
15036 |
++ rcu_read_unlock(); |
15037 |
++ |
15038 |
++ return bfqg; |
15039 |
++} |
15040 |
++ |
15041 |
++/** |
15042 |
++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. |
15043 |
++ * @st: the service tree being flushed. |
15044 |
++ */ |
15045 |
++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) |
15046 |
++{ |
15047 |
++ struct bfq_entity *entity = st->first_idle; |
15048 |
++ |
15049 |
++ for (; entity != NULL; entity = st->first_idle) |
15050 |
++ __bfq_deactivate_entity(entity, 0); |
15051 |
++} |
15052 |
++ |
15053 |
++/** |
15054 |
++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. |
15055 |
++ * @bfqd: the device data structure with the root group. |
15056 |
++ * @entity: the entity to move. |
15057 |
++ */ |
15058 |
++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, |
15059 |
++ struct bfq_entity *entity) |
15060 |
++{ |
15061 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
15062 |
++ |
15063 |
++ BUG_ON(bfqq == NULL); |
15064 |
++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); |
15065 |
++ return; |
15066 |
++} |
15067 |
++ |
15068 |
++/** |
15069 |
++ * bfq_reparent_active_entities - move to the root group all active entities. |
15070 |
++ * @bfqd: the device data structure with the root group. |
15071 |
++ * @bfqg: the group to move from. |
15072 |
++ * @st: the service tree with the entities. |
15073 |
++ * |
15074 |
++ * Needs queue_lock to be taken and reference to be valid over the call. |
15075 |
++ */ |
15076 |
++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, |
15077 |
++ struct bfq_group *bfqg, |
15078 |
++ struct bfq_service_tree *st) |
15079 |
++{ |
15080 |
++ struct rb_root *active = &st->active; |
15081 |
++ struct bfq_entity *entity = NULL; |
15082 |
++ |
15083 |
++ if (!RB_EMPTY_ROOT(&st->active)) |
15084 |
++ entity = bfq_entity_of(rb_first(active)); |
15085 |
++ |
15086 |
++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) |
15087 |
++ bfq_reparent_leaf_entity(bfqd, entity); |
15088 |
++ |
15089 |
++ if (bfqg->sched_data.in_service_entity != NULL) |
15090 |
++ bfq_reparent_leaf_entity(bfqd, |
15091 |
++ bfqg->sched_data.in_service_entity); |
15092 |
++ |
15093 |
++ return; |
15094 |
++} |
15095 |
++ |
15096 |
++/** |
15097 |
++ * bfq_destroy_group - destroy @bfqg. |
15098 |
++ * @bgrp: the bfqio_cgroup containing @bfqg. |
15099 |
++ * @bfqg: the group being destroyed. |
15100 |
++ * |
15101 |
++ * Destroy @bfqg, making sure that it is not referenced from its parent. |
15102 |
++ */ |
15103 |
++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) |
15104 |
++{ |
15105 |
++ struct bfq_data *bfqd; |
15106 |
++ struct bfq_service_tree *st; |
15107 |
++ struct bfq_entity *entity = bfqg->my_entity; |
15108 |
++ unsigned long uninitialized_var(flags); |
15109 |
++ int i; |
15110 |
++ |
15111 |
++ hlist_del(&bfqg->group_node); |
15112 |
++ |
15113 |
++ /* |
15114 |
++ * Empty all service_trees belonging to this group before deactivating |
15115 |
++ * the group itself. |
15116 |
++ */ |
15117 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { |
15118 |
++ st = bfqg->sched_data.service_tree + i; |
15119 |
++ |
15120 |
++ /* |
15121 |
++ * The idle tree may still contain bfq_queues belonging |
15122 |
++ * to exited tasks because they never migrated to a different |
15123 |
++ * cgroup from the one being destroyed now. No one else |
15124 |
++ * can access them so it's safe to act without any lock. |
15125 |
++ */ |
15126 |
++ bfq_flush_idle_tree(st); |
15127 |
++ |
15128 |
++ /* |
15129 |
++ * It may happen that some queues are still active |
15130 |
++ * (busy) upon group destruction (if the corresponding |
15131 |
++ * processes have been forced to terminate). We move |
15132 |
++ * all the leaf entities corresponding to these queues |
15133 |
++ * to the root_group. |
15134 |
++ * Also, it may happen that the group has an entity |
15135 |
++ * under service, which is disconnected from the active |
15136 |
++ * tree: it must be moved, too. |
15137 |
++ * There is no need to put the sync queues, as the |
15138 |
++ * scheduler has taken no reference. |
15139 |
++ */ |
15140 |
++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); |
15141 |
++ if (bfqd != NULL) { |
15142 |
++ bfq_reparent_active_entities(bfqd, bfqg, st); |
15143 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
15144 |
++ } |
15145 |
++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); |
15146 |
++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); |
15147 |
++ } |
15148 |
++ BUG_ON(bfqg->sched_data.next_in_service != NULL); |
15149 |
++ BUG_ON(bfqg->sched_data.in_service_entity != NULL); |
15150 |
++ |
15151 |
++ /* |
15152 |
++ * We may race with device destruction, take extra care when |
15153 |
++ * dereferencing bfqg->bfqd. |
15154 |
++ */ |
15155 |
++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); |
15156 |
++ if (bfqd != NULL) { |
15157 |
++ hlist_del(&bfqg->bfqd_node); |
15158 |
++ __bfq_deactivate_entity(entity, 0); |
15159 |
++ bfq_put_async_queues(bfqd, bfqg); |
15160 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
15161 |
++ } |
15162 |
++ BUG_ON(entity->tree != NULL); |
15163 |
++ |
15164 |
++ /* |
15165 |
++ * No need to defer the kfree() to the end of the RCU grace |
15166 |
++ * period: we are called from the destroy() callback of our |
15167 |
++ * cgroup, so we can be sure that no one is a) still using |
15168 |
++ * this cgroup or b) doing lookups in it. |
15169 |
++ */ |
15170 |
++ kfree(bfqg); |
15171 |
++} |
15172 |
++ |
15173 |
++static void bfq_end_raising_async(struct bfq_data *bfqd) |
15174 |
++{ |
15175 |
++ struct hlist_node *tmp; |
15176 |
++ struct bfq_group *bfqg; |
15177 |
++ |
15178 |
++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) |
15179 |
++ bfq_end_raising_async_queues(bfqd, bfqg); |
15180 |
++ bfq_end_raising_async_queues(bfqd, bfqd->root_group); |
15181 |
++} |
15182 |
++ |
15183 |
++/** |
15184 |
++ * bfq_disconnect_groups - disconnect @bfqd from all its groups. |
15185 |
++ * @bfqd: the device descriptor being exited. |
15186 |
++ * |
15187 |
++ * When the device exits we just make sure that no lookup can return |
15188 |
++ * the now unused group structures. They will be deallocated on cgroup |
15189 |
++ * destruction. |
15190 |
++ */ |
15191 |
++static void bfq_disconnect_groups(struct bfq_data *bfqd) |
15192 |
++{ |
15193 |
++ struct hlist_node *tmp; |
15194 |
++ struct bfq_group *bfqg; |
15195 |
++ |
15196 |
++ bfq_log(bfqd, "disconnect_groups beginning"); |
15197 |
++ hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) { |
15198 |
++ hlist_del(&bfqg->bfqd_node); |
15199 |
++ |
15200 |
++ __bfq_deactivate_entity(bfqg->my_entity, 0); |
15201 |
++ |
15202 |
++ /* |
15203 |
++ * Don't remove from the group hash, just set an |
15204 |
++ * invalid key. No lookups can race with the |
15205 |
++ * assignment as bfqd is being destroyed; this |
15206 |
++ * implies also that new elements cannot be added |
15207 |
++ * to the list. |
15208 |
++ */ |
15209 |
++ rcu_assign_pointer(bfqg->bfqd, NULL); |
15210 |
++ |
15211 |
++ bfq_log(bfqd, "disconnect_groups: put async for group %p", |
15212 |
++ bfqg); |
15213 |
++ bfq_put_async_queues(bfqd, bfqg); |
15214 |
++ } |
15215 |
++} |
15216 |
++ |
15217 |
++static inline void bfq_free_root_group(struct bfq_data *bfqd) |
15218 |
++{ |
15219 |
++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; |
15220 |
++ struct bfq_group *bfqg = bfqd->root_group; |
15221 |
++ |
15222 |
++ bfq_put_async_queues(bfqd, bfqg); |
15223 |
++ |
15224 |
++ spin_lock_irq(&bgrp->lock); |
15225 |
++ hlist_del_rcu(&bfqg->group_node); |
15226 |
++ spin_unlock_irq(&bgrp->lock); |
15227 |
++ |
15228 |
++ /* |
15229 |
++ * No need to synchronize_rcu() here: since the device is gone |
15230 |
++ * there cannot be any read-side access to its root_group. |
15231 |
++ */ |
15232 |
++ kfree(bfqg); |
15233 |
++} |
15234 |
++ |
15235 |
++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) |
15236 |
++{ |
15237 |
++ struct bfq_group *bfqg; |
15238 |
++ struct bfqio_cgroup *bgrp; |
15239 |
++ int i; |
15240 |
++ |
15241 |
++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); |
15242 |
++ if (bfqg == NULL) |
15243 |
++ return NULL; |
15244 |
++ |
15245 |
++ bfqg->entity.parent = NULL; |
15246 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
15247 |
++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
15248 |
++ |
15249 |
++ bgrp = &bfqio_root_cgroup; |
15250 |
++ spin_lock_irq(&bgrp->lock); |
15251 |
++ rcu_assign_pointer(bfqg->bfqd, bfqd); |
15252 |
++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); |
15253 |
++ spin_unlock_irq(&bgrp->lock); |
15254 |
++ |
15255 |
++ return bfqg; |
15256 |
++} |
15257 |
++ |
15258 |
++#define SHOW_FUNCTION(__VAR) \ |
15259 |
++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \ |
15260 |
++ struct cftype *cftype) \ |
15261 |
++{ \ |
15262 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ |
15263 |
++ u64 ret = -ENODEV; \ |
15264 |
++ \ |
15265 |
++ mutex_lock(&bfqio_mutex); \ |
15266 |
++ if (bfqio_is_removed(bgrp)) \ |
15267 |
++ goto out_unlock; \ |
15268 |
++ \ |
15269 |
++ spin_lock_irq(&bgrp->lock); \ |
15270 |
++ ret = bgrp->__VAR; \ |
15271 |
++ spin_unlock_irq(&bgrp->lock); \ |
15272 |
++ \ |
15273 |
++out_unlock: \ |
15274 |
++ mutex_unlock(&bfqio_mutex); \ |
15275 |
++ return ret; \ |
15276 |
++} |
15277 |
++ |
15278 |
++SHOW_FUNCTION(weight); |
15279 |
++SHOW_FUNCTION(ioprio); |
15280 |
++SHOW_FUNCTION(ioprio_class); |
15281 |
++#undef SHOW_FUNCTION |
15282 |
++ |
15283 |
++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ |
15284 |
++static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\ |
15285 |
++ struct cftype *cftype, \ |
15286 |
++ u64 val) \ |
15287 |
++{ \ |
15288 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); \ |
15289 |
++ struct bfq_group *bfqg; \ |
15290 |
++ int ret = -EINVAL; \ |
15291 |
++ \ |
15292 |
++ if (val < (__MIN) || val > (__MAX)) \ |
15293 |
++ return ret; \ |
15294 |
++ \ |
15295 |
++ ret = -ENODEV; \ |
15296 |
++ mutex_lock(&bfqio_mutex); \ |
15297 |
++ if (bfqio_is_removed(bgrp)) \ |
15298 |
++ goto out_unlock; \ |
15299 |
++ ret = 0; \ |
15300 |
++ \ |
15301 |
++ spin_lock_irq(&bgrp->lock); \ |
15302 |
++ bgrp->__VAR = (unsigned short)val; \ |
15303 |
++ hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ |
15304 |
++ /* \ |
15305 |
++ * Setting the ioprio_changed flag of the entity \ |
15306 |
++ * to 1 with new_##__VAR == ##__VAR would re-set \ |
15307 |
++ * the value of the weight to its ioprio mapping. \ |
15308 |
++ * Set the flag only if necessary. \ |
15309 |
++ */ \ |
15310 |
++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ |
15311 |
++ bfqg->entity.new_##__VAR = (unsigned short)val; \ |
15312 |
++ smp_wmb(); \ |
15313 |
++ bfqg->entity.ioprio_changed = 1; \ |
15314 |
++ } \ |
15315 |
++ } \ |
15316 |
++ spin_unlock_irq(&bgrp->lock); \ |
15317 |
++ \ |
15318 |
++out_unlock: \ |
15319 |
++ mutex_unlock(&bfqio_mutex); \ |
15320 |
++ return ret; \ |
15321 |
++} |
15322 |
++ |
15323 |
++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); |
15324 |
++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); |
15325 |
++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); |
15326 |
++#undef STORE_FUNCTION |
15327 |
++ |
15328 |
++static struct cftype bfqio_files[] = { |
15329 |
++ { |
15330 |
++ .name = "weight", |
15331 |
++ .read_u64 = bfqio_cgroup_weight_read, |
15332 |
++ .write_u64 = bfqio_cgroup_weight_write, |
15333 |
++ }, |
15334 |
++ { |
15335 |
++ .name = "ioprio", |
15336 |
++ .read_u64 = bfqio_cgroup_ioprio_read, |
15337 |
++ .write_u64 = bfqio_cgroup_ioprio_write, |
15338 |
++ }, |
15339 |
++ { |
15340 |
++ .name = "ioprio_class", |
15341 |
++ .read_u64 = bfqio_cgroup_ioprio_class_read, |
15342 |
++ .write_u64 = bfqio_cgroup_ioprio_class_write, |
15343 |
++ }, |
15344 |
++ { }, /* terminate */ |
15345 |
++}; |
15346 |
++ |
15347 |
++static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state |
15348 |
++ *parent_css) |
15349 |
++{ |
15350 |
++ struct bfqio_cgroup *bgrp; |
15351 |
++ |
15352 |
++ if (parent_css != NULL) { |
15353 |
++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); |
15354 |
++ if (bgrp == NULL) |
15355 |
++ return ERR_PTR(-ENOMEM); |
15356 |
++ } else |
15357 |
++ bgrp = &bfqio_root_cgroup; |
15358 |
++ |
15359 |
++ spin_lock_init(&bgrp->lock); |
15360 |
++ INIT_HLIST_HEAD(&bgrp->group_data); |
15361 |
++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; |
15362 |
++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; |
15363 |
++ |
15364 |
++ return &bgrp->css; |
15365 |
++} |
15366 |
++ |
15367 |
++/* |
15368 |
++ * We cannot support shared io contexts, as we have no means to support |
15369 |
++ * two tasks with the same ioc in two different groups without major rework |
15370 |
++ * of the main bic/bfqq data structures. By now we allow a task to change |
15371 |
++ * its cgroup only if it's the only owner of its ioc; the drawback of this |
15372 |
++ * behavior is that a group containing a task that forked using CLONE_IO |
15373 |
++ * will not be destroyed until the tasks sharing the ioc die. |
15374 |
++ */ |
15375 |
++static int bfqio_can_attach(struct cgroup_subsys_state *css, |
15376 |
++ struct cgroup_taskset *tset) |
15377 |
++{ |
15378 |
++ struct task_struct *task; |
15379 |
++ struct io_context *ioc; |
15380 |
++ int ret = 0; |
15381 |
++ |
15382 |
++ cgroup_taskset_for_each(task, css, tset) { |
15383 |
++ /* |
15384 |
++ * task_lock() is needed to avoid races with |
15385 |
++ * exit_io_context() |
15386 |
++ */ |
15387 |
++ task_lock(task); |
15388 |
++ ioc = task->io_context; |
15389 |
++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) |
15390 |
++ /* |
15391 |
++ * ioc == NULL means that the task is either too young |
15392 |
++ * or exiting: if it has still no ioc the ioc can't be |
15393 |
++ * shared, if the task is exiting the attach will fail |
15394 |
++ * anyway, no matter what we return here. |
15395 |
++ */ |
15396 |
++ ret = -EINVAL; |
15397 |
++ task_unlock(task); |
15398 |
++ if (ret) |
15399 |
++ break; |
15400 |
++ } |
15401 |
++ |
15402 |
++ return ret; |
15403 |
++} |
15404 |
++ |
15405 |
++static void bfqio_attach(struct cgroup_subsys_state *css, |
15406 |
++ struct cgroup_taskset *tset) |
15407 |
++{ |
15408 |
++ struct task_struct *task; |
15409 |
++ struct io_context *ioc; |
15410 |
++ struct io_cq *icq; |
15411 |
++ |
15412 |
++ /* |
15413 |
++ * IMPORTANT NOTE: The move of more than one process at a time to a |
15414 |
++ * new group has not yet been tested. |
15415 |
++ */ |
15416 |
++ cgroup_taskset_for_each(task, css, tset) { |
15417 |
++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); |
15418 |
++ if (ioc) { |
15419 |
++ /* |
15420 |
++ * Handle cgroup change here. |
15421 |
++ */ |
15422 |
++ rcu_read_lock(); |
15423 |
++ hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) |
15424 |
++ if (!strncmp( |
15425 |
++ icq->q->elevator->type->elevator_name, |
15426 |
++ "bfq", ELV_NAME_MAX)) |
15427 |
++ bfq_bic_change_cgroup(icq_to_bic(icq), |
15428 |
++ css); |
15429 |
++ rcu_read_unlock(); |
15430 |
++ put_io_context(ioc); |
15431 |
++ } |
15432 |
++ } |
15433 |
++} |
15434 |
++ |
15435 |
++static void bfqio_destroy(struct cgroup_subsys_state *css) |
15436 |
++{ |
15437 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
15438 |
++ struct hlist_node *tmp; |
15439 |
++ struct bfq_group *bfqg; |
15440 |
++ |
15441 |
++ /* |
15442 |
++ * Since we are destroying the cgroup, there are no more tasks |
15443 |
++ * referencing it, and all the RCU grace periods that may have |
15444 |
++ * referenced it are ended (as the destruction of the parent |
15445 |
++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by |
15446 |
++ * anything else and we don't need any synchronization. |
15447 |
++ */ |
15448 |
++ hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node) |
15449 |
++ bfq_destroy_group(bgrp, bfqg); |
15450 |
++ |
15451 |
++ BUG_ON(!hlist_empty(&bgrp->group_data)); |
15452 |
++ |
15453 |
++ kfree(bgrp); |
15454 |
++} |
15455 |
++ |
15456 |
++static int bfqio_css_online(struct cgroup_subsys_state *css) |
15457 |
++{ |
15458 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
15459 |
++ |
15460 |
++ mutex_lock(&bfqio_mutex); |
15461 |
++ bgrp->online = true; |
15462 |
++ mutex_unlock(&bfqio_mutex); |
15463 |
++ |
15464 |
++ return 0; |
15465 |
++} |
15466 |
++ |
15467 |
++static void bfqio_css_offline(struct cgroup_subsys_state *css) |
15468 |
++{ |
15469 |
++ struct bfqio_cgroup *bgrp = css_to_bfqio(css); |
15470 |
++ |
15471 |
++ mutex_lock(&bfqio_mutex); |
15472 |
++ bgrp->online = false; |
15473 |
++ mutex_unlock(&bfqio_mutex); |
15474 |
++} |
15475 |
++ |
15476 |
++struct cgroup_subsys bfqio_subsys = { |
15477 |
++ .name = "bfqio", |
15478 |
++ .css_alloc = bfqio_create, |
15479 |
++ .css_online = bfqio_css_online, |
15480 |
++ .css_offline = bfqio_css_offline, |
15481 |
++ .can_attach = bfqio_can_attach, |
15482 |
++ .attach = bfqio_attach, |
15483 |
++ .css_free = bfqio_destroy, |
15484 |
++ .subsys_id = bfqio_subsys_id, |
15485 |
++ .base_cftypes = bfqio_files, |
15486 |
++}; |
15487 |
++#else |
15488 |
++static inline void bfq_init_entity(struct bfq_entity *entity, |
15489 |
++ struct bfq_group *bfqg) |
15490 |
++{ |
15491 |
++ entity->weight = entity->new_weight; |
15492 |
++ entity->orig_weight = entity->new_weight; |
15493 |
++ entity->ioprio = entity->new_ioprio; |
15494 |
++ entity->ioprio_class = entity->new_ioprio_class; |
15495 |
++ entity->sched_data = &bfqg->sched_data; |
15496 |
++} |
15497 |
++ |
15498 |
++static inline struct bfq_group * |
15499 |
++bfq_bic_update_cgroup(struct bfq_io_cq *bic) |
15500 |
++{ |
15501 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
15502 |
++ return bfqd->root_group; |
15503 |
++} |
15504 |
++ |
15505 |
++static inline void bfq_bfqq_move(struct bfq_data *bfqd, |
15506 |
++ struct bfq_queue *bfqq, |
15507 |
++ struct bfq_entity *entity, |
15508 |
++ struct bfq_group *bfqg) |
15509 |
++{ |
15510 |
++} |
15511 |
++ |
15512 |
++static void bfq_end_raising_async(struct bfq_data *bfqd) |
15513 |
++{ |
15514 |
++ bfq_end_raising_async_queues(bfqd, bfqd->root_group); |
15515 |
++} |
15516 |
++ |
15517 |
++static inline void bfq_disconnect_groups(struct bfq_data *bfqd) |
15518 |
++{ |
15519 |
++ bfq_put_async_queues(bfqd, bfqd->root_group); |
15520 |
++} |
15521 |
++ |
15522 |
++static inline void bfq_free_root_group(struct bfq_data *bfqd) |
15523 |
++{ |
15524 |
++ kfree(bfqd->root_group); |
15525 |
++} |
15526 |
++ |
15527 |
++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) |
15528 |
++{ |
15529 |
++ struct bfq_group *bfqg; |
15530 |
++ int i; |
15531 |
++ |
15532 |
++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); |
15533 |
++ if (bfqg == NULL) |
15534 |
++ return NULL; |
15535 |
++ |
15536 |
++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
15537 |
++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
15538 |
++ |
15539 |
++ return bfqg; |
15540 |
++} |
15541 |
++#endif |
15542 |
+diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c |
15543 |
+new file mode 100644 |
15544 |
+index 0000000..7f6b000 |
15545 |
+--- /dev/null |
15546 |
++++ b/block/bfq-ioc.c |
15547 |
+@@ -0,0 +1,36 @@ |
15548 |
++/* |
15549 |
++ * BFQ: I/O context handling. |
15550 |
++ * |
15551 |
++ * Based on ideas and code from CFQ: |
15552 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
15553 |
++ * |
15554 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
15555 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
15556 |
++ * |
15557 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
15558 |
++ */ |
15559 |
++ |
15560 |
++/** |
15561 |
++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. |
15562 |
++ * @icq: the iocontext queue. |
15563 |
++ */ |
15564 |
++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) |
15565 |
++{ |
15566 |
++ /* bic->icq is the first member, %NULL will convert to %NULL */ |
15567 |
++ return container_of(icq, struct bfq_io_cq, icq); |
15568 |
++} |
15569 |
++ |
15570 |
++/** |
15571 |
++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. |
15572 |
++ * @bfqd: the lookup key. |
15573 |
++ * @ioc: the io_context of the process doing I/O. |
15574 |
++ * |
15575 |
++ * Queue lock must be held. |
15576 |
++ */ |
15577 |
++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, |
15578 |
++ struct io_context *ioc) |
15579 |
++{ |
15580 |
++ if (ioc) |
15581 |
++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); |
15582 |
++ return NULL; |
15583 |
++} |
15584 |
+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
15585 |
+new file mode 100644 |
15586 |
+index 0000000..eb760de |
15587 |
+--- /dev/null |
15588 |
++++ b/block/bfq-iosched.c |
15589 |
+@@ -0,0 +1,3298 @@ |
15590 |
++/* |
15591 |
++ * BFQ, or Budget Fair Queueing, disk scheduler. |
15592 |
++ * |
15593 |
++ * Based on ideas and code from CFQ: |
15594 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
15595 |
++ * |
15596 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
15597 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
15598 |
++ * |
15599 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
15600 |
++ * |
15601 |
++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. |
15602 |
++ * |
15603 |
++ * BFQ is a proportional share disk scheduling algorithm based on the |
15604 |
++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, measured in |
15605 |
++ * number of sectors, to tasks instead of time slices. The disk is not granted |
15606 |
++ * to the in-service task for a given time slice, but until it has exhausted |
15607 |
++ * its assigned budget. This change from the time to the service domain allows |
15608 |
++ * BFQ to distribute the disk bandwidth among tasks as desired, without any |
15609 |
++ * distortion due to ZBR, workload fluctuations or other factors. BFQ uses an |
15610 |
++ * ad hoc internal scheduler, called B-WF2Q+, to schedule tasks according to |
15611 |
++ * their budgets (more precisely BFQ schedules queues associated to tasks). |
15612 |
++ * Thanks to this accurate scheduler, BFQ can afford to assign high budgets to |
15613 |
++ * disk-bound non-seeky tasks (to boost the throughput), and yet guarantee low |
15614 |
++ * latencies to interactive and soft real-time applications. |
15615 |
++ * |
15616 |
++ * BFQ is described in [1], where also a reference to the initial, more |
15617 |
++ * theoretical paper on BFQ can be found. The interested reader can find in |
15618 |
++ * the latter paper full details on the main algorithm as well as formulas of |
15619 |
++ * the guarantees, plus formal proofs of all the properties. With respect to |
15620 |
++ * the version of BFQ presented in these papers, this implementation adds a |
15621 |
++ * few more heuristics, such as the one that guarantees a low latency to soft |
15622 |
++ * real-time applications, and a hierarchical extension based on H-WF2Q+. |
15623 |
++ * |
15624 |
++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with |
15625 |
++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) |
15626 |
++ * complexity derives from the one introduced with EEVDF in [3]. |
15627 |
++ * |
15628 |
++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness |
15629 |
++ * with the BFQ Disk I/O Scheduler'', |
15630 |
++ * Proceedings of the 5th Annual International Systems and Storage |
15631 |
++ * Conference (SYSTOR '12), June 2012. |
15632 |
++ * |
15633 |
++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf |
15634 |
++ * |
15635 |
++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing |
15636 |
++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, |
15637 |
++ * Oct 1997. |
15638 |
++ * |
15639 |
++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz |
15640 |
++ * |
15641 |
++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline |
15642 |
++ * First: A Flexible and Accurate Mechanism for Proportional Share |
15643 |
++ * Resource Allocation,'' technical report. |
15644 |
++ * |
15645 |
++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf |
15646 |
++ */ |
15647 |
++#include <linux/module.h> |
15648 |
++#include <linux/slab.h> |
15649 |
++#include <linux/blkdev.h> |
15650 |
++#include <linux/cgroup.h> |
15651 |
++#include <linux/elevator.h> |
15652 |
++#include <linux/jiffies.h> |
15653 |
++#include <linux/rbtree.h> |
15654 |
++#include <linux/ioprio.h> |
15655 |
++#include "bfq.h" |
15656 |
++#include "blk.h" |
15657 |
++ |
15658 |
++/* Max number of dispatches in one round of service. */ |
15659 |
++static const int bfq_quantum = 4; |
15660 |
++ |
15661 |
++/* Expiration time of sync (0) and async (1) requests, in jiffies. */ |
15662 |
++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; |
15663 |
++ |
15664 |
++/* Maximum backwards seek, in KiB. */ |
15665 |
++static const int bfq_back_max = 16 * 1024; |
15666 |
++ |
15667 |
++/* Penalty of a backwards seek, in number of sectors. */ |
15668 |
++static const int bfq_back_penalty = 2; |
15669 |
++ |
15670 |
++/* Idling period duration, in jiffies. */ |
15671 |
++static int bfq_slice_idle = HZ / 125; |
15672 |
++ |
15673 |
++/* Default maximum budget values, in sectors and number of requests. */ |
15674 |
++static const int bfq_default_max_budget = 16 * 1024; |
15675 |
++static const int bfq_max_budget_async_rq = 4; |
15676 |
++ |
15677 |
++/* |
15678 |
++ * Async to sync throughput distribution is controlled as follows: |
15679 |
++ * when an async request is served, the entity is charged the number |
15680 |
++ * of sectors of the request, multiplied by the factor below |
15681 |
++ */ |
15682 |
++static const int bfq_async_charge_factor = 10; |
15683 |
++ |
15684 |
++/* Default timeout values, in jiffies, approximating CFQ defaults. */ |
15685 |
++static const int bfq_timeout_sync = HZ / 8; |
15686 |
++static int bfq_timeout_async = HZ / 25; |
15687 |
++ |
15688 |
++struct kmem_cache *bfq_pool; |
15689 |
++ |
15690 |
++/* Below this threshold (in ms), we consider thinktime immediate. */ |
15691 |
++#define BFQ_MIN_TT 2 |
15692 |
++ |
15693 |
++/* hw_tag detection: parallel requests threshold and min samples needed. */ |
15694 |
++#define BFQ_HW_QUEUE_THRESHOLD 4 |
15695 |
++#define BFQ_HW_QUEUE_SAMPLES 32 |
15696 |
++ |
15697 |
++#define BFQQ_SEEK_THR (sector_t)(8 * 1024) |
15698 |
++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) |
15699 |
++ |
15700 |
++/* Min samples used for peak rate estimation (for autotuning). */ |
15701 |
++#define BFQ_PEAK_RATE_SAMPLES 32 |
15702 |
++ |
15703 |
++/* Shift used for peak rate fixed precision calculations. */ |
15704 |
++#define BFQ_RATE_SHIFT 16 |
15705 |
++ |
15706 |
++/* |
15707 |
++ * The duration of the weight raising for interactive applications is |
15708 |
++ * computed automatically (as default behaviour), using the following |
15709 |
++ * formula: duration = (R / r) * T, where r is the peak rate of the |
15710 |
++ * disk, and R and T are two reference parameters. In particular, R is |
15711 |
++ * the peak rate of a reference disk, and T is about the maximum time |
15712 |
++ * for starting popular large applications on that disk, under BFQ and |
15713 |
++ * while reading two files in parallel. Finally, BFQ uses two |
15714 |
++ * different pairs (R, T) depending on whether the disk is rotational |
15715 |
++ * or non-rotational. |
15716 |
++ */ |
15717 |
++#define T_rot (msecs_to_jiffies(5500)) |
15718 |
++#define T_nonrot (msecs_to_jiffies(2000)) |
15719 |
++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ |
15720 |
++#define R_rot 17415 |
15721 |
++#define R_nonrot 34791 |
15722 |
++ |
15723 |
++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ |
15724 |
++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) |
15725 |
++ |
15726 |
++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) |
15727 |
++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) |
15728 |
++ |
15729 |
++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); |
15730 |
++ |
15731 |
++#include "bfq-ioc.c" |
15732 |
++#include "bfq-sched.c" |
15733 |
++#include "bfq-cgroup.c" |
15734 |
++ |
15735 |
++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ |
15736 |
++ IOPRIO_CLASS_IDLE) |
15737 |
++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ |
15738 |
++ IOPRIO_CLASS_RT) |
15739 |
++ |
15740 |
++#define bfq_sample_valid(samples) ((samples) > 80) |
15741 |
++ |
15742 |
++/* |
15743 |
++ * We regard a request as SYNC, if either it's a read or has the SYNC bit |
15744 |
++ * set (in which case it could also be a direct WRITE). |
15745 |
++ */ |
15746 |
++static inline int bfq_bio_sync(struct bio *bio) |
15747 |
++{ |
15748 |
++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) |
15749 |
++ return 1; |
15750 |
++ |
15751 |
++ return 0; |
15752 |
++} |
15753 |
++ |
15754 |
++/* |
15755 |
++ * Scheduler run of queue, if there are requests pending and no one in the |
15756 |
++ * driver that will restart queueing. |
15757 |
++ */ |
15758 |
++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) |
15759 |
++{ |
15760 |
++ if (bfqd->queued != 0) { |
15761 |
++ bfq_log(bfqd, "schedule dispatch"); |
15762 |
++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); |
15763 |
++ } |
15764 |
++} |
15765 |
++ |
15766 |
++/* |
15767 |
++ * Lifted from AS - choose which of rq1 and rq2 is best served now. |
15768 |
++ * We choose the request that is closest to the head right now. Distance |
15769 |
++ * behind the head is penalized and only allowed to a certain extent. |
15770 |
++ */ |
15771 |
++static struct request *bfq_choose_req(struct bfq_data *bfqd, |
15772 |
++ struct request *rq1, |
15773 |
++ struct request *rq2, |
15774 |
++ sector_t last) |
15775 |
++{ |
15776 |
++ sector_t s1, s2, d1 = 0, d2 = 0; |
15777 |
++ unsigned long back_max; |
15778 |
++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ |
15779 |
++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ |
15780 |
++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ |
15781 |
++ |
15782 |
++ if (rq1 == NULL || rq1 == rq2) |
15783 |
++ return rq2; |
15784 |
++ if (rq2 == NULL) |
15785 |
++ return rq1; |
15786 |
++ |
15787 |
++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) |
15788 |
++ return rq1; |
15789 |
++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) |
15790 |
++ return rq2; |
15791 |
++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) |
15792 |
++ return rq1; |
15793 |
++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) |
15794 |
++ return rq2; |
15795 |
++ |
15796 |
++ s1 = blk_rq_pos(rq1); |
15797 |
++ s2 = blk_rq_pos(rq2); |
15798 |
++ |
15799 |
++ /* |
15800 |
++ * By definition, 1KiB is 2 sectors. |
15801 |
++ */ |
15802 |
++ back_max = bfqd->bfq_back_max * 2; |
15803 |
++ |
15804 |
++ /* |
15805 |
++ * Strict one way elevator _except_ in the case where we allow |
15806 |
++ * short backward seeks which are biased as twice the cost of a |
15807 |
++ * similar forward seek. |
15808 |
++ */ |
15809 |
++ if (s1 >= last) |
15810 |
++ d1 = s1 - last; |
15811 |
++ else if (s1 + back_max >= last) |
15812 |
++ d1 = (last - s1) * bfqd->bfq_back_penalty; |
15813 |
++ else |
15814 |
++ wrap |= BFQ_RQ1_WRAP; |
15815 |
++ |
15816 |
++ if (s2 >= last) |
15817 |
++ d2 = s2 - last; |
15818 |
++ else if (s2 + back_max >= last) |
15819 |
++ d2 = (last - s2) * bfqd->bfq_back_penalty; |
15820 |
++ else |
15821 |
++ wrap |= BFQ_RQ2_WRAP; |
15822 |
++ |
15823 |
++ /* Found required data */ |
15824 |
++ |
15825 |
++ /* |
15826 |
++ * By doing switch() on the bit mask "wrap" we avoid having to |
15827 |
++ * check two variables for all permutations: --> faster! |
15828 |
++ */ |
15829 |
++ switch (wrap) { |
15830 |
++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ |
15831 |
++ if (d1 < d2) |
15832 |
++ return rq1; |
15833 |
++ else if (d2 < d1) |
15834 |
++ return rq2; |
15835 |
++ else { |
15836 |
++ if (s1 >= s2) |
15837 |
++ return rq1; |
15838 |
++ else |
15839 |
++ return rq2; |
15840 |
++ } |
15841 |
++ |
15842 |
++ case BFQ_RQ2_WRAP: |
15843 |
++ return rq1; |
15844 |
++ case BFQ_RQ1_WRAP: |
15845 |
++ return rq2; |
15846 |
++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ |
15847 |
++ default: |
15848 |
++ /* |
15849 |
++ * Since both rqs are wrapped, |
15850 |
++ * start with the one that's further behind head |
15851 |
++ * (--> only *one* back seek required), |
15852 |
++ * since back seek takes more time than forward. |
15853 |
++ */ |
15854 |
++ if (s1 <= s2) |
15855 |
++ return rq1; |
15856 |
++ else |
15857 |
++ return rq2; |
15858 |
++ } |
15859 |
++} |
15860 |
++ |
15861 |
++static struct bfq_queue * |
15862 |
++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, |
15863 |
++ sector_t sector, struct rb_node **ret_parent, |
15864 |
++ struct rb_node ***rb_link) |
15865 |
++{ |
15866 |
++ struct rb_node **p, *parent; |
15867 |
++ struct bfq_queue *bfqq = NULL; |
15868 |
++ |
15869 |
++ parent = NULL; |
15870 |
++ p = &root->rb_node; |
15871 |
++ while (*p) { |
15872 |
++ struct rb_node **n; |
15873 |
++ |
15874 |
++ parent = *p; |
15875 |
++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
15876 |
++ |
15877 |
++ /* |
15878 |
++ * Sort strictly based on sector. Smallest to the left, |
15879 |
++ * largest to the right. |
15880 |
++ */ |
15881 |
++ if (sector > blk_rq_pos(bfqq->next_rq)) |
15882 |
++ n = &(*p)->rb_right; |
15883 |
++ else if (sector < blk_rq_pos(bfqq->next_rq)) |
15884 |
++ n = &(*p)->rb_left; |
15885 |
++ else |
15886 |
++ break; |
15887 |
++ p = n; |
15888 |
++ bfqq = NULL; |
15889 |
++ } |
15890 |
++ |
15891 |
++ *ret_parent = parent; |
15892 |
++ if (rb_link) |
15893 |
++ *rb_link = p; |
15894 |
++ |
15895 |
++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", |
15896 |
++ (long long unsigned)sector, |
15897 |
++ bfqq != NULL ? bfqq->pid : 0); |
15898 |
++ |
15899 |
++ return bfqq; |
15900 |
++} |
15901 |
++ |
15902 |
++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
15903 |
++{ |
15904 |
++ struct rb_node **p, *parent; |
15905 |
++ struct bfq_queue *__bfqq; |
15906 |
++ |
15907 |
++ if (bfqq->pos_root != NULL) { |
15908 |
++ rb_erase(&bfqq->pos_node, bfqq->pos_root); |
15909 |
++ bfqq->pos_root = NULL; |
15910 |
++ } |
15911 |
++ |
15912 |
++ if (bfq_class_idle(bfqq)) |
15913 |
++ return; |
15914 |
++ if (!bfqq->next_rq) |
15915 |
++ return; |
15916 |
++ |
15917 |
++ bfqq->pos_root = &bfqd->rq_pos_tree; |
15918 |
++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, |
15919 |
++ blk_rq_pos(bfqq->next_rq), &parent, &p); |
15920 |
++ if (__bfqq == NULL) { |
15921 |
++ rb_link_node(&bfqq->pos_node, parent, p); |
15922 |
++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); |
15923 |
++ } else |
15924 |
++ bfqq->pos_root = NULL; |
15925 |
++} |
15926 |
++ |
15927 |
++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, |
15928 |
++ struct bfq_queue *bfqq, |
15929 |
++ struct request *last) |
15930 |
++{ |
15931 |
++ struct rb_node *rbnext = rb_next(&last->rb_node); |
15932 |
++ struct rb_node *rbprev = rb_prev(&last->rb_node); |
15933 |
++ struct request *next = NULL, *prev = NULL; |
15934 |
++ |
15935 |
++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); |
15936 |
++ |
15937 |
++ if (rbprev != NULL) |
15938 |
++ prev = rb_entry_rq(rbprev); |
15939 |
++ |
15940 |
++ if (rbnext != NULL) |
15941 |
++ next = rb_entry_rq(rbnext); |
15942 |
++ else { |
15943 |
++ rbnext = rb_first(&bfqq->sort_list); |
15944 |
++ if (rbnext && rbnext != &last->rb_node) |
15945 |
++ next = rb_entry_rq(rbnext); |
15946 |
++ } |
15947 |
++ |
15948 |
++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); |
15949 |
++} |
15950 |
++ |
15951 |
++static void bfq_del_rq_rb(struct request *rq) |
15952 |
++{ |
15953 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
15954 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
15955 |
++ const int sync = rq_is_sync(rq); |
15956 |
++ |
15957 |
++ BUG_ON(bfqq->queued[sync] == 0); |
15958 |
++ bfqq->queued[sync]--; |
15959 |
++ bfqd->queued--; |
15960 |
++ |
15961 |
++ elv_rb_del(&bfqq->sort_list, rq); |
15962 |
++ |
15963 |
++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
15964 |
++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) |
15965 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
15966 |
++ /* |
15967 |
++ * Remove queue from request-position tree as it is empty. |
15968 |
++ */ |
15969 |
++ if (bfqq->pos_root != NULL) { |
15970 |
++ rb_erase(&bfqq->pos_node, bfqq->pos_root); |
15971 |
++ bfqq->pos_root = NULL; |
15972 |
++ } |
15973 |
++ } |
15974 |
++} |
15975 |
++ |
15976 |
++/* see the definition of bfq_async_charge_factor for details */ |
15977 |
++static inline unsigned long bfq_serv_to_charge(struct request *rq, |
15978 |
++ struct bfq_queue *bfqq) |
15979 |
++{ |
15980 |
++ return blk_rq_sectors(rq) * |
15981 |
++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * |
15982 |
++ bfq_async_charge_factor)); |
15983 |
++} |
15984 |
++ |
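For clarity, bfq_serv_to_charge() above inflates the charge of async, non-weight-raised queues by bfq_async_charge_factor, while sync (or weight-raised) queues are charged the plain request size. A worked example, assuming the default factor of 10 and an 8-sector request (figures are illustrative only, not part of the patch):

    async queue, raising_coeff == 1:  charge = 8 * (1 + 1 * 1 * 10) = 88 sectors
    sync queue:                       charge = 8 * (1 + 0)          =  8 sectors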
15985 |
++/** |
15986 |
++ * bfq_updated_next_req - update the queue after a new next_rq selection. |
15987 |
++ * @bfqd: the device data the queue belongs to. |
15988 |
++ * @bfqq: the queue to update. |
15989 |
++ * |
15990 |
++ * If the first request of a queue changes we make sure that the queue |
15991 |
++ * has enough budget to serve at least its first request (if the |
15992 |
++ * request has grown). We do this because if the queue has not enough |
15993 |
++ * budget for its first request, it has to go through two dispatch |
15994 |
++ * rounds to actually get it dispatched. |
15995 |
++ */ |
15996 |
++static void bfq_updated_next_req(struct bfq_data *bfqd, |
15997 |
++ struct bfq_queue *bfqq) |
15998 |
++{ |
15999 |
++ struct bfq_entity *entity = &bfqq->entity; |
16000 |
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
16001 |
++ struct request *next_rq = bfqq->next_rq; |
16002 |
++ unsigned long new_budget; |
16003 |
++ |
16004 |
++ if (next_rq == NULL) |
16005 |
++ return; |
16006 |
++ |
16007 |
++ if (bfqq == bfqd->in_service_queue) |
16008 |
++ /* |
16009 |
++ * In order not to break guarantees, budgets cannot be |
16010 |
++ * changed after an entity has been selected. |
16011 |
++ */ |
16012 |
++ return; |
16013 |
++ |
16014 |
++ BUG_ON(entity->tree != &st->active); |
16015 |
++ BUG_ON(entity == entity->sched_data->in_service_entity); |
16016 |
++ |
16017 |
++ new_budget = max_t(unsigned long, bfqq->max_budget, |
16018 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
16019 |
++ entity->budget = new_budget; |
16020 |
++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); |
16021 |
++ bfq_activate_bfqq(bfqd, bfqq); |
16022 |
++} |
16023 |
++ |
16024 |
++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) |
16025 |
++{ |
16026 |
++ u64 dur; |
16027 |
++ |
16028 |
++ if (bfqd->bfq_raising_max_time > 0) |
16029 |
++ return bfqd->bfq_raising_max_time; |
16030 |
++ |
16031 |
++ dur = bfqd->RT_prod; |
16032 |
++ do_div(dur, bfqd->peak_rate); |
16033 |
++ |
16034 |
++ return dur; |
16035 |
++} |
16036 |
++ |
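bfq_wrais_duration() above implements the duration = (R / r) * T rule described earlier, with R * T precomputed in bfqd->RT_prod and r being the estimated peak rate. A minimal sketch of the same arithmetic in plain C (hypothetical names, not part of the patch):

    /* Sketch only: dur = (R * T) / r, i.e. rt_prod divided by the peak rate. */
    static unsigned long wrais_duration_sketch(unsigned long long rt_prod,
                                               unsigned long long peak_rate)
    {
            return (unsigned long)(rt_prod / peak_rate);
    }

With rt_prod = R_rot * T_rot, a disk whose measured peak rate equals R_rot gets a raising duration of exactly T_rot (about 5.5 s); a disk measured at twice that rate gets half of it.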
16037 |
++static void bfq_add_rq_rb(struct request *rq) |
16038 |
++{ |
16039 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
16040 |
++ struct bfq_entity *entity = &bfqq->entity; |
16041 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
16042 |
++ struct request *next_rq, *prev; |
16043 |
++ unsigned long old_raising_coeff = bfqq->raising_coeff; |
16044 |
++ int idle_for_long_time = 0; |
16045 |
++ |
16046 |
++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); |
16047 |
++ bfqq->queued[rq_is_sync(rq)]++; |
16048 |
++ bfqd->queued++; |
16049 |
++ |
16050 |
++ elv_rb_add(&bfqq->sort_list, rq); |
16051 |
++ |
16052 |
++ /* |
16053 |
++ * Check if this request is a better next-serve candidate. |
16054 |
++ */ |
16055 |
++ prev = bfqq->next_rq; |
16056 |
++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); |
16057 |
++ BUG_ON(next_rq == NULL); |
16058 |
++ bfqq->next_rq = next_rq; |
16059 |
++ |
16060 |
++ /* |
16061 |
++ * Adjust priority tree position, if next_rq changes. |
16062 |
++ */ |
16063 |
++ if (prev != bfqq->next_rq) |
16064 |
++ bfq_rq_pos_tree_add(bfqd, bfqq); |
16065 |
++ |
16066 |
++ if (!bfq_bfqq_busy(bfqq)) { |
16067 |
++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && |
16068 |
++ time_is_before_jiffies(bfqq->soft_rt_next_start); |
16069 |
++ idle_for_long_time = time_is_before_jiffies( |
16070 |
++ bfqq->budget_timeout + |
16071 |
++ bfqd->bfq_raising_min_idle_time); |
16072 |
++ entity->budget = max_t(unsigned long, bfqq->max_budget, |
16073 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
16074 |
++ |
16075 |
++ if (!bfqd->low_latency) |
16076 |
++ goto add_bfqq_busy; |
16077 |
++ |
16078 |
++ /* |
16079 |
++ * If the queue is not being boosted and has been idle |
16080 |
++ * for enough time, start a weight-raising period |
16081 |
++ */ |
16082 |
++ if (old_raising_coeff == 1 && |
16083 |
++ (idle_for_long_time || soft_rt)) { |
16084 |
++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
16085 |
++ if (idle_for_long_time) |
16086 |
++ bfqq->raising_cur_max_time = |
16087 |
++ bfq_wrais_duration(bfqd); |
16088 |
++ else |
16089 |
++ bfqq->raising_cur_max_time = |
16090 |
++ bfqd->bfq_raising_rt_max_time; |
16091 |
++ bfq_log_bfqq(bfqd, bfqq, |
16092 |
++ "wrais starting at %lu, " |
16093 |
++ "rais_max_time %u", |
16094 |
++ jiffies, |
16095 |
++ jiffies_to_msecs(bfqq-> |
16096 |
++ raising_cur_max_time)); |
16097 |
++ } else if (old_raising_coeff > 1) { |
16098 |
++ if (idle_for_long_time) |
16099 |
++ bfqq->raising_cur_max_time = |
16100 |
++ bfq_wrais_duration(bfqd); |
16101 |
++ else if (bfqq->raising_cur_max_time == |
16102 |
++ bfqd->bfq_raising_rt_max_time && |
16103 |
++ !soft_rt) { |
16104 |
++ bfqq->raising_coeff = 1; |
16105 |
++ bfq_log_bfqq(bfqd, bfqq, |
16106 |
++ "wrais ending at %lu, " |
16107 |
++ "rais_max_time %u", |
16108 |
++ jiffies, |
16109 |
++ jiffies_to_msecs(bfqq-> |
16110 |
++ raising_cur_max_time)); |
16111 |
++ } else if (time_before( |
16112 |
++ bfqq->last_rais_start_finish + |
16113 |
++ bfqq->raising_cur_max_time, |
16114 |
++ jiffies + |
16115 |
++ bfqd->bfq_raising_rt_max_time) && |
16116 |
++ soft_rt) { |
16117 |
++ /* |
16118 |
++ * |
16119 |
++ * The remaining weight-raising time is lower |
16120 |
++ * than bfqd->bfq_raising_rt_max_time, which |
16121 |
++ * means that the application is enjoying |
16122 |
++ * weight raising either because deemed soft rt |
16123 |
++ * in the near past, or because deemed |
16124 |
++ * interactive a long ago. In both cases, |
16125 |
++ * interactive long ago. In both cases, |
16126 |
++ * raising time for the application to the |
16127 |
++ * weight-raising duration for soft rt |
16128 |
++ * applications would not cause any latency |
16129 |
++ * increase for the application (as the new |
16130 |
++ * duration would be higher than the remaining |
16131 |
++ * time). |
16132 |
++ * |
16133 |
++ * In addition, the application is now meeting |
16134 |
++ * the requirements for being deemed soft rt. |
16135 |
++ * In the end we can correctly and safely |
16136 |
++ * (re)charge the weight-raising duration for |
16137 |
++ * the application with the weight-raising |
16138 |
++ * duration for soft rt applications. |
16139 |
++ * |
16140 |
++ * In particular, doing this recharge now, i.e., |
16141 |
++ * before the weight-raising period for the |
16142 |
++ * application finishes, reduces the probability |
16143 |
++ * of the following negative scenario: |
16144 |
++ * 1) the weight of a soft rt application is |
16145 |
++ * raised at startup (as for any newly |
16146 |
++ * created application), |
16147 |
++ * 2) since the application is not interactive, |
16148 |
++ * at a certain time weight-raising is |
16149 |
++ * stopped for the application, |
16150 |
++ * 3) at that time the application happens to |
16151 |
++ * still have pending requests, and hence |
16152 |
++ * is destined to not have a chance to be |
16153 |
++ * deemed soft rt before these requests are |
16154 |
++ * completed (see the comments to the |
16155 |
++ * function bfq_bfqq_softrt_next_start() |
16156 |
++ * for details on soft rt detection), |
16157 |
++ * 4) these pending requests experience a high |
16158 |
++ * latency because the application is not |
16159 |
++ * weight-raised while they are pending. |
16160 |
++ */ |
16161 |
++ bfqq->last_rais_start_finish = jiffies; |
16162 |
++ bfqq->raising_cur_max_time = |
16163 |
++ bfqd->bfq_raising_rt_max_time; |
16164 |
++ } |
16165 |
++ } |
16166 |
++ if (old_raising_coeff != bfqq->raising_coeff) |
16167 |
++ entity->ioprio_changed = 1; |
16168 |
++add_bfqq_busy: |
16169 |
++ bfqq->last_idle_bklogged = jiffies; |
16170 |
++ bfqq->service_from_backlogged = 0; |
16171 |
++ bfq_clear_bfqq_softrt_update(bfqq); |
16172 |
++ bfq_add_bfqq_busy(bfqd, bfqq); |
16173 |
++ } else { |
16174 |
++ if (bfqd->low_latency && old_raising_coeff == 1 && |
16175 |
++ !rq_is_sync(rq) && |
16176 |
++ time_is_before_jiffies( |
16177 |
++ bfqq->last_rais_start_finish + |
16178 |
++ bfqd->bfq_raising_min_inter_arr_async)) { |
16179 |
++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
16180 |
++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); |
16181 |
++ |
16182 |
++ bfqd->raised_busy_queues++; |
16183 |
++ entity->ioprio_changed = 1; |
16184 |
++ bfq_log_bfqq(bfqd, bfqq, |
16185 |
++ "non-idle wrais starting at %lu, " |
16186 |
++ "rais_max_time %u", |
16187 |
++ jiffies, |
16188 |
++ jiffies_to_msecs(bfqq-> |
16189 |
++ raising_cur_max_time)); |
16190 |
++ } |
16191 |
++ bfq_updated_next_req(bfqd, bfqq); |
16192 |
++ } |
16193 |
++ |
16194 |
++ if (bfqd->low_latency && |
16195 |
++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || |
16196 |
++ idle_for_long_time)) |
16197 |
++ bfqq->last_rais_start_finish = jiffies; |
16198 |
++} |
16199 |
++ |
16200 |
++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) |
16201 |
++{ |
16202 |
++ elv_rb_del(&bfqq->sort_list, rq); |
16203 |
++ bfqq->queued[rq_is_sync(rq)]--; |
16204 |
++ bfqq->bfqd->queued--; |
16205 |
++ bfq_add_rq_rb(rq); |
16206 |
++} |
16207 |
++ |
16208 |
++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, |
16209 |
++ struct bio *bio) |
16210 |
++{ |
16211 |
++ struct task_struct *tsk = current; |
16212 |
++ struct bfq_io_cq *bic; |
16213 |
++ struct bfq_queue *bfqq; |
16214 |
++ |
16215 |
++ bic = bfq_bic_lookup(bfqd, tsk->io_context); |
16216 |
++ if (bic == NULL) |
16217 |
++ return NULL; |
16218 |
++ |
16219 |
++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
16220 |
++ if (bfqq != NULL) |
16221 |
++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); |
16222 |
++ |
16223 |
++ return NULL; |
16224 |
++} |
16225 |
++ |
16226 |
++static void bfq_activate_request(struct request_queue *q, struct request *rq) |
16227 |
++{ |
16228 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
16229 |
++ |
16230 |
++ bfqd->rq_in_driver++; |
16231 |
++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); |
16232 |
++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", |
16233 |
++ (long long unsigned)bfqd->last_position); |
16234 |
++} |
16235 |
++ |
16236 |
++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) |
16237 |
++{ |
16238 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
16239 |
++ |
16240 |
++ WARN_ON(bfqd->rq_in_driver == 0); |
16241 |
++ bfqd->rq_in_driver--; |
16242 |
++} |
16243 |
++ |
16244 |
++static void bfq_remove_request(struct request *rq) |
16245 |
++{ |
16246 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
16247 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
16248 |
++ |
16249 |
++ if (bfqq->next_rq == rq) { |
16250 |
++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); |
16251 |
++ bfq_updated_next_req(bfqd, bfqq); |
16252 |
++ } |
16253 |
++ |
16254 |
++ list_del_init(&rq->queuelist); |
16255 |
++ bfq_del_rq_rb(rq); |
16256 |
++ |
16257 |
++ if (rq->cmd_flags & REQ_META) { |
16258 |
++ WARN_ON(bfqq->meta_pending == 0); |
16259 |
++ bfqq->meta_pending--; |
16260 |
++ } |
16261 |
++} |
16262 |
++ |
16263 |
++static int bfq_merge(struct request_queue *q, struct request **req, |
16264 |
++ struct bio *bio) |
16265 |
++{ |
16266 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
16267 |
++ struct request *__rq; |
16268 |
++ |
16269 |
++ __rq = bfq_find_rq_fmerge(bfqd, bio); |
16270 |
++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { |
16271 |
++ *req = __rq; |
16272 |
++ return ELEVATOR_FRONT_MERGE; |
16273 |
++ } |
16274 |
++ |
16275 |
++ return ELEVATOR_NO_MERGE; |
16276 |
++} |
16277 |
++ |
16278 |
++static void bfq_merged_request(struct request_queue *q, struct request *req, |
16279 |
++ int type) |
16280 |
++{ |
16281 |
++ if (type == ELEVATOR_FRONT_MERGE) { |
16282 |
++ struct bfq_queue *bfqq = RQ_BFQQ(req); |
16283 |
++ |
16284 |
++ bfq_reposition_rq_rb(bfqq, req); |
16285 |
++ } |
16286 |
++} |
16287 |
++ |
16288 |
++static void bfq_merged_requests(struct request_queue *q, struct request *rq, |
16289 |
++ struct request *next) |
16290 |
++{ |
16291 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
16292 |
++ |
16293 |
++ /* |
16294 |
++ * Reposition in fifo if next is older than rq. |
16295 |
++ */ |
16296 |
++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && |
16297 |
++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) { |
16298 |
++ list_move(&rq->queuelist, &next->queuelist); |
16299 |
++ rq_set_fifo_time(rq, rq_fifo_time(next)); |
16300 |
++ } |
16301 |
++ |
16302 |
++ if (bfqq->next_rq == next) |
16303 |
++ bfqq->next_rq = rq; |
16304 |
++ |
16305 |
++ bfq_remove_request(next); |
16306 |
++} |
16307 |
++ |
16308 |
++/* Must be called with bfqq != NULL */ |
16309 |
++static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq) |
16310 |
++{ |
16311 |
++ BUG_ON(bfqq == NULL); |
16312 |
++ if (bfq_bfqq_busy(bfqq)) |
16313 |
++ bfqq->bfqd->raised_busy_queues--; |
16314 |
++ bfqq->raising_coeff = 1; |
16315 |
++ bfqq->raising_cur_max_time = 0; |
16316 |
++ /* Trigger a weight change on the next activation of the queue */ |
16317 |
++ bfqq->entity.ioprio_changed = 1; |
16318 |
++} |
16319 |
++ |
16320 |
++static void bfq_end_raising_async_queues(struct bfq_data *bfqd, |
16321 |
++ struct bfq_group *bfqg) |
16322 |
++{ |
16323 |
++ int i, j; |
16324 |
++ |
16325 |
++ for (i = 0; i < 2; i++) |
16326 |
++ for (j = 0; j < IOPRIO_BE_NR; j++) |
16327 |
++ if (bfqg->async_bfqq[i][j] != NULL) |
16328 |
++ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]); |
16329 |
++ if (bfqg->async_idle_bfqq != NULL) |
16330 |
++ bfq_bfqq_end_raising(bfqg->async_idle_bfqq); |
16331 |
++} |
16332 |
++ |
16333 |
++static void bfq_end_raising(struct bfq_data *bfqd) |
16334 |
++{ |
16335 |
++ struct bfq_queue *bfqq; |
16336 |
++ |
16337 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
16338 |
++ |
16339 |
++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) |
16340 |
++ bfq_bfqq_end_raising(bfqq); |
16341 |
++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) |
16342 |
++ bfq_bfqq_end_raising(bfqq); |
16343 |
++ bfq_end_raising_async(bfqd); |
16344 |
++ |
16345 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
16346 |
++} |
16347 |
++ |
16348 |
++static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
16349 |
++ struct bio *bio) |
16350 |
++{ |
16351 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
16352 |
++ struct bfq_io_cq *bic; |
16353 |
++ struct bfq_queue *bfqq; |
16354 |
++ |
16355 |
++ /* |
16356 |
++ * Disallow merge of a sync bio into an async request. |
16357 |
++ */ |
16358 |
++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
16359 |
++ return 0; |
16360 |
++ |
16361 |
++ /* |
16362 |
++ * Lookup the bfqq that this bio will be queued with. Allow |
16363 |
++ * merge only if rq is queued there. |
16364 |
++ * Queue lock is held here. |
16365 |
++ */ |
16366 |
++ bic = bfq_bic_lookup(bfqd, current->io_context); |
16367 |
++ if (bic == NULL) |
16368 |
++ return 0; |
16369 |
++ |
16370 |
++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
16371 |
++ return bfqq == RQ_BFQQ(rq); |
16372 |
++} |
16373 |
++ |
16374 |
++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
16375 |
++ struct bfq_queue *bfqq) |
16376 |
++{ |
16377 |
++ if (bfqq != NULL) { |
16378 |
++ bfq_mark_bfqq_must_alloc(bfqq); |
16379 |
++ bfq_mark_bfqq_budget_new(bfqq); |
16380 |
++ bfq_clear_bfqq_fifo_expire(bfqq); |
16381 |
++ |
16382 |
++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
16383 |
++ |
16384 |
++ bfq_log_bfqq(bfqd, bfqq, |
16385 |
++ "set_in_service_queue, cur-budget = %lu", |
16386 |
++ bfqq->entity.budget); |
16387 |
++ } |
16388 |
++ |
16389 |
++ bfqd->in_service_queue = bfqq; |
16390 |
++} |
16391 |
++ |
16392 |
++/* |
16393 |
++ * Get and set a new queue for service. |
16394 |
++ */ |
16395 |
++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd, |
16396 |
++ struct bfq_queue *bfqq) |
16397 |
++{ |
16398 |
++ if (!bfqq) |
16399 |
++ bfqq = bfq_get_next_queue(bfqd); |
16400 |
++ else |
16401 |
++ bfq_get_next_queue_forced(bfqd, bfqq); |
16402 |
++ |
16403 |
++ __bfq_set_in_service_queue(bfqd, bfqq); |
16404 |
++ return bfqq; |
16405 |
++} |
16406 |
++ |
16407 |
++static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, |
16408 |
++ struct request *rq) |
16409 |
++{ |
16410 |
++ if (blk_rq_pos(rq) >= bfqd->last_position) |
16411 |
++ return blk_rq_pos(rq) - bfqd->last_position; |
16412 |
++ else |
16413 |
++ return bfqd->last_position - blk_rq_pos(rq); |
16414 |
++} |
16415 |
++ |
16416 |
++/* |
16417 |
++ * Return true if bfqq has no request pending and rq is close enough to |
16418 |
++ * bfqd->last_position, or if rq is closer to bfqd->last_position than |
16419 |
++ * bfqq->next_rq |
16420 |
++ */ |
16421 |
++static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) |
16422 |
++{ |
16423 |
++ return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; |
16424 |
++} |
16425 |
++ |
16426 |
++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
16427 |
++{ |
16428 |
++ struct rb_root *root = &bfqd->rq_pos_tree; |
16429 |
++ struct rb_node *parent, *node; |
16430 |
++ struct bfq_queue *__bfqq; |
16431 |
++ sector_t sector = bfqd->last_position; |
16432 |
++ |
16433 |
++ if (RB_EMPTY_ROOT(root)) |
16434 |
++ return NULL; |
16435 |
++ |
16436 |
++ /* |
16437 |
++ * First, if we find a request starting at the end of the last |
16438 |
++ * request, choose it. |
16439 |
++ */ |
16440 |
++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); |
16441 |
++ if (__bfqq != NULL) |
16442 |
++ return __bfqq; |
16443 |
++ |
16444 |
++ /* |
16445 |
++ * If the exact sector wasn't found, the parent of the NULL leaf |
16446 |
++ * will contain the closest sector (rq_pos_tree sorted by next_request |
16447 |
++ * position). |
16448 |
++ */ |
16449 |
++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
16450 |
++ if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
16451 |
++ return __bfqq; |
16452 |
++ |
16453 |
++ if (blk_rq_pos(__bfqq->next_rq) < sector) |
16454 |
++ node = rb_next(&__bfqq->pos_node); |
16455 |
++ else |
16456 |
++ node = rb_prev(&__bfqq->pos_node); |
16457 |
++ if (node == NULL) |
16458 |
++ return NULL; |
16459 |
++ |
16460 |
++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); |
16461 |
++ if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
16462 |
++ return __bfqq; |
16463 |
++ |
16464 |
++ return NULL; |
16465 |
++} |
16466 |
++ |
16467 |
++/* |
16468 |
++ * bfqd - obvious |
16469 |
++ * cur_bfqq - passed in so that we don't decide that the current queue |
16470 |
++ * is closely cooperating with itself. |
16471 |
++ * |
16472 |
++ * We are assuming that cur_bfqq has dispatched at least one request, |
16473 |
++ * and that bfqd->last_position reflects a position on the disk associated |
16474 |
++ * with the I/O issued by cur_bfqq. |
16475 |
++ */ |
16476 |
++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
16477 |
++ struct bfq_queue *cur_bfqq) |
16478 |
++{ |
16479 |
++ struct bfq_queue *bfqq; |
16480 |
++ |
16481 |
++ if (bfq_class_idle(cur_bfqq)) |
16482 |
++ return NULL; |
16483 |
++ if (!bfq_bfqq_sync(cur_bfqq)) |
16484 |
++ return NULL; |
16485 |
++ if (BFQQ_SEEKY(cur_bfqq)) |
16486 |
++ return NULL; |
16487 |
++ |
16488 |
++ /* If device has only one backlogged bfq_queue, don't search. */ |
16489 |
++ if (bfqd->busy_queues == 1) |
16490 |
++ return NULL; |
16491 |
++ |
16492 |
++ /* |
16493 |
++ * We should notice if some of the queues are cooperating, e.g. |
16494 |
++ * working closely on the same area of the disk. In that case, |
16495 |
++ * we can group them together and not waste time idling. |
16496 |
++ */ |
16497 |
++ bfqq = bfqq_close(bfqd); |
16498 |
++ if (bfqq == NULL || bfqq == cur_bfqq) |
16499 |
++ return NULL; |
16500 |
++ |
16501 |
++ /* |
16502 |
++ * Do not merge queues from different bfq_groups. |
16503 |
++ */ |
16504 |
++ if (bfqq->entity.parent != cur_bfqq->entity.parent) |
16505 |
++ return NULL; |
16506 |
++ |
16507 |
++ /* |
16508 |
++ * It only makes sense to merge sync queues. |
16509 |
++ */ |
16510 |
++ if (!bfq_bfqq_sync(bfqq)) |
16511 |
++ return NULL; |
16512 |
++ if (BFQQ_SEEKY(bfqq)) |
16513 |
++ return NULL; |
16514 |
++ |
16515 |
++ /* |
16516 |
++ * Do not merge queues of different priority classes. |
16517 |
++ */ |
16518 |
++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) |
16519 |
++ return NULL; |
16520 |
++ |
16521 |
++ return bfqq; |
16522 |
++} |
16523 |
++ |
16524 |
++/* |
16525 |
++ * If enough samples have been computed, return the current max budget |
16526 |
++ * stored in bfqd, which is dynamically updated according to the |
16527 |
++ * estimated disk peak rate; otherwise return the default max budget |
16528 |
++ */ |
16529 |
++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) |
16530 |
++{ |
16531 |
++ if (bfqd->budgets_assigned < 194) |
16532 |
++ return bfq_default_max_budget; |
16533 |
++ else |
16534 |
++ return bfqd->bfq_max_budget; |
16535 |
++} |
16536 |
++ |
16537 |
++/* |
16538 |
++ * Return min budget, which is a fraction of the current or default |
16539 |
++ * max budget (trying with 1/32) |
16540 |
++ */ |
16541 |
++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) |
16542 |
++{ |
16543 |
++ if (bfqd->budgets_assigned < 194) |
16544 |
++ return bfq_default_max_budget / 32; |
16545 |
++ else |
16546 |
++ return bfqd->bfq_max_budget / 32; |
16547 |
++} |
16548 |
++ |
16549 |
++/* |
16550 |
++ * Decides whether idling should be done for the given device and the |
16551 |
++ * given in-service queue. |
16552 |
++ */ |
16553 |
++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, |
16554 |
++ struct bfq_queue *in_service_bfqq) |
16555 |
++{ |
16556 |
++ if (in_service_bfqq == NULL) |
16557 |
++ return false; |
16558 |
++ /* |
16559 |
++ * If the device is non-rotational, and hence has no seek penalty, |
16560 |
++ * disable idling; but do so only if: |
16561 |
++ * - device does not support queuing, otherwise we still have |
16562 |
++ * a problem with sync vs async workloads; |
16563 |
++ * - the queue is not weight-raised, to preserve guarantees. |
16564 |
++ */ |
16565 |
++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && |
16566 |
++ in_service_bfqq->raising_coeff == 1); |
16567 |
++} |
16568 |
++ |
16569 |
++static void bfq_arm_slice_timer(struct bfq_data *bfqd) |
16570 |
++{ |
16571 |
++ struct bfq_queue *bfqq = bfqd->in_service_queue; |
16572 |
++ struct bfq_io_cq *bic; |
16573 |
++ unsigned long sl; |
16574 |
++ |
16575 |
++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
16576 |
++ |
16577 |
++ /* Tasks have exited, don't wait. */ |
16578 |
++ bic = bfqd->in_service_bic; |
16579 |
++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) |
16580 |
++ return; |
16581 |
++ |
16582 |
++ bfq_mark_bfqq_wait_request(bfqq); |
16583 |
++ |
16584 |
++ /* |
16585 |
++ * We don't want to idle for seeks, but we do want to allow |
16586 |
++ * fair distribution of slice time for a process doing back-to-back |
16587 |
++ * seeks. So allow a little bit of time for it to submit a new rq. |
16588 |
++ * |
16589 |
++ * To prevent processes with (partly) seeky workloads from |
16590 |
++ * being too ill-treated, grant them a small fraction of the |
16591 |
++ * assigned budget before reducing the waiting time to |
16592 |
++ * BFQ_MIN_TT. This happened to help reduce latency. |
16593 |
++ */ |
16594 |
++ sl = bfqd->bfq_slice_idle; |
16595 |
++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && |
16596 |
++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 && |
16597 |
++ bfqq->raising_coeff == 1) |
16598 |
++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); |
16599 |
++ else if (bfqq->raising_coeff > 1) |
16600 |
++ sl = sl * 3; |
16601 |
++ bfqd->last_idling_start = ktime_get(); |
16602 |
++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); |
16603 |
++ bfq_log(bfqd, "arm idle: %u/%u ms", |
16604 |
++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); |
16605 |
++} |
16606 |
++ |
16607 |
++/* |
16608 |
++ * Set the maximum time for the in-service queue to consume its |
16609 |
++ * budget. This prevents seeky processes from lowering the disk |
16610 |
++ * throughput (always guaranteed with a time slice scheme as in CFQ). |
16611 |
++ */ |
16612 |
++static void bfq_set_budget_timeout(struct bfq_data *bfqd) |
16613 |
++{ |
16614 |
++ struct bfq_queue *bfqq = bfqd->in_service_queue; |
16615 |
++ unsigned int timeout_coeff; |
16616 |
++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) |
16617 |
++ timeout_coeff = 1; |
16618 |
++ else |
16619 |
++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; |
16620 |
++ |
16621 |
++ bfqd->last_budget_start = ktime_get(); |
16622 |
++ |
16623 |
++ bfq_clear_bfqq_budget_new(bfqq); |
16624 |
++ bfqq->budget_timeout = jiffies + |
16625 |
++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; |
16626 |
++ |
16627 |
++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", |
16628 |
++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * |
16629 |
++ timeout_coeff)); |
16630 |
++} |
16631 |
++ |
16632 |
++/* |
16633 |
++ * Move request from internal lists to the request queue dispatch list. |
16634 |
++ */ |
16635 |
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) |
16636 |
++{ |
16637 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
16638 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
16639 |
++ |
16640 |
++ bfq_remove_request(rq); |
16641 |
++ bfqq->dispatched++; |
16642 |
++ elv_dispatch_sort(q, rq); |
16643 |
++ |
16644 |
++ if (bfq_bfqq_sync(bfqq)) |
16645 |
++ bfqd->sync_flight++; |
16646 |
++} |
16647 |
++ |
16648 |
++/* |
16649 |
++ * Return expired entry, or NULL to just start from scratch in rbtree. |
16650 |
++ */ |
16651 |
++static struct request *bfq_check_fifo(struct bfq_queue *bfqq) |
16652 |
++{ |
16653 |
++ struct request *rq = NULL; |
16654 |
++ |
16655 |
++ if (bfq_bfqq_fifo_expire(bfqq)) |
16656 |
++ return NULL; |
16657 |
++ |
16658 |
++ bfq_mark_bfqq_fifo_expire(bfqq); |
16659 |
++ |
16660 |
++ if (list_empty(&bfqq->fifo)) |
16661 |
++ return NULL; |
16662 |
++ |
16663 |
++ rq = rq_entry_fifo(bfqq->fifo.next); |
16664 |
++ |
16665 |
++ if (time_before(jiffies, rq_fifo_time(rq))) |
16666 |
++ return NULL; |
16667 |
++ |
16668 |
++ return rq; |
16669 |
++} |
16670 |
++ |
16671 |
++/* |
16672 |
++ * Must be called with the queue_lock held. |
16673 |
++ */ |
16674 |
++static int bfqq_process_refs(struct bfq_queue *bfqq) |
16675 |
++{ |
16676 |
++ int process_refs, io_refs; |
16677 |
++ |
16678 |
++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
16679 |
++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
16680 |
++ BUG_ON(process_refs < 0); |
16681 |
++ return process_refs; |
16682 |
++} |
16683 |
++ |
16684 |
++static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
16685 |
++{ |
16686 |
++ int process_refs, new_process_refs; |
16687 |
++ struct bfq_queue *__bfqq; |
16688 |
++ |
16689 |
++ /* |
16690 |
++ * If there are no process references on the new_bfqq, then it is |
16691 |
++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
16692 |
++ * may have dropped their last reference (not just their last process |
16693 |
++ * reference). |
16694 |
++ */ |
16695 |
++ if (!bfqq_process_refs(new_bfqq)) |
16696 |
++ return; |
16697 |
++ |
16698 |
++ /* Avoid a circular list and skip interim queue merges. */ |
16699 |
++ while ((__bfqq = new_bfqq->new_bfqq)) { |
16700 |
++ if (__bfqq == bfqq) |
16701 |
++ return; |
16702 |
++ new_bfqq = __bfqq; |
16703 |
++ } |
16704 |
++ |
16705 |
++ process_refs = bfqq_process_refs(bfqq); |
16706 |
++ new_process_refs = bfqq_process_refs(new_bfqq); |
16707 |
++ /* |
16708 |
++ * If the process for the bfqq has gone away, there is no |
16709 |
++ * sense in merging the queues. |
16710 |
++ */ |
16711 |
++ if (process_refs == 0 || new_process_refs == 0) |
16712 |
++ return; |
16713 |
++ |
16714 |
++ /* |
16715 |
++ * Merge in the direction of the lesser amount of work. |
16716 |
++ */ |
16717 |
++ if (new_process_refs >= process_refs) { |
16718 |
++ bfqq->new_bfqq = new_bfqq; |
16719 |
++ atomic_add(process_refs, &new_bfqq->ref); |
16720 |
++ } else { |
16721 |
++ new_bfqq->new_bfqq = bfqq; |
16722 |
++ atomic_add(new_process_refs, &bfqq->ref); |
16723 |
++ } |
16724 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
16725 |
++ new_bfqq->pid); |
16726 |
++} |
16727 |
++ |
16728 |
++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
16729 |
++{ |
16730 |
++ struct bfq_entity *entity = &bfqq->entity; |
16731 |
++ return entity->budget - entity->service; |
16732 |
++} |
16733 |
++ |
16734 |
++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
16735 |
++{ |
16736 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
16737 |
++ |
16738 |
++ __bfq_bfqd_reset_in_service(bfqd); |
16739 |
++ |
16740 |
++ /* |
16741 |
++ * If this bfqq is shared between multiple processes, check |
16742 |
++ * to make sure that those processes are still issuing I/Os |
16743 |
++ * within the mean seek distance. If not, it may be time to |
16744 |
++ * break the queues apart again. |
16745 |
++ */ |
16746 |
++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) |
16747 |
++ bfq_mark_bfqq_split_coop(bfqq); |
16748 |
++ |
16749 |
++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
16750 |
++ /* |
16751 |
++ * overloading budget_timeout field to store when |
16752 |
++ * the queue remains with no backlog, used by |
16753 |
++ * the weight-raising mechanism |
16754 |
++ */ |
16755 |
++ bfqq->budget_timeout = jiffies; |
16756 |
++ bfq_del_bfqq_busy(bfqd, bfqq, 1); |
16757 |
++ } else { |
16758 |
++ bfq_activate_bfqq(bfqd, bfqq); |
16759 |
++ /* |
16760 |
++ * Resort priority tree of potential close cooperators. |
16761 |
++ */ |
16762 |
++ bfq_rq_pos_tree_add(bfqd, bfqq); |
16763 |
++ } |
16764 |
++} |
16765 |
++ |
16766 |
++/** |
16767 |
++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. |
16768 |
++ * @bfqd: device data. |
16769 |
++ * @bfqq: queue to update. |
16770 |
++ * @reason: reason for expiration. |
16771 |
++ * |
16772 |
++ * Handle the feedback on @bfqq budget. See the body for detailed |
16773 |
++ * comments. |
16774 |
++ */ |
16775 |
++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
16776 |
++ struct bfq_queue *bfqq, |
16777 |
++ enum bfqq_expiration reason) |
16778 |
++{ |
16779 |
++ struct request *next_rq; |
16780 |
++ unsigned long budget, min_budget; |
16781 |
++ |
16782 |
++ budget = bfqq->max_budget; |
16783 |
++ min_budget = bfq_min_budget(bfqd); |
16784 |
++ |
16785 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
16786 |
++ |
16787 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", |
16788 |
++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); |
16789 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", |
16790 |
++ budget, bfq_min_budget(bfqd)); |
16791 |
++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", |
16792 |
++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); |
16793 |
++ |
16794 |
++ if (bfq_bfqq_sync(bfqq)) { |
16795 |
++ switch (reason) { |
16796 |
++ /* |
16797 |
++ * Caveat: in all the following cases we trade latency |
16798 |
++ * for throughput. |
16799 |
++ */ |
16800 |
++ case BFQ_BFQQ_TOO_IDLE: |
16801 |
++ /* |
16802 |
++ * This is the only case where we may reduce |
16803 |
++ * the budget: if there are no requests of the |
16804 |
++ * process still waiting for completion, then |
16805 |
++ * we assume (tentatively) that the timer has |
16806 |
++ * expired because the batch of requests of |
16807 |
++ * the process could have been served with a |
16808 |
++ * smaller budget. Hence, betting that |
16809 |
++ * the process will behave in the same way when it |
16810 |
++ * becomes backlogged again, we reduce its |
16811 |
++ * next budget. As long as we guess right, |
16812 |
++ * this budget cut reduces the latency |
16813 |
++ * experienced by the process. |
16814 |
++ * |
16815 |
++ * However, if there are still outstanding |
16816 |
++ * requests, then the process may have not yet |
16817 |
++ * issued its next request just because it is |
16818 |
++ * still waiting for the completion of some of |
16819 |
++ * the still outstanding ones. So in this |
16820 |
++ * subcase we do not reduce its budget, on the |
16821 |
++ * contrary we increase it to possibly boost |
16822 |
++ * the throughput, as discussed in the |
16823 |
++ * comments to the BUDGET_TIMEOUT case. |
16824 |
++ */ |
16825 |
++ if (bfqq->dispatched > 0) /* still outstanding reqs */ |
16826 |
++ budget = min(budget * 2, bfqd->bfq_max_budget); |
16827 |
++ else { |
16828 |
++ if (budget > 5 * min_budget) |
16829 |
++ budget -= 4 * min_budget; |
16830 |
++ else |
16831 |
++ budget = min_budget; |
16832 |
++ } |
16833 |
++ break; |
16834 |
++ case BFQ_BFQQ_BUDGET_TIMEOUT: |
16835 |
++ /* |
16836 |
++ * We double the budget here because: 1) it |
16837 |
++ * gives the chance to boost the throughput if |
16838 |
++ * this is not a seeky process (which may have |
16839 |
++ * bumped into this timeout because of, e.g., |
16840 |
++ * ZBR), 2) together with charge_full_budget |
16841 |
++ * it helps give seeky processes higher |
16842 |
++ * timestamps, and hence be served less |
16843 |
++ * frequently. |
16844 |
++ */ |
16845 |
++ budget = min(budget * 2, bfqd->bfq_max_budget); |
16846 |
++ break; |
16847 |
++ case BFQ_BFQQ_BUDGET_EXHAUSTED: |
16848 |
++ /* |
16849 |
++ * The process still has backlog, and did not |
16850 |
++ * let either the budget timeout or the disk |
16851 |
++ * idling timeout expire. Hence it is not |
16852 |
++ * seeky, has a short thinktime and may be |
16853 |
++ * happy with a higher budget too. So |
16854 |
++ * definitely increase the budget of this good |
16855 |
++ * candidate to boost the disk throughput. |
16856 |
++ */ |
16857 |
++ budget = min(budget * 4, bfqd->bfq_max_budget); |
16858 |
++ break; |
16859 |
++ case BFQ_BFQQ_NO_MORE_REQUESTS: |
16860 |
++ /* |
16861 |
++ * Leave the budget unchanged. |
16862 |
++ */ |
16863 |
++ default: |
16864 |
++ return; |
16865 |
++ } |
16866 |
++ } else /* async queue */ |
16867 |
++ /* async queues always get the maximum possible budget |
16868 |
++ * (their ability to dispatch is limited by |
16869 |
++ * @bfqd->bfq_max_budget_async_rq). |
16870 |
++ */ |
16871 |
++ budget = bfqd->bfq_max_budget; |
16872 |
++ |
16873 |
++ bfqq->max_budget = budget; |
16874 |
++ |
16875 |
++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && |
16876 |
++ bfqq->max_budget > bfqd->bfq_max_budget) |
16877 |
++ bfqq->max_budget = bfqd->bfq_max_budget; |
16878 |
++ |
16879 |
++ /* |
16880 |
++ * Make sure that we have enough budget for the next request. |
16881 |
++ * Since the finish time of the bfqq must be kept in sync with |
16882 |
++ * the budget, be sure to call __bfq_bfqq_expire() after the |
16883 |
++ * update. |
16884 |
++ */ |
16885 |
++ next_rq = bfqq->next_rq; |
16886 |
++ if (next_rq != NULL) |
16887 |
++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, |
16888 |
++ bfq_serv_to_charge(next_rq, bfqq)); |
16889 |
++ else |
16890 |
++ bfqq->entity.budget = bfqq->max_budget; |
16891 |
++ |
16892 |
++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", |
16893 |
++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0, |
16894 |
++ bfqq->entity.budget); |
16895 |
++} |
16896 |
++ |
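As an aside, the budget feedback implemented above boils down to a few numeric rules for sync queues: shrink by four times the minimum budget (flooring at the minimum) on an idle timeout with no outstanding requests, double on a budget timeout or on an idle timeout with requests still in flight, quadruple on budget exhaustion, and leave the budget alone otherwise, always capped at the device-wide maximum. The standalone C sketch below restates just that arithmetic; it is illustrative only, not part of the patch, and the enum and function names are placeholders.

/*
 * Illustrative restatement of the sync-queue branch of
 * __bfq_bfqq_recalc_budget() above; not part of the patch.
 */
enum expiration { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static unsigned long recalc_budget_sync(unsigned long budget,
                                        unsigned long min_budget,
                                        unsigned long max_budget,
                                        enum expiration reason,
                                        int outstanding)
{
        switch (reason) {
        case TOO_IDLE:
                if (outstanding)                        /* requests in flight: grow */
                        budget = budget * 2 < max_budget ? budget * 2 : max_budget;
                else if (budget > 5 * min_budget)       /* shrink by 4 * min_budget */
                        budget -= 4 * min_budget;
                else
                        budget = min_budget;
                break;
        case BUDGET_TIMEOUT:                            /* double */
                budget = budget * 2 < max_budget ? budget * 2 : max_budget;
                break;
        case BUDGET_EXHAUSTED:                          /* quadruple */
                budget = budget * 4 < max_budget ? budget * 4 : max_budget;
                break;
        default:                                        /* leave unchanged */
                break;
        }
        return budget;
}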
16897 |
++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) |
16898 |
++{ |
16899 |
++ unsigned long max_budget; |
16900 |
++ |
16901 |
++ /* |
16902 |
++ * The max_budget calculated when autotuning is equal to the |
16903 |
++ * amount of sectors transferred in timeout_sync at the |
16904 |
++ * estimated peak rate. |
16905 |
++ */ |
16906 |
++ max_budget = (unsigned long)(peak_rate * 1000 * |
16907 |
++ timeout >> BFQ_RATE_SHIFT); |
16908 |
++ |
16909 |
++ return max_budget; |
16910 |
++} |
16911 |
++ |
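For reference, the autotuned maximum budget computed above is a single fixed-point product: the number of sectors the device would transfer in timeout_sync milliseconds at the estimated peak rate, which is stored as sectors per microsecond left-shifted by BFQ_RATE_SHIFT. A minimal standalone sketch, assuming a shift of 16 purely for illustration:

/* Not part of the patch: sectors servable in `timeout_ms` at `peak_rate`,
 * where peak_rate is in (sectors/usec) << RATE_SHIFT fixed point. */
#define RATE_SHIFT 16   /* assumed stand-in for BFQ_RATE_SHIFT */

static unsigned long calc_max_budget(unsigned long long peak_rate,
                                     unsigned long long timeout_ms)
{
        /* timeout_ms * 1000 converts the timeout to microseconds */
        return (unsigned long)((peak_rate * 1000 * timeout_ms) >> RATE_SHIFT);
}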
16912 |
++/* |
16913 |
++ * In addition to updating the peak rate, checks whether the process |
16914 |
++ * is "slow", and returns 1 if so. This slow flag is used, in addition |
16915 |
++ * to the budget timeout, to reduce the amount of service provided to |
16916 |
++ * seeky processes, and hence reduce their chances to lower the |
16917 |
++ * throughput. See the code for more details. |
16918 |
++ */ |
16919 |
++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
16920 |
++ int compensate, enum bfqq_expiration reason) |
16921 |
++{ |
16922 |
++ u64 bw, usecs, expected, timeout; |
16923 |
++ ktime_t delta; |
16924 |
++ int update = 0; |
16925 |
++ |
16926 |
++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) |
16927 |
++ return 0; |
16928 |
++ |
16929 |
++ if (compensate) |
16930 |
++ delta = bfqd->last_idling_start; |
16931 |
++ else |
16932 |
++ delta = ktime_get(); |
16933 |
++ delta = ktime_sub(delta, bfqd->last_budget_start); |
16934 |
++ usecs = ktime_to_us(delta); |
16935 |
++ |
16936 |
++ /* Don't trust short/unrealistic values. */ |
16937 |
++ if (usecs < 100 || usecs >= LONG_MAX) |
16938 |
++ return 0; |
16939 |
++ |
16940 |
++ /* |
16941 |
++ * Calculate the bandwidth for the last slice. We use a 64 bit |
16942 |
++ * value to store the peak rate, in sectors per usec in fixed |
16943 |
++ * point math. We do so to have enough precision in the estimate |
16944 |
++ * and to avoid overflows. |
16945 |
++ */ |
16946 |
++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; |
16947 |
++ do_div(bw, (unsigned long)usecs); |
16948 |
++ |
16949 |
++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
16950 |
++ |
16951 |
++ /* |
16952 |
++ * Use only long (> 20ms) intervals to filter out spikes for |
16953 |
++ * the peak rate estimation. |
16954 |
++ */ |
16955 |
++ if (usecs > 20000) { |
16956 |
++ if (bw > bfqd->peak_rate || |
16957 |
++ (!BFQQ_SEEKY(bfqq) && |
16958 |
++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { |
16959 |
++ bfq_log(bfqd, "measured bw =%llu", bw); |
16960 |
++ /* |
16961 |
++ * To smooth oscillations use a low-pass filter with |
16962 |
++ * alpha=7/8, i.e., |
16963 |
++ * new_rate = (7/8) * old_rate + (1/8) * bw |
16964 |
++ */ |
16965 |
++ do_div(bw, 8); |
16966 |
++ if (bw == 0) |
16967 |
++ return 0; |
16968 |
++ bfqd->peak_rate *= 7; |
16969 |
++ do_div(bfqd->peak_rate, 8); |
16970 |
++ bfqd->peak_rate += bw; |
16971 |
++ update = 1; |
16972 |
++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); |
16973 |
++ } |
16974 |
++ |
16975 |
++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; |
16976 |
++ |
16977 |
++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) |
16978 |
++ bfqd->peak_rate_samples++; |
16979 |
++ |
16980 |
++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && |
16981 |
++ update && bfqd->bfq_user_max_budget == 0) { |
16982 |
++ bfqd->bfq_max_budget = |
16983 |
++ bfq_calc_max_budget(bfqd->peak_rate, timeout); |
16984 |
++ bfq_log(bfqd, "new max_budget=%lu", |
16985 |
++ bfqd->bfq_max_budget); |
16986 |
++ } |
16987 |
++ } |
16988 |
++ |
16989 |
++ /* |
16990 |
++ * If the process has been served for a too short time |
16991 |
++ * interval to let its possible sequential accesses prevail on |
16992 |
++ * the initial seek time needed to move the disk head on the |
16993 |
++ * first sector it requested, then give the process a chance |
16994 |
++ * and for the moment return false. |
16995 |
++ */ |
16996 |
++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) |
16997 |
++ return 0; |
16998 |
++ |
16999 |
++ /* |
17000 |
++ * A process is considered ``slow'' (i.e., seeky, so that we |
17001 |
++ * cannot treat it fairly in the service domain, as it would |
17002 |
++ * slow down too much the other processes) if, when a slice |
17003 |
++ * ends for whatever reason, it has received service at a |
17004 |
++ * rate that would not be high enough to complete the budget |
17005 |
++ * before the budget timeout expiration. |
17006 |
++ */ |
17007 |
++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; |
17008 |
++ |
17009 |
++ /* |
17010 |
++ * Caveat: processes doing IO in the slower disk zones will |
17011 |
++ * tend to be slow(er) even if not seeky. And the estimated |
17012 |
++ * peak rate will actually be an average over the disk |
17013 |
++ * surface. Hence, to not be too harsh with unlucky processes, |
17014 |
++ * we keep a budget/3 margin of safety before declaring a |
17015 |
++ * process slow. |
17016 |
++ */ |
17017 |
++ return expected > (4 * bfqq->entity.budget) / 3; |
17018 |
++} |
17019 |
++ |
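The peak-rate update above is a low-pass filter with alpha = 7/8, done entirely in integer arithmetic so it stays cheap on the completion path. A minimal standalone sketch of one filter step (not part of the patch):

/* One step of the peak-rate filter: new = (7/8) * old + (1/8) * sample.
 * Mirrors the do_div() arithmetic above; a negligible sample leaves the
 * estimate untouched, as in the original code. */
static unsigned long long filter_peak_rate(unsigned long long old_rate,
                                           unsigned long long sample)
{
        sample /= 8;
        if (sample == 0)
                return old_rate;
        return (old_rate * 7) / 8 + sample;
}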
17020 |
++/* |
17021 |
++ * To be deemed as soft real-time, an application must meet two requirements. |
17022 |
++ * The first is that the application must not require an average bandwidth |
17023 |
++ * higher than the approximate bandwidth required to playback or record a |
17024 |
++ * compressed high-definition video. |
17025 |
++ * The next function is invoked on the completion of the last request of a |
17026 |
++ * batch, to compute the next-start time instant, soft_rt_next_start, such |
17027 |
++ * that, if the next request of the application does not arrive before |
17028 |
++ * soft_rt_next_start, then the above requirement on the bandwidth is met. |
17029 |
++ * |
17030 |
++ * The second requirement is that the request pattern of the application is |
17031 |
++ * isochronous, i.e., that, after issuing a request or a batch of requests, the |
17032 |
++ * application stops for a while, then issues a new batch, and so on. For this |
17033 |
++ * reason the next function is invoked to compute soft_rt_next_start only for |
17034 |
++ * applications that meet this requirement, whereas soft_rt_next_start is set |
17035 |
++ * to infinity for applications that do not. |
17036 |
++ * |
17037 |
++ * Unfortunately, even a greedy application may happen to behave in an |
17038 |
++ * isochronous way if several processes are competing for the CPUs. In fact, |
17039 |
++ * in this scenario the application stops issuing requests while the CPUs are |
17040 |
++ * busy serving other processes, then restarts, then stops again for a while, |
17041 |
++ * and so on. In addition, if the disk achieves a low enough throughput with |
17042 |
++ * the request pattern issued by the application (e.g., because the request |
17043 |
++ * pattern is random and/or the device is slow), then the above bandwidth |
17044 |
++ * requirement may happen to be met too. To prevent such a greedy application |
17045 |
++ * to be deemed as soft real-time, a further rule is used in the computation |
17046 |
++ * of soft_rt_next_start: soft_rt_next_start must be higher than the current |
17047 |
++ * time plus the maximum time for which the arrival of a request is waited |
17048 |
++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. This |
17049 |
++ * filters out greedy applications, as the latter issue instead their next |
17050 |
++ * request as soon as possible after the last one has been completed (in |
17051 |
++ * contrast, when a batch of requests is completed, a soft real-time |
17052 |
++ * application spends some time processing data). |
17053 |
++ * |
17054 |
++ * Actually, the last filter may easily generate false positives if: only |
17055 |
++ * bfqd->bfq_slice_idle is used as a reference time interval, and one or |
17056 |
++ * both the following two cases occur: |
17057 |
++ * 1) HZ is so low that the duration of a jiffie is comparable to or higher |
17058 |
++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with |
17059 |
++ * HZ=100. |
17060 |
++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing |
17061 |
++ * for a while, then suddenly 'jump' by several units to recover the lost |
17062 |
++ * increments. This seems to happen, e.g., inside virtual machines. |
17063 |
++ * To address this issue, we do not use as a reference time interval just |
17064 |
++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In |
17065 |
++ * particular we add the minimum number of jiffies for which the filter seems |
17066 |
++ * to be quite precise also in embedded systems and KVM/QEMU virtual machines. |
17067 |
++ */ |
17068 |
++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, |
17069 |
++ struct bfq_queue *bfqq) |
17070 |
++{ |
17071 |
++ return max(bfqq->last_idle_bklogged + |
17072 |
++ HZ * bfqq->service_from_backlogged / |
17073 |
++ bfqd->bfq_raising_max_softrt_rate, |
17074 |
++ jiffies + bfqq->bfqd->bfq_slice_idle + 4); |
17075 |
++} |
17076 |
++ |
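In plain arithmetic, the next-start instant above is the later of two bounds: the time at which the queue's backlogged service would fit within the allowed soft real-time bandwidth, and the current time plus the idling window padded by a few jiffies, as discussed above. A standalone sketch, assuming a nonzero rate and treating HZ and jiffies as plain integers (not part of the patch):

/* Illustrative form of bfq_bfqq_softrt_next_start() above.
 * Assumes max_softrt_rate > 0, as guaranteed by the callers. */
static unsigned long softrt_next_start(unsigned long last_idle_bklogged,
                                       unsigned long service_from_backlogged,
                                       unsigned long max_softrt_rate,
                                       unsigned long now_jiffies,
                                       unsigned long slice_idle,
                                       unsigned long hz)
{
        unsigned long bw_bound = last_idle_bklogged +
                hz * service_from_backlogged / max_softrt_rate;
        unsigned long isoc_bound = now_jiffies + slice_idle + 4;

        return bw_bound > isoc_bound ? bw_bound : isoc_bound;
}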
17077 |
++/* |
17078 |
++ * Largest-possible time instant such that, for as long as possible, the |
17079 |
++ * current time will be lower than this time instant according to the macro |
17080 |
++ * time_is_before_jiffies(). |
17081 |
++ */ |
17082 |
++static inline unsigned long bfq_infinity_from_now(unsigned long now) |
17083 |
++{ |
17084 |
++ return now + ULONG_MAX / 2; |
17085 |
++} |
17086 |
++ |
17087 |
++/** |
17088 |
++ * bfq_bfqq_expire - expire a queue. |
17089 |
++ * @bfqd: device owning the queue. |
17090 |
++ * @bfqq: the queue to expire. |
17091 |
++ * @compensate: if true, compensate for the time spent idling. |
17092 |
++ * @reason: the reason causing the expiration. |
17093 |
++ * |
17094 |
++ * |
17095 |
++ * If the process associated to the queue is slow (i.e., seeky), or in |
17096 |
++ * case of budget timeout, or, finally, if it is async, we |
17097 |
++ * artificially charge it an entire budget (independently of the |
17098 |
++ * actual service it received). As a consequence, the queue will get |
17099 |
++ * higher timestamps than the correct ones upon reactivation, and |
17100 |
++ * hence it will be rescheduled as if it had received more service |
17101 |
++ * than what it actually received. In the end, this class of processes |
17102 |
++ * will receive less service in proportion to how slowly they consume |
17103 |
++ * their budgets (and hence how seriously they tend to lower the |
17104 |
++ * throughput). |
17105 |
++ * |
17106 |
++ * In contrast, when a queue expires because it has been idling for |
17107 |
++ * too long or because it exhausted its budget, we do not touch the |
17108 |
++ * amount of service it has received. Hence when the queue will be |
17109 |
++ * reactivated and its timestamps updated, the latter will be in sync |
17110 |
++ * with the actual service received by the queue until expiration. |
17111 |
++ * |
17112 |
++ * Charging a full budget to the first type of queues and the exact |
17113 |
++ * service to the others has the effect of using the WF2Q+ policy to |
17114 |
++ * schedule the former on a timeslice basis, without violating the |
17115 |
++ * service domain guarantees of the latter. |
17116 |
++ */ |
17117 |
++static void bfq_bfqq_expire(struct bfq_data *bfqd, |
17118 |
++ struct bfq_queue *bfqq, |
17119 |
++ int compensate, |
17120 |
++ enum bfqq_expiration reason) |
17121 |
++{ |
17122 |
++ int slow; |
17123 |
++ BUG_ON(bfqq != bfqd->in_service_queue); |
17124 |
++ |
17125 |
++ /* Update disk peak rate for autotuning and check whether the |
17126 |
++ * process is slow (see bfq_update_peak_rate). |
17127 |
++ */ |
17128 |
++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); |
17129 |
++ |
17130 |
++ /* |
17131 |
++ * As above explained, 'punish' slow (i.e., seeky), timed-out |
17132 |
++ * and async queues, to favor sequential sync workloads. |
17133 |
++ * |
17134 |
++ * Processes doing IO in the slower disk zones will tend to be |
17135 |
++ * slow(er) even if not seeky. Hence, since the estimated peak |
17136 |
++ * rate is actually an average over the disk surface, these |
17137 |
++ * processes may timeout just for bad luck. To avoid punishing |
17138 |
++ * them we do not charge a full budget to a process that |
17139 |
++ * succeeded in consuming at least 2/3 of its budget. |
17140 |
++ */ |
17141 |
++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && |
17142 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) |
17143 |
++ bfq_bfqq_charge_full_budget(bfqq); |
17144 |
++ |
17145 |
++ bfqq->service_from_backlogged += bfqq->entity.service; |
17146 |
++ |
17147 |
++ if (bfqd->low_latency && bfqq->raising_coeff == 1) |
17148 |
++ bfqq->last_rais_start_finish = jiffies; |
17149 |
++ |
17150 |
++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0 && |
17151 |
++ RB_EMPTY_ROOT(&bfqq->sort_list)) { |
17152 |
++ /* |
17153 |
++ * If we get here, then the request pattern is |
17154 |
++ * isochronous (see the comments to the function |
17155 |
++ * bfq_bfqq_softrt_next_start()). However, if the |
17156 |
++ * queue still has in-flight requests, then it is |
17157 |
++ * better to postpone the computation of next_start |
17158 |
++ * to the next request completion. In fact, if we |
17159 |
++ * computed it now, then the application might pass |
17160 |
++ * the greedy-application filter improperly, because |
17161 |
++ * the arrival of its next request may happen to be |
17162 |
++ * higher than (jiffies + bfqq->bfqd->bfq_slice_idle) |
17163 |
++ * not because the application is truly soft real- |
17164 |
++ * time, but just because the application is currently |
17165 |
++ * waiting for the completion of some request before |
17166 |
++ * issuing, as quickly as possible, its next request. |
17167 |
++ */ |
17168 |
++ if (bfqq->dispatched > 0) { |
17169 |
++ /* |
17170 |
++ * The application is still waiting for the |
17171 |
++ * completion of one or more requests: |
17172 |
++ * prevent it from possibly being incorrectly |
17173 |
++ * deemed as soft real-time by setting its |
17174 |
++ * soft_rt_next_start to infinity. In fact, |
17175 |
++ * without this assignment, the application |
17176 |
++ * would be incorrectly deemed as soft |
17177 |
++ * real-time if: |
17178 |
++ * 1) it issued a new request before the |
17179 |
++ * completion of all its in-flight |
17180 |
++ * requests, and |
17181 |
++ * 2) at that time, its soft_rt_next_start |
17182 |
++ * happened to be in the past. |
17183 |
++ */ |
17184 |
++ bfqq->soft_rt_next_start = |
17185 |
++ bfq_infinity_from_now(jiffies); |
17186 |
++ bfq_mark_bfqq_softrt_update(bfqq); |
17187 |
++ } else |
17188 |
++ bfqq->soft_rt_next_start = |
17189 |
++ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
17190 |
++ } |
17191 |
++ |
17192 |
++ bfq_log_bfqq(bfqd, bfqq, |
17193 |
++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, |
17194 |
++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); |
17195 |
++ |
17196 |
++ /* Increase, decrease or leave budget unchanged according to reason */ |
17197 |
++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); |
17198 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
17199 |
++} |
17200 |
++ |
17201 |
++/* |
17202 |
++ * Budget timeout is not implemented through a dedicated timer, but |
17203 |
++ * just checked on request arrivals and completions, as well as on |
17204 |
++ * idle timer expirations. |
17205 |
++ */ |
17206 |
++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) |
17207 |
++{ |
17208 |
++ if (bfq_bfqq_budget_new(bfqq)) |
17209 |
++ return 0; |
17210 |
++ |
17211 |
++ if (time_before(jiffies, bfqq->budget_timeout)) |
17212 |
++ return 0; |
17213 |
++ |
17214 |
++ return 1; |
17215 |
++} |
17216 |
++ |
17217 |
++/* |
17218 |
++ * If we expire a queue that is waiting for the arrival of a new |
17219 |
++ * request, we may prevent the fictitious timestamp backshifting that |
17220 |
++ * allows the guarantees of the queue to be preserved (see [1] for |
17221 |
++ * this tricky aspect). Hence we return true only if this condition |
17222 |
++ * does not hold, or if the queue is slow enough to deserve only to be |
17223 |
++ * kicked off for preserving a high throughput. |
17224 |
++*/ |
17225 |
++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) |
17226 |
++{ |
17227 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
17228 |
++ "may_budget_timeout: wr %d left %d timeout %d", |
17229 |
++ bfq_bfqq_wait_request(bfqq), |
17230 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, |
17231 |
++ bfq_bfqq_budget_timeout(bfqq)); |
17232 |
++ |
17233 |
++ return (!bfq_bfqq_wait_request(bfqq) || |
17234 |
++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) |
17235 |
++ && |
17236 |
++ bfq_bfqq_budget_timeout(bfqq); |
17237 |
++} |
17238 |
++ |
17239 |
++/* |
17240 |
++ * For weight-raised queues issuing sync requests, idling is always performed, |
17241 |
++ * as this is instrumental in guaranteeing a high fraction of the throughput |
17242 |
++ * to these queues, and hence in guaranteeing a lower latency for their |
17243 |
++ * requests. See [1] for details. |
17244 |
++ * |
17245 |
++ * For non-weight-raised queues, idling is instead disabled if the device is |
17246 |
++ * NCQ-enabled and non-rotational, as this boosts the throughput on such |
17247 |
++ * devices. |
17248 |
++ */ |
17249 |
++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) |
17250 |
++{ |
17251 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
17252 |
++ |
17253 |
++ return bfq_bfqq_sync(bfqq) && ( |
17254 |
++ bfqq->raising_coeff > 1 || |
17255 |
++ (bfq_bfqq_idle_window(bfqq) && |
17256 |
++ !(bfqd->hw_tag && |
17257 |
++ (blk_queue_nonrot(bfqd->queue) || |
17258 |
++ /* |
17259 |
++ * If there are weight-raised busy queues, then do not idle |
17260 |
++ * the disk for a sync non-weight-raised queue, and hence |
17261 |
++ * expire the queue immediately if empty. Combined with the |
17262 |
++ * timestamping rules of BFQ (see [1] for details), this |
17263 |
++ * causes sync non-weight-raised queues to get a lower |
17264 |
++ * fraction of the disk throughput, and hence reduces the rate |
17265 |
++ * at which the processes associated to these queues ask for |
17266 |
++ * requests from the request pool. |
17267 |
++ * |
17268 |
++ * This is beneficial for weight-raised processes, when the |
17269 |
++ * system operates in request-pool saturation conditions |
17270 |
++ * (e.g., in the presence of write hogs). In fact, if |
17271 |
++ * non-weight-raised processes ask for requests at a lower |
17272 |
++ * rate, then weight-raised processes have a higher |
17273 |
++ * probability to get a request from the pool immediately |
17274 |
++ * (or at least soon) when they need one. Hence they have a |
17275 |
++ * higher probability to actually get a fraction of the disk |
17276 |
++ * throughput proportional to their high weight. This is |
17277 |
++ * especially true with NCQ-enabled drives, which enqueue |
17278 |
++ * several requests in advance and further reorder |
17279 |
++ * internally-queued requests. |
17280 |
++ * |
17281 |
++ * Mistreating non-weight-raised queues in the above-described |
17282 |
++ * way, when there are busy weight-raised queues, seems to |
17283 |
++ * mitigate starvation problems in the presence of heavy write |
17284 |
++ * workloads and NCQ, and hence to guarantee a higher |
17285 |
++ * application and system responsiveness in these hostile |
17286 |
++ * scenarios. |
17287 |
++ */ |
17288 |
++ bfqd->raised_busy_queues > 0) |
17289 |
++ ) |
17290 |
++ ) |
17291 |
++ ); |
17292 |
++} |
17293 |
++ |
17294 |
++/* |
17295 |
++ * If the in-service queue is empty, but it is sync and either of the following |
17296 |
++ * conditions holds, then: 1) the queue must remain in service and cannot be |
17297 |
++ * expired, and 2) the disk must be idled to wait for the possible arrival |
17298 |
++ * of a new request for the queue. The conditions are: |
17299 |
++ * - the device is rotational and not performing NCQ, and the queue has its |
17300 |
++ * idle window set (in this case, waiting for a new request for the queue |
17301 |
++ * is likely to boost the disk throughput); |
17302 |
++ * - the queue is weight-raised (waiting for the request is necessary to |
17303 |
++ * provide the queue with fairness and latency guarantees, see [1] for |
17304 |
++ * details). |
17305 |
++ */ |
17306 |
++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) |
17307 |
++{ |
17308 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
17309 |
++ |
17310 |
++ return (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && |
17311 |
++ bfq_bfqq_must_not_expire(bfqq) && |
17312 |
++ !bfq_queue_nonrot_noidle(bfqd, bfqq)); |
17313 |
++} |
17314 |
++ |
17315 |
++/* |
17316 |
++ * Select a queue for service. If we have a current queue in service, |
17317 |
++ * check whether to continue servicing it, or retrieve and set a new one. |
17318 |
++ */ |
17319 |
++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
17320 |
++{ |
17321 |
++ struct bfq_queue *bfqq, *new_bfqq = NULL; |
17322 |
++ struct request *next_rq; |
17323 |
++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
17324 |
++ |
17325 |
++ bfqq = bfqd->in_service_queue; |
17326 |
++ if (bfqq == NULL) |
17327 |
++ goto new_queue; |
17328 |
++ |
17329 |
++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); |
17330 |
++ |
17331 |
++ /* |
17332 |
++ * If another queue has a request waiting within our mean seek |
17333 |
++ * distance, let it run. The expire code will check for close |
17334 |
++ * cooperators and put the close queue at the front of the |
17335 |
++ * service tree. If possible, merge the expiring queue with the |
17336 |
++ * new bfqq. |
17337 |
++ */ |
17338 |
++ new_bfqq = bfq_close_cooperator(bfqd, bfqq); |
17339 |
++ if (new_bfqq != NULL && bfqq->new_bfqq == NULL) |
17340 |
++ bfq_setup_merge(bfqq, new_bfqq); |
17341 |
++ |
17342 |
++ if (bfq_may_expire_for_budg_timeout(bfqq) && |
17343 |
++ !timer_pending(&bfqd->idle_slice_timer) && |
17344 |
++ !bfq_bfqq_must_idle(bfqq)) |
17345 |
++ goto expire; |
17346 |
++ |
17347 |
++ next_rq = bfqq->next_rq; |
17348 |
++ /* |
17349 |
++ * If bfqq has requests queued and it has enough budget left to |
17350 |
++ * serve them, keep the queue, otherwise expire it. |
17351 |
++ */ |
17352 |
++ if (next_rq != NULL) { |
17353 |
++ if (bfq_serv_to_charge(next_rq, bfqq) > |
17354 |
++ bfq_bfqq_budget_left(bfqq)) { |
17355 |
++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; |
17356 |
++ goto expire; |
17357 |
++ } else { |
17358 |
++ /* |
17359 |
++ * The idle timer may be pending because we may not |
17360 |
++ * disable disk idling even when a new request arrives |
17361 |
++ */ |
17362 |
++ if (timer_pending(&bfqd->idle_slice_timer)) { |
17363 |
++ /* |
17364 |
++ * If we get here: 1) at least a new request |
17365 |
++ * has arrived but we have not disabled the |
17366 |
++ * timer because the request was too small, |
17367 |
++ * 2) then the block layer has unplugged the |
17368 |
++ * device, causing the dispatch to be invoked. |
17369 |
++ * |
17370 |
++ * Since the device is unplugged, now the |
17371 |
++ * requests are probably large enough to |
17372 |
++ * provide a reasonable throughput. |
17373 |
++ * So we disable idling. |
17374 |
++ */ |
17375 |
++ bfq_clear_bfqq_wait_request(bfqq); |
17376 |
++ del_timer(&bfqd->idle_slice_timer); |
17377 |
++ } |
17378 |
++ if (new_bfqq == NULL) |
17379 |
++ goto keep_queue; |
17380 |
++ else |
17381 |
++ goto expire; |
17382 |
++ } |
17383 |
++ } |
17384 |
++ |
17385 |
++ /* |
17386 |
++ * No requests pending. If the in-service queue has no cooperator and |
17387 |
++ * still has requests in flight (possibly waiting for a completion) |
17388 |
++ * or is idling for a new request, then keep it. |
17389 |
++ */ |
17390 |
++ if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || |
17391 |
++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) { |
17392 |
++ bfqq = NULL; |
17393 |
++ goto keep_queue; |
17394 |
++ } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { |
17395 |
++ /* |
17396 |
++ * Expiring the queue because there is a close cooperator, |
17397 |
++ * cancel timer. |
17398 |
++ */ |
17399 |
++ bfq_clear_bfqq_wait_request(bfqq); |
17400 |
++ del_timer(&bfqd->idle_slice_timer); |
17401 |
++ } |
17402 |
++ |
17403 |
++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; |
17404 |
++expire: |
17405 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, reason); |
17406 |
++new_queue: |
17407 |
++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq); |
17408 |
++ bfq_log(bfqd, "select_queue: new queue %d returned", |
17409 |
++ bfqq != NULL ? bfqq->pid : 0); |
17410 |
++keep_queue: |
17411 |
++ return bfqq; |
17412 |
++} |
17413 |
++ |
17414 |
++static void bfq_update_raising_data(struct bfq_data *bfqd, |
17415 |
++ struct bfq_queue *bfqq) |
17416 |
++{ |
17417 |
++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ |
17418 |
++ struct bfq_entity *entity = &bfqq->entity; |
17419 |
++ |
17420 |
++ bfq_log_bfqq(bfqd, bfqq, |
17421 |
++ "raising period dur %u/%u msec, " |
17422 |
++ "old raising coeff %u, w %d(%d)", |
17423 |
++ jiffies_to_msecs(jiffies - |
17424 |
++ bfqq->last_rais_start_finish), |
17425 |
++ jiffies_to_msecs(bfqq->raising_cur_max_time), |
17426 |
++ bfqq->raising_coeff, |
17427 |
++ bfqq->entity.weight, bfqq->entity.orig_weight); |
17428 |
++ |
17429 |
++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != |
17430 |
++ entity->orig_weight * bfqq->raising_coeff); |
17431 |
++ if (entity->ioprio_changed) |
17432 |
++ bfq_log_bfqq(bfqd, bfqq, |
17433 |
++ "WARN: pending prio change"); |
17434 |
++ /* |
17435 |
++ * If too much time has elapsed from the beginning |
17436 |
++ * of this weight-raising, stop it. |
17437 |
++ */ |
17438 |
++ if (time_is_before_jiffies(bfqq->last_rais_start_finish + |
17439 |
++ bfqq->raising_cur_max_time)) { |
17440 |
++ bfqq->last_rais_start_finish = jiffies; |
17441 |
++ bfq_log_bfqq(bfqd, bfqq, |
17442 |
++ "wrais ending at %lu, " |
17443 |
++ "rais_max_time %u", |
17444 |
++ bfqq->last_rais_start_finish, |
17445 |
++ jiffies_to_msecs(bfqq-> |
17446 |
++ raising_cur_max_time)); |
17447 |
++ bfq_bfqq_end_raising(bfqq); |
17448 |
++ __bfq_entity_update_weight_prio( |
17449 |
++ bfq_entity_service_tree(entity), |
17450 |
++ entity); |
17451 |
++ } |
17452 |
++ } |
17453 |
++} |
17454 |
++ |
17455 |
++/* |
17456 |
++ * Dispatch one request from bfqq, moving it to the request queue |
17457 |
++ * dispatch list. |
17458 |
++ */ |
17459 |
++static int bfq_dispatch_request(struct bfq_data *bfqd, |
17460 |
++ struct bfq_queue *bfqq) |
17461 |
++{ |
17462 |
++ int dispatched = 0; |
17463 |
++ struct request *rq; |
17464 |
++ unsigned long service_to_charge; |
17465 |
++ |
17466 |
++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); |
17467 |
++ |
17468 |
++ /* Follow expired path, else get first next available. */ |
17469 |
++ rq = bfq_check_fifo(bfqq); |
17470 |
++ if (rq == NULL) |
17471 |
++ rq = bfqq->next_rq; |
17472 |
++ service_to_charge = bfq_serv_to_charge(rq, bfqq); |
17473 |
++ |
17474 |
++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { |
17475 |
++ /* |
17476 |
++ * This may happen if the next rq is chosen |
17477 |
++ * in fifo order instead of sector order. |
17478 |
++ * The budget is properly dimensioned |
17479 |
++ * to be always sufficient to serve the next request |
17480 |
++ * only if it is chosen in sector order. The reason is |
17481 |
++ * that it would be quite inefficient and little useful |
17482 |
++ * to always make sure that the budget is large enough |
17483 |
++ * to serve even the possible next rq in fifo order. |
17484 |
++ * In fact, requests are seldom served in fifo order. |
17485 |
++ * |
17486 |
++ * Expire the queue for budget exhaustion, and |
17487 |
++ * make sure that the next act_budget is enough |
17488 |
++ * to serve the next request, even if it comes |
17489 |
++ * from the fifo expired path. |
17490 |
++ */ |
17491 |
++ bfqq->next_rq = rq; |
17492 |
++ /* |
17493 |
++ * Since this dispatch is failed, make sure that |
17494 |
++ * a new one will be performed |
17495 |
++ */ |
17496 |
++ if (!bfqd->rq_in_driver) |
17497 |
++ bfq_schedule_dispatch(bfqd); |
17498 |
++ goto expire; |
17499 |
++ } |
17500 |
++ |
17501 |
++ /* Finally, insert request into driver dispatch list. */ |
17502 |
++ bfq_bfqq_served(bfqq, service_to_charge); |
17503 |
++ bfq_dispatch_insert(bfqd->queue, rq); |
17504 |
++ |
17505 |
++ bfq_update_raising_data(bfqd, bfqq); |
17506 |
++ |
17507 |
++ bfq_log_bfqq(bfqd, bfqq, |
17508 |
++ "dispatched %u sec req (%llu), budg left %lu", |
17509 |
++ blk_rq_sectors(rq), |
17510 |
++ (long long unsigned)blk_rq_pos(rq), |
17511 |
++ bfq_bfqq_budget_left(bfqq)); |
17512 |
++ |
17513 |
++ dispatched++; |
17514 |
++ |
17515 |
++ if (bfqd->in_service_bic == NULL) { |
17516 |
++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); |
17517 |
++ bfqd->in_service_bic = RQ_BIC(rq); |
17518 |
++ } |
17519 |
++ |
17520 |
++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && |
17521 |
++ dispatched >= bfqd->bfq_max_budget_async_rq) || |
17522 |
++ bfq_class_idle(bfqq))) |
17523 |
++ goto expire; |
17524 |
++ |
17525 |
++ return dispatched; |
17526 |
++ |
17527 |
++expire: |
17528 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); |
17529 |
++ return dispatched; |
17530 |
++} |
17531 |
++ |
17532 |
++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) |
17533 |
++{ |
17534 |
++ int dispatched = 0; |
17535 |
++ |
17536 |
++ while (bfqq->next_rq != NULL) { |
17537 |
++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); |
17538 |
++ dispatched++; |
17539 |
++ } |
17540 |
++ |
17541 |
++ BUG_ON(!list_empty(&bfqq->fifo)); |
17542 |
++ return dispatched; |
17543 |
++} |
17544 |
++ |
17545 |
++/* |
17546 |
++ * Drain our current requests. Used for barriers and when switching |
17547 |
++ * io schedulers on-the-fly. |
17548 |
++ */ |
17549 |
++static int bfq_forced_dispatch(struct bfq_data *bfqd) |
17550 |
++{ |
17551 |
++ struct bfq_queue *bfqq, *n; |
17552 |
++ struct bfq_service_tree *st; |
17553 |
++ int dispatched = 0; |
17554 |
++ |
17555 |
++ bfqq = bfqd->in_service_queue; |
17556 |
++ if (bfqq != NULL) |
17557 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
17558 |
++ |
17559 |
++ /* |
17560 |
++ * Loop through classes, and be careful to leave the scheduler |
17561 |
++ * in a consistent state, as feedback mechanisms and vtime |
17562 |
++ * updates cannot be disabled during the process. |
17563 |
++ */ |
17564 |
++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { |
17565 |
++ st = bfq_entity_service_tree(&bfqq->entity); |
17566 |
++ |
17567 |
++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); |
17568 |
++ bfqq->max_budget = bfq_max_budget(bfqd); |
17569 |
++ |
17570 |
++ bfq_forget_idle(st); |
17571 |
++ } |
17572 |
++ |
17573 |
++ BUG_ON(bfqd->busy_queues != 0); |
17574 |
++ |
17575 |
++ return dispatched; |
17576 |
++} |
17577 |
++ |
17578 |
++static int bfq_dispatch_requests(struct request_queue *q, int force) |
17579 |
++{ |
17580 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
17581 |
++ struct bfq_queue *bfqq; |
17582 |
++ int max_dispatch; |
17583 |
++ |
17584 |
++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); |
17585 |
++ if (bfqd->busy_queues == 0) |
17586 |
++ return 0; |
17587 |
++ |
17588 |
++ if (unlikely(force)) |
17589 |
++ return bfq_forced_dispatch(bfqd); |
17590 |
++ |
17591 |
++ bfqq = bfq_select_queue(bfqd); |
17592 |
++ if (bfqq == NULL) |
17593 |
++ return 0; |
17594 |
++ |
17595 |
++ max_dispatch = bfqd->bfq_quantum; |
17596 |
++ if (bfq_class_idle(bfqq)) |
17597 |
++ max_dispatch = 1; |
17598 |
++ |
17599 |
++ if (!bfq_bfqq_sync(bfqq)) |
17600 |
++ max_dispatch = bfqd->bfq_max_budget_async_rq; |
17601 |
++ |
17602 |
++ if (bfqq->dispatched >= max_dispatch) { |
17603 |
++ if (bfqd->busy_queues > 1) |
17604 |
++ return 0; |
17605 |
++ if (bfqq->dispatched >= 4 * max_dispatch) |
17606 |
++ return 0; |
17607 |
++ } |
17608 |
++ |
17609 |
++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) |
17610 |
++ return 0; |
17611 |
++ |
17612 |
++ bfq_clear_bfqq_wait_request(bfqq); |
17613 |
++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
17614 |
++ |
17615 |
++ if (!bfq_dispatch_request(bfqd, bfqq)) |
17616 |
++ return 0; |
17617 |
++ |
17618 |
++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)", |
17619 |
++ bfqq->pid, max_dispatch); |
17620 |
++ |
17621 |
++ return 1; |
17622 |
++} |
17623 |
++ |
17624 |
++/* |
17625 |
++ * Task holds one reference to the queue, dropped when task exits. Each rq |
17626 |
++ * in-flight on this queue also holds a reference, dropped when rq is freed. |
17627 |
++ * |
17628 |
++ * Queue lock must be held here. |
17629 |
++ */ |
17630 |
++static void bfq_put_queue(struct bfq_queue *bfqq) |
17631 |
++{ |
17632 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
17633 |
++ |
17634 |
++ BUG_ON(atomic_read(&bfqq->ref) <= 0); |
17635 |
++ |
17636 |
++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, |
17637 |
++ atomic_read(&bfqq->ref)); |
17638 |
++ if (!atomic_dec_and_test(&bfqq->ref)) |
17639 |
++ return; |
17640 |
++ |
17641 |
++ BUG_ON(rb_first(&bfqq->sort_list) != NULL); |
17642 |
++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); |
17643 |
++ BUG_ON(bfqq->entity.tree != NULL); |
17644 |
++ BUG_ON(bfq_bfqq_busy(bfqq)); |
17645 |
++ BUG_ON(bfqd->in_service_queue == bfqq); |
17646 |
++ |
17647 |
++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); |
17648 |
++ |
17649 |
++ kmem_cache_free(bfq_pool, bfqq); |
17650 |
++} |
17651 |
++ |
17652 |
++static void bfq_put_cooperator(struct bfq_queue *bfqq) |
17653 |
++{ |
17654 |
++ struct bfq_queue *__bfqq, *next; |
17655 |
++ |
17656 |
++ /* |
17657 |
++ * If this queue was scheduled to merge with another queue, be |
17658 |
++ * sure to drop the reference taken on that queue (and others in |
17659 |
++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. |
17660 |
++ */ |
17661 |
++ __bfqq = bfqq->new_bfqq; |
17662 |
++ while (__bfqq) { |
17663 |
++ if (__bfqq == bfqq) { |
17664 |
++ WARN(1, "bfqq->new_bfqq loop detected.\n"); |
17665 |
++ break; |
17666 |
++ } |
17667 |
++ next = __bfqq->new_bfqq; |
17668 |
++ bfq_put_queue(__bfqq); |
17669 |
++ __bfqq = next; |
17670 |
++ } |
17671 |
++} |
17672 |
++ |
17673 |
++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
17674 |
++{ |
17675 |
++ if (bfqq == bfqd->in_service_queue) { |
17676 |
++ __bfq_bfqq_expire(bfqd, bfqq); |
17677 |
++ bfq_schedule_dispatch(bfqd); |
17678 |
++ } |
17679 |
++ |
17680 |
++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, |
17681 |
++ atomic_read(&bfqq->ref)); |
17682 |
++ |
17683 |
++ bfq_put_cooperator(bfqq); |
17684 |
++ |
17685 |
++ bfq_put_queue(bfqq); |
17686 |
++} |
17687 |
++ |
17688 |
++static void bfq_init_icq(struct io_cq *icq) |
17689 |
++{ |
17690 |
++ struct bfq_io_cq *bic = icq_to_bic(icq); |
17691 |
++ |
17692 |
++ bic->ttime.last_end_request = jiffies; |
17693 |
++} |
17694 |
++ |
17695 |
++static void bfq_exit_icq(struct io_cq *icq) |
17696 |
++{ |
17697 |
++ struct bfq_io_cq *bic = icq_to_bic(icq); |
17698 |
++ struct bfq_data *bfqd = bic_to_bfqd(bic); |
17699 |
++ |
17700 |
++ if (bic->bfqq[BLK_RW_ASYNC]) { |
17701 |
++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); |
17702 |
++ bic->bfqq[BLK_RW_ASYNC] = NULL; |
17703 |
++ } |
17704 |
++ |
17705 |
++ if (bic->bfqq[BLK_RW_SYNC]) { |
17706 |
++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); |
17707 |
++ bic->bfqq[BLK_RW_SYNC] = NULL; |
17708 |
++ } |
17709 |
++} |
17710 |
++ |
17711 |
++/* |
17712 |
++ * Update the entity prio values; note that the new values will not |
17713 |
++ * be used until the next (re)activation. |
17714 |
++ */ |
17715 |
++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
17716 |
++{ |
17717 |
++ struct task_struct *tsk = current; |
17718 |
++ int ioprio_class; |
17719 |
++ |
17720 |
++ if (!bfq_bfqq_prio_changed(bfqq)) |
17721 |
++ return; |
17722 |
++ |
17723 |
++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
17724 |
++ switch (ioprio_class) { |
17725 |
++ default: |
17726 |
++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev, |
17727 |
++ "bfq: bad prio %x\n", ioprio_class); |
17728 |
++ case IOPRIO_CLASS_NONE: |
17729 |
++ /* |
17730 |
++ * No prio set, inherit CPU scheduling settings. |
17731 |
++ */ |
17732 |
++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk); |
17733 |
++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); |
17734 |
++ break; |
17735 |
++ case IOPRIO_CLASS_RT: |
17736 |
++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
17737 |
++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; |
17738 |
++ break; |
17739 |
++ case IOPRIO_CLASS_BE: |
17740 |
++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
17741 |
++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; |
17742 |
++ break; |
17743 |
++ case IOPRIO_CLASS_IDLE: |
17744 |
++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; |
17745 |
++ bfqq->entity.new_ioprio = 7; |
17746 |
++ bfq_clear_bfqq_idle_window(bfqq); |
17747 |
++ break; |
17748 |
++ } |
17749 |
++ |
17750 |
++ bfqq->entity.ioprio_changed = 1; |
17751 |
++ |
17752 |
++ /* |
17753 |
++ * Keep track of original prio settings in case we have to temporarily |
17754 |
++ * elevate the priority of this queue. |
17755 |
++ */ |
17756 |
++ bfqq->org_ioprio = bfqq->entity.new_ioprio; |
17757 |
++ bfq_clear_bfqq_prio_changed(bfqq); |
17758 |
++} |
17759 |
++ |
17760 |
++static void bfq_changed_ioprio(struct bfq_io_cq *bic) |
17761 |
++{ |
17762 |
++ struct bfq_data *bfqd; |
17763 |
++ struct bfq_queue *bfqq, *new_bfqq; |
17764 |
++ struct bfq_group *bfqg; |
17765 |
++ unsigned long uninitialized_var(flags); |
17766 |
++ int ioprio = bic->icq.ioc->ioprio; |
17767 |
++ |
17768 |
++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), |
17769 |
++ &flags); |
17770 |
++ /* |
17771 |
++ * This condition may trigger on a newly created bic, be sure to drop |
17772 |
++ * the lock before returning. |
17773 |
++ */ |
17774 |
++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) |
17775 |
++ goto out; |
17776 |
++ |
17777 |
++ bfqq = bic->bfqq[BLK_RW_ASYNC]; |
17778 |
++ if (bfqq != NULL) { |
17779 |
++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, |
17780 |
++ sched_data); |
17781 |
++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, |
17782 |
++ GFP_ATOMIC); |
17783 |
++ if (new_bfqq != NULL) { |
17784 |
++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; |
17785 |
++ bfq_log_bfqq(bfqd, bfqq, |
17786 |
++ "changed_ioprio: bfqq %p %d", |
17787 |
++ bfqq, atomic_read(&bfqq->ref)); |
17788 |
++ bfq_put_queue(bfqq); |
17789 |
++ } |
17790 |
++ } |
17791 |
++ |
17792 |
++ bfqq = bic->bfqq[BLK_RW_SYNC]; |
17793 |
++ if (bfqq != NULL) |
17794 |
++ bfq_mark_bfqq_prio_changed(bfqq); |
17795 |
++ |
17796 |
++ bic->ioprio = ioprio; |
17797 |
++ |
17798 |
++out: |
17799 |
++ bfq_put_bfqd_unlock(bfqd, &flags); |
17800 |
++} |
17801 |
++ |
17802 |
++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
17803 |
++ pid_t pid, int is_sync) |
17804 |
++{ |
17805 |
++ RB_CLEAR_NODE(&bfqq->entity.rb_node); |
17806 |
++ INIT_LIST_HEAD(&bfqq->fifo); |
17807 |
++ |
17808 |
++ atomic_set(&bfqq->ref, 0); |
17809 |
++ bfqq->bfqd = bfqd; |
17810 |
++ |
17811 |
++ bfq_mark_bfqq_prio_changed(bfqq); |
17812 |
++ |
17813 |
++ if (is_sync) { |
17814 |
++ if (!bfq_class_idle(bfqq)) |
17815 |
++ bfq_mark_bfqq_idle_window(bfqq); |
17816 |
++ bfq_mark_bfqq_sync(bfqq); |
17817 |
++ } |
17818 |
++ |
17819 |
++ /* Tentative initial value to trade off between thr and lat */ |
17820 |
++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; |
17821 |
++ bfqq->pid = pid; |
17822 |
++ |
17823 |
++ bfqq->raising_coeff = 1; |
17824 |
++ bfqq->last_rais_start_finish = 0; |
17825 |
++ /* |
17826 |
++ * Set to the value for which bfqq will not be deemed as |
17827 |
++ * soft rt when it becomes backlogged. |
17828 |
++ */ |
17829 |
++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); |
17830 |
++} |
17831 |
++ |
17832 |
++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, |
17833 |
++ struct bfq_group *bfqg, |
17834 |
++ int is_sync, |
17835 |
++ struct bfq_io_cq *bic, |
17836 |
++ gfp_t gfp_mask) |
17837 |
++{ |
17838 |
++ struct bfq_queue *bfqq, *new_bfqq = NULL; |
17839 |
++ |
17840 |
++retry: |
17841 |
++ /* bic always exists here */ |
17842 |
++ bfqq = bic_to_bfqq(bic, is_sync); |
17843 |
++ |
17844 |
++ /* |
17845 |
++ * Always try a new alloc if we fall back to the OOM bfqq |
17846 |
++ * originally, since it should just be a temporary situation. |
17847 |
++ */ |
17848 |
++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { |
17849 |
++ bfqq = NULL; |
17850 |
++ if (new_bfqq != NULL) { |
17851 |
++ bfqq = new_bfqq; |
17852 |
++ new_bfqq = NULL; |
17853 |
++ } else if (gfp_mask & __GFP_WAIT) { |
17854 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
17855 |
++ new_bfqq = kmem_cache_alloc_node(bfq_pool, |
17856 |
++ gfp_mask | __GFP_ZERO, |
17857 |
++ bfqd->queue->node); |
17858 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
17859 |
++ if (new_bfqq != NULL) |
17860 |
++ goto retry; |
17861 |
++ } else { |
17862 |
++ bfqq = kmem_cache_alloc_node(bfq_pool, |
17863 |
++ gfp_mask | __GFP_ZERO, |
17864 |
++ bfqd->queue->node); |
17865 |
++ } |
17866 |
++ |
17867 |
++ if (bfqq != NULL) { |
17868 |
++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); |
17869 |
++ bfq_log_bfqq(bfqd, bfqq, "allocated"); |
17870 |
++ } else { |
17871 |
++ bfqq = &bfqd->oom_bfqq; |
17872 |
++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); |
17873 |
++ } |
17874 |
++ |
17875 |
++ bfq_init_prio_data(bfqq, bic); |
17876 |
++ bfq_init_entity(&bfqq->entity, bfqg); |
17877 |
++ } |
17878 |
++ |
17879 |
++ if (new_bfqq != NULL) |
17880 |
++ kmem_cache_free(bfq_pool, new_bfqq); |
17881 |
++ |
17882 |
++ return bfqq; |
17883 |
++} |
17884 |
++ |
17885 |
++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, |
17886 |
++ struct bfq_group *bfqg, |
17887 |
++ int ioprio_class, int ioprio) |
17888 |
++{ |
17889 |
++ switch (ioprio_class) { |
17890 |
++ case IOPRIO_CLASS_RT: |
17891 |
++ return &bfqg->async_bfqq[0][ioprio]; |
17892 |
++ case IOPRIO_CLASS_NONE: |
17893 |
++ ioprio = IOPRIO_NORM; |
17894 |
++ /* fall through */ |
17895 |
++ case IOPRIO_CLASS_BE: |
17896 |
++ return &bfqg->async_bfqq[1][ioprio]; |
17897 |
++ case IOPRIO_CLASS_IDLE: |
17898 |
++ return &bfqg->async_idle_bfqq; |
17899 |
++ default: |
17900 |
++ BUG(); |
17901 |
++ } |
17902 |
++} |
17903 |
++ |
17904 |
++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
17905 |
++ struct bfq_group *bfqg, int is_sync, |
17906 |
++ struct bfq_io_cq *bic, gfp_t gfp_mask) |
17907 |
++{ |
17908 |
++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
17909 |
++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
17910 |
++ struct bfq_queue **async_bfqq = NULL; |
17911 |
++ struct bfq_queue *bfqq = NULL; |
17912 |
++ |
17913 |
++ if (!is_sync) { |
17914 |
++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, |
17915 |
++ ioprio); |
17916 |
++ bfqq = *async_bfqq; |
17917 |
++ } |
17918 |
++ |
17919 |
++ if (bfqq == NULL) |
17920 |
++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); |
17921 |
++ |
17922 |
++ /* |
17923 |
++ * Pin the queue now that it's allocated, scheduler exit will prune it. |
17924 |
++ */ |
17925 |
++ if (!is_sync && *async_bfqq == NULL) { |
17926 |
++ atomic_inc(&bfqq->ref); |
17927 |
++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", |
17928 |
++ bfqq, atomic_read(&bfqq->ref)); |
17929 |
++ *async_bfqq = bfqq; |
17930 |
++ } |
17931 |
++ |
17932 |
++ atomic_inc(&bfqq->ref); |
17933 |
++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, |
17934 |
++ atomic_read(&bfqq->ref)); |
17935 |
++ return bfqq; |
17936 |
++} |
17937 |
++ |
17938 |
++static void bfq_update_io_thinktime(struct bfq_data *bfqd, |
17939 |
++ struct bfq_io_cq *bic) |
17940 |
++{ |
17941 |
++ unsigned long elapsed = jiffies - bic->ttime.last_end_request; |
17942 |
++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); |
17943 |
++ |
17944 |
++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; |
17945 |
++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; |
17946 |
++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / |
17947 |
++ bic->ttime.ttime_samples; |
17948 |
++} |
17949 |
++ |
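The think-time tracking above is an exponentially weighted moving average kept in 1/256 fixed point, so the mean converges quickly while remaining cheap to update. A standalone sketch of folding in one sample (not part of the patch; the struct is a stand-in for the bic->ttime fields):

/* EWMA of per-process think time, weights 7/8 old and 1/8 new,
 * mirroring bfq_update_io_thinktime() above. */
struct ttime_sketch {
        unsigned long samples;  /* scaled by 256 */
        unsigned long total;
        unsigned long mean;
};

static void update_thinktime(struct ttime_sketch *tt, unsigned long ttime)
{
        tt->samples = (7 * tt->samples + 256) / 8;
        tt->total   = (7 * tt->total + 256 * ttime) / 8;
        tt->mean    = (tt->total + 128) / tt->samples;  /* rounded */
}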
17950 |
++static void bfq_update_io_seektime(struct bfq_data *bfqd, |
17951 |
++ struct bfq_queue *bfqq, |
17952 |
++ struct request *rq) |
17953 |
++{ |
17954 |
++ sector_t sdist; |
17955 |
++ u64 total; |
17956 |
++ |
17957 |
++ if (bfqq->last_request_pos < blk_rq_pos(rq)) |
17958 |
++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; |
17959 |
++ else |
17960 |
++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); |
17961 |
++ |
17962 |
++ /* |
17963 |
++ * Don't allow the seek distance to get too large from the |
17964 |
++ * odd fragment, pagein, etc. |
17965 |
++ */ |
17966 |
++ if (bfqq->seek_samples == 0) /* first request, not really a seek */ |
17967 |
++ sdist = 0; |
17968 |
++ else if (bfqq->seek_samples <= 60) /* second & third seek */ |
17969 |
++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); |
17970 |
++ else |
17971 |
++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); |
17972 |
++ |
17973 |
++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; |
17974 |
++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; |
17975 |
++ total = bfqq->seek_total + (bfqq->seek_samples/2); |
17976 |
++ do_div(total, bfqq->seek_samples); |
17977 |
++ bfqq->seek_mean = (sector_t)total; |
17978 |
++ |
17979 |
++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, |
17980 |
++ (u64)bfqq->seek_mean); |
17981 |
++} |
17982 |
++ |
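The seek statistics follow the same pattern: each request's (clamped) seek distance is folded into a 1/256 fixed-point EWMA, whose mean the scheduler's seekiness heuristics build on. A standalone sketch of the averaging step (not part of the patch; the clamping of sdist is omitted):

/* EWMA of seek distances, mirroring bfq_update_io_seektime() above. */
struct seek_sketch {
        unsigned int samples;           /* scaled by 256 */
        unsigned long long total;
        unsigned long long mean;        /* sectors */
};

static void update_seektime(struct seek_sketch *s, unsigned long long sdist)
{
        s->samples = (7 * s->samples + 256) / 8;
        s->total   = (7 * s->total + 256 * sdist) / 8;
        s->mean    = (s->total + s->samples / 2) / s->samples;  /* rounded */
}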
17983 |
++/* |
17984 |
++ * Disable idle window if the process thinks too long or seeks so much that |
17985 |
++ * it doesn't matter. |
17986 |
++ */ |
17987 |
++static void bfq_update_idle_window(struct bfq_data *bfqd, |
17988 |
++ struct bfq_queue *bfqq, |
17989 |
++ struct bfq_io_cq *bic) |
17990 |
++{ |
17991 |
++ int enable_idle; |
17992 |
++ |
17993 |
++ /* Don't idle for async or idle io prio class. */ |
17994 |
++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) |
17995 |
++ return; |
17996 |
++ |
17997 |
++ enable_idle = bfq_bfqq_idle_window(bfqq); |
17998 |
++ |
17999 |
++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || |
18000 |
++ bfqd->bfq_slice_idle == 0 || |
18001 |
++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && |
18002 |
++ bfqq->raising_coeff == 1)) |
18003 |
++ enable_idle = 0; |
18004 |
++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { |
18005 |
++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && |
18006 |
++ bfqq->raising_coeff == 1) |
18007 |
++ enable_idle = 0; |
18008 |
++ else |
18009 |
++ enable_idle = 1; |
18010 |
++ } |
18011 |
++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", |
18012 |
++ enable_idle); |
18013 |
++ |
18014 |
++ if (enable_idle) |
18015 |
++ bfq_mark_bfqq_idle_window(bfqq); |
18016 |
++ else |
18017 |
++ bfq_clear_bfqq_idle_window(bfqq); |
18018 |
++} |
18019 |
++ |
18020 |
++/* |
18021 |
++ * Called when a new fs request (rq) is added to bfqq. Check if there's |
18022 |
++ * something we should do about it. |
18023 |
++ */ |
18024 |
++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
18025 |
++ struct request *rq) |
18026 |
++{ |
18027 |
++ struct bfq_io_cq *bic = RQ_BIC(rq); |
18028 |
++ |
18029 |
++ if (rq->cmd_flags & REQ_META) |
18030 |
++ bfqq->meta_pending++; |
18031 |
++ |
18032 |
++ bfq_update_io_thinktime(bfqd, bic); |
18033 |
++ bfq_update_io_seektime(bfqd, bfqq, rq); |
18034 |
++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
18035 |
++ !BFQQ_SEEKY(bfqq)) |
18036 |
++ bfq_update_idle_window(bfqd, bfqq, bic); |
18037 |
++ |
18038 |
++ bfq_log_bfqq(bfqd, bfqq, |
18039 |
++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", |
18040 |
++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), |
18041 |
++ (long long unsigned)bfqq->seek_mean); |
18042 |
++ |
18043 |
++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); |
18044 |
++ |
18045 |
++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { |
18046 |
++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && |
18047 |
++ blk_rq_sectors(rq) < 32; |
18048 |
++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq); |
18049 |
++ |
18050 |
++ /* |
18051 |
++ * There is just this request queued: if the request |
18052 |
++ * is small and the queue is not to be expired, then |
18053 |
++ * just exit. |
18054 |
++ * |
18055 |
++ * In this way, if the disk is being idled to wait for |
18056 |
++ * a new request from the in-service queue, we avoid |
18057 |
++ * unplugging the device and committing the disk to serve |
18058 |
++ * just a small request. On the contrary, we wait for |
18059 |
++ * the block layer to decide when to unplug the device: |
18060 |
++ * hopefully, new requests will be merged to this one |
18061 |
++ * quickly, then the device will be unplugged and |
18062 |
++ * larger requests will be dispatched. |
18063 |
++ */ |
18064 |
++ if (small_req && !budget_timeout) |
18065 |
++ return; |
18066 |
++ |
18067 |
++ /* |
18068 |
++ * A large enough request arrived, or the queue is to |
18069 |
++ * be expired: in both cases disk idling is to be |
18070 |
++ * stopped, so clear wait_request flag and reset |
18071 |
++ * timer. |
18072 |
++ */ |
18073 |
++ bfq_clear_bfqq_wait_request(bfqq); |
18074 |
++ del_timer(&bfqd->idle_slice_timer); |
18075 |
++ |
18076 |
++ /* |
18077 |
++ * The queue is not empty, because a new request just |
18078 |
++ * arrived. Hence we can safely expire the queue, in |
18079 |
++ * case of budget timeout, without risking that the |
18080 |
++ * timestamps of the queue are not updated correctly. |
18081 |
++ * See [1] for more details. |
18082 |
++ */ |
18083 |
++ if (budget_timeout) |
18084 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); |
18085 |
++ |
18086 |
++ /* |
18087 |
++ * Let the request rip immediately, or let a new queue be |
18088 |
++ * selected if bfqq has just been expired. |
18089 |
++ */ |
18090 |
++ __blk_run_queue(bfqd->queue); |
18091 |
++ } |
18092 |
++} |
18093 |
++ |
18094 |
++static void bfq_insert_request(struct request_queue *q, struct request *rq) |
18095 |
++{ |
18096 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
18097 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
18098 |
++ |
18099 |
++ assert_spin_locked(bfqd->queue->queue_lock); |
18100 |
++ bfq_init_prio_data(bfqq, RQ_BIC(rq)); |
18101 |
++ |
18102 |
++ bfq_add_rq_rb(rq); |
18103 |
++ |
18104 |
++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); |
18105 |
++ list_add_tail(&rq->queuelist, &bfqq->fifo); |
18106 |
++ |
18107 |
++ bfq_rq_enqueued(bfqd, bfqq, rq); |
18108 |
++} |
18109 |
++ |
18110 |
++static void bfq_update_hw_tag(struct bfq_data *bfqd) |
18111 |
++{ |
18112 |
++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, |
18113 |
++ bfqd->rq_in_driver); |
18114 |
++ |
18115 |
++ if (bfqd->hw_tag == 1) |
18116 |
++ return; |
18117 |
++ |
18118 |
++ /* |
18119 |
++ * This sample is valid if the number of outstanding requests |
18120 |
++ * is large enough to allow a queueing behavior. Note that the |
18121 |
++ * sum is not exact, as it's not taking into account deactivated |
18122 |
++ * requests. |
18123 |
++ */ |
18124 |
++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) |
18125 |
++ return; |
18126 |
++ |
18127 |
++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) |
18128 |
++ return; |
18129 |
++ |
18130 |
++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; |
18131 |
++ bfqd->max_rq_in_driver = 0; |
18132 |
++ bfqd->hw_tag_samples = 0; |
18133 |
++} |
18134 |
++ |
18135 |
++static void bfq_completed_request(struct request_queue *q, struct request *rq) |
18136 |
++{ |
18137 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
18138 |
++ struct bfq_data *bfqd = bfqq->bfqd; |
18139 |
++ const int sync = rq_is_sync(rq); |
18140 |
++ |
18141 |
++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", |
18142 |
++ blk_rq_sectors(rq), sync); |
18143 |
++ |
18144 |
++ bfq_update_hw_tag(bfqd); |
18145 |
++ |
18146 |
++ WARN_ON(!bfqd->rq_in_driver); |
18147 |
++ WARN_ON(!bfqq->dispatched); |
18148 |
++ bfqd->rq_in_driver--; |
18149 |
++ bfqq->dispatched--; |
18150 |
++ |
18151 |
++ if (bfq_bfqq_sync(bfqq)) |
18152 |
++ bfqd->sync_flight--; |
18153 |
++ |
18154 |
++ if (sync) |
18155 |
++ RQ_BIC(rq)->ttime.last_end_request = jiffies; |
18156 |
++ |
18157 |
++ /* |
18158 |
++ * The computation of softrt_next_start was scheduled for the next |
18159 |
++ * request completion: it is now time to compute it. |
18160 |
++ */ |
18161 |
++ if (bfq_bfqq_softrt_update(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)) |
18162 |
++ bfqq->soft_rt_next_start = |
18163 |
++ bfq_bfqq_softrt_next_start(bfqd, bfqq); |
18164 |
++ |
18165 |
++ /* |
18166 |
++ * If this is the in-service queue, check if it needs to be expired, |
18167 |
++ * or if we want to idle in case it has no pending requests. |
18168 |
++ */ |
18169 |
++ if (bfqd->in_service_queue == bfqq) { |
18170 |
++ if (bfq_bfqq_budget_new(bfqq)) |
18171 |
++ bfq_set_budget_timeout(bfqd); |
18172 |
++ |
18173 |
++ if (bfq_bfqq_must_idle(bfqq)) { |
18174 |
++ bfq_arm_slice_timer(bfqd); |
18175 |
++ goto out; |
18176 |
++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) |
18177 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); |
18178 |
++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && |
18179 |
++ (bfqq->dispatched == 0 || |
18180 |
++ !bfq_bfqq_must_not_expire(bfqq))) |
18181 |
++ bfq_bfqq_expire(bfqd, bfqq, 0, |
18182 |
++ BFQ_BFQQ_NO_MORE_REQUESTS); |
18183 |
++ } |
18184 |
++ |
18185 |
++ if (!bfqd->rq_in_driver) |
18186 |
++ bfq_schedule_dispatch(bfqd); |
18187 |
++ |
18188 |
++out: |
18189 |
++ return; |
18190 |
++} |
18191 |
++ |
18192 |
++static inline int __bfq_may_queue(struct bfq_queue *bfqq) |
18193 |
++{ |
18194 |
++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { |
18195 |
++ bfq_clear_bfqq_must_alloc(bfqq); |
18196 |
++ return ELV_MQUEUE_MUST; |
18197 |
++ } |
18198 |
++ |
18199 |
++ return ELV_MQUEUE_MAY; |
18200 |
++} |
18201 |
++ |
18202 |
++static int bfq_may_queue(struct request_queue *q, int rw) |
18203 |
++{ |
18204 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
18205 |
++ struct task_struct *tsk = current; |
18206 |
++ struct bfq_io_cq *bic; |
18207 |
++ struct bfq_queue *bfqq; |
18208 |
++ |
18209 |
++ /* |
18210 |
++ * Don't force setup of a queue from here, as a call to may_queue |
18211 |
++ * does not necessarily imply that a request actually will be queued. |
18212 |
++ * So just lookup a possibly existing queue, or return 'may queue' |
18213 |
++ * if that fails. |
18214 |
++ */ |
18215 |
++ bic = bfq_bic_lookup(bfqd, tsk->io_context); |
18216 |
++ if (bic == NULL) |
18217 |
++ return ELV_MQUEUE_MAY; |
18218 |
++ |
18219 |
++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); |
18220 |
++ if (bfqq != NULL) { |
18221 |
++ bfq_init_prio_data(bfqq, bic); |
18222 |
++ |
18223 |
++ return __bfq_may_queue(bfqq); |
18224 |
++ } |
18225 |
++ |
18226 |
++ return ELV_MQUEUE_MAY; |
18227 |
++} |
18228 |
++ |
18229 |
++/* |
18230 |
++ * Queue lock held here. |
18231 |
++ */ |
18232 |
++static void bfq_put_request(struct request *rq) |
18233 |
++{ |
18234 |
++ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
18235 |
++ |
18236 |
++ if (bfqq != NULL) { |
18237 |
++ const int rw = rq_data_dir(rq); |
18238 |
++ |
18239 |
++ BUG_ON(!bfqq->allocated[rw]); |
18240 |
++ bfqq->allocated[rw]--; |
18241 |
++ |
18242 |
++ rq->elv.priv[0] = NULL; |
18243 |
++ rq->elv.priv[1] = NULL; |
18244 |
++ |
18245 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", |
18246 |
++ bfqq, atomic_read(&bfqq->ref)); |
18247 |
++ bfq_put_queue(bfqq); |
18248 |
++ } |
18249 |
++} |
18250 |
++ |
18251 |
++static struct bfq_queue * |
18252 |
++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, |
18253 |
++ struct bfq_queue *bfqq) |
18254 |
++{ |
18255 |
++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", |
18256 |
++ (long unsigned)bfqq->new_bfqq->pid); |
18257 |
++ bic_set_bfqq(bic, bfqq->new_bfqq, 1); |
18258 |
++ bfq_mark_bfqq_coop(bfqq->new_bfqq); |
18259 |
++ bfq_put_queue(bfqq); |
18260 |
++ return bic_to_bfqq(bic, 1); |
18261 |
++} |
18262 |
++ |
18263 |
++/* |
18264 |
++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this |
18265 |
++ * was the last process referring to said bfqq. |
18266 |
++ */ |
18267 |
++static struct bfq_queue * |
18268 |
++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) |
18269 |
++{ |
18270 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); |
18271 |
++ if (bfqq_process_refs(bfqq) == 1) { |
18272 |
++ bfqq->pid = current->pid; |
18273 |
++ bfq_clear_bfqq_coop(bfqq); |
18274 |
++ bfq_clear_bfqq_split_coop(bfqq); |
18275 |
++ return bfqq; |
18276 |
++ } |
18277 |
++ |
18278 |
++ bic_set_bfqq(bic, NULL, 1); |
18279 |
++ |
18280 |
++ bfq_put_cooperator(bfqq); |
18281 |
++ |
18282 |
++ bfq_put_queue(bfqq); |
18283 |
++ return NULL; |
18284 |
++} |
18285 |
++ |
18286 |
++/* |
18287 |
++ * Allocate bfq data structures associated with this request. |
18288 |
++ */ |
18289 |
++static int bfq_set_request(struct request_queue *q, struct request *rq, |
18290 |
++ struct bio *bio, gfp_t gfp_mask) |
18291 |
++{ |
18292 |
++ struct bfq_data *bfqd = q->elevator->elevator_data; |
18293 |
++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); |
18294 |
++ const int rw = rq_data_dir(rq); |
18295 |
++ const int is_sync = rq_is_sync(rq); |
18296 |
++ struct bfq_queue *bfqq; |
18297 |
++ struct bfq_group *bfqg; |
18298 |
++ unsigned long flags; |
18299 |
++ |
18300 |
++ might_sleep_if(gfp_mask & __GFP_WAIT); |
18301 |
++ |
18302 |
++ bfq_changed_ioprio(bic); |
18303 |
++ |
18304 |
++ spin_lock_irqsave(q->queue_lock, flags); |
18305 |
++ |
18306 |
++ if (bic == NULL) |
18307 |
++ goto queue_fail; |
18308 |
++ |
18309 |
++ bfqg = bfq_bic_update_cgroup(bic); |
18310 |
++ |
18311 |
++new_queue: |
18312 |
++ bfqq = bic_to_bfqq(bic, is_sync); |
18313 |
++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { |
18314 |
++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); |
18315 |
++ bic_set_bfqq(bic, bfqq, is_sync); |
18316 |
++ } else { |
18317 |
++ /* |
18318 |
++ * If the queue was seeky for too long, break it apart. |
18319 |
++ */ |
18320 |
++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { |
18321 |
++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); |
18322 |
++ bfqq = bfq_split_bfqq(bic, bfqq); |
18323 |
++ if (!bfqq) |
18324 |
++ goto new_queue; |
18325 |
++ } |
18326 |
++ |
18327 |
++ /* |
18328 |
++ * Check to see if this queue is scheduled to merge with |
18329 |
++ * another closely cooperating queue. The merging of queues |
18330 |
++ * happens here as it must be done in process context. |
18331 |
++ * The reference on new_bfqq was taken in merge_bfqqs. |
18332 |
++ */ |
18333 |
++ if (bfqq->new_bfqq != NULL) |
18334 |
++ bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq); |
18335 |
++ } |
18336 |
++ |
18337 |
++ bfqq->allocated[rw]++; |
18338 |
++ atomic_inc(&bfqq->ref); |
18339 |
++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, |
18340 |
++ atomic_read(&bfqq->ref)); |
18341 |
++ |
18342 |
++ rq->elv.priv[0] = bic; |
18343 |
++ rq->elv.priv[1] = bfqq; |
18344 |
++ |
18345 |
++ spin_unlock_irqrestore(q->queue_lock, flags); |
18346 |
++ |
18347 |
++ return 0; |
18348 |
++ |
18349 |
++queue_fail: |
18350 |
++ bfq_schedule_dispatch(bfqd); |
18351 |
++ spin_unlock_irqrestore(q->queue_lock, flags); |
18352 |
++ |
18353 |
++ return 1; |
18354 |
++} |
18355 |
++ |
18356 |
++static void bfq_kick_queue(struct work_struct *work) |
18357 |
++{ |
18358 |
++ struct bfq_data *bfqd = |
18359 |
++ container_of(work, struct bfq_data, unplug_work); |
18360 |
++ struct request_queue *q = bfqd->queue; |
18361 |
++ |
18362 |
++ spin_lock_irq(q->queue_lock); |
18363 |
++ __blk_run_queue(q); |
18364 |
++ spin_unlock_irq(q->queue_lock); |
18365 |
++} |
18366 |
++ |
18367 |
++/* |
18368 |
++ * Handler of the expiration of the timer running if the in-service queue |
18369 |
++ * is idling inside its time slice. |
18370 |
++ */ |
18371 |
++static void bfq_idle_slice_timer(unsigned long data) |
18372 |
++{ |
18373 |
++ struct bfq_data *bfqd = (struct bfq_data *)data; |
18374 |
++ struct bfq_queue *bfqq; |
18375 |
++ unsigned long flags; |
18376 |
++ enum bfqq_expiration reason; |
18377 |
++ |
18378 |
++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); |
18379 |
++ |
18380 |
++ bfqq = bfqd->in_service_queue; |
18381 |
++ /* |
18382 |
++ * Theoretical race here: the in-service queue can be NULL or different |
18383 |
++ * from the queue that was idling if the timer handler spins on |
18384 |
++ * the queue_lock and a new request arrives for the current |
18385 |
++ * queue and there is a full dispatch cycle that changes the |
18386 |
++ * in-service queue. This can hardly happen, but in the worst case |
18387 |
++ * we just expire a queue too early. |
18388 |
++ */ |
18389 |
++ if (bfqq != NULL) { |
18390 |
++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); |
18391 |
++ if (bfq_bfqq_budget_timeout(bfqq)) |
18392 |
++ /* |
18393 |
++ * Also here the queue can be safely expired |
18394 |
++ * for budget timeout without wasting |
18395 |
++ * guarantees |
18396 |
++ */ |
18397 |
++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; |
18398 |
++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) |
18399 |
++ /* |
18400 |
++ * The queue may not be empty upon timer expiration, |
18401 |
++ * because we may not disable the timer when the first |
18402 |
++ * request of the in-service queue arrives during |
18403 |
++ * disk idling |
18404 |
++ */ |
18405 |
++ reason = BFQ_BFQQ_TOO_IDLE; |
18406 |
++ else |
18407 |
++ goto schedule_dispatch; |
18408 |
++ |
18409 |
++ bfq_bfqq_expire(bfqd, bfqq, 1, reason); |
18410 |
++ } |
18411 |
++ |
18412 |
++schedule_dispatch: |
18413 |
++ bfq_schedule_dispatch(bfqd); |
18414 |
++ |
18415 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); |
18416 |
++} |
18417 |
++ |
18418 |
++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) |
18419 |
++{ |
18420 |
++ del_timer_sync(&bfqd->idle_slice_timer); |
18421 |
++ cancel_work_sync(&bfqd->unplug_work); |
18422 |
++} |
18423 |
++ |
18424 |
++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, |
18425 |
++ struct bfq_queue **bfqq_ptr) |
18426 |
++{ |
18427 |
++ struct bfq_group *root_group = bfqd->root_group; |
18428 |
++ struct bfq_queue *bfqq = *bfqq_ptr; |
18429 |
++ |
18430 |
++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); |
18431 |
++ if (bfqq != NULL) { |
18432 |
++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); |
18433 |
++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", |
18434 |
++ bfqq, atomic_read(&bfqq->ref)); |
18435 |
++ bfq_put_queue(bfqq); |
18436 |
++ *bfqq_ptr = NULL; |
18437 |
++ } |
18438 |
++} |
18439 |
++ |
18440 |
++/* |
18441 |
++ * Release all the bfqg references to its async queues. If we are |
18442 |
++ * deallocating the group these queues may still contain requests, so |
18443 |
++ * we reparent them to the root cgroup (i.e., the only one that will |
18444 |
++ * exist for sure until all the requests on a device are gone). |
18445 |
++ */ |
18446 |
++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) |
18447 |
++{ |
18448 |
++ int i, j; |
18449 |
++ |
18450 |
++ for (i = 0; i < 2; i++) |
18451 |
++ for (j = 0; j < IOPRIO_BE_NR; j++) |
18452 |
++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); |
18453 |
++ |
18454 |
++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); |
18455 |
++} |
18456 |
++ |
18457 |
++static void bfq_exit_queue(struct elevator_queue *e) |
18458 |
++{ |
18459 |
++ struct bfq_data *bfqd = e->elevator_data; |
18460 |
++ struct request_queue *q = bfqd->queue; |
18461 |
++ struct bfq_queue *bfqq, *n; |
18462 |
++ |
18463 |
++ bfq_shutdown_timer_wq(bfqd); |
18464 |
++ |
18465 |
++ spin_lock_irq(q->queue_lock); |
18466 |
++ |
18467 |
++ BUG_ON(bfqd->in_service_queue != NULL); |
18468 |
++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) |
18469 |
++ bfq_deactivate_bfqq(bfqd, bfqq, 0); |
18470 |
++ |
18471 |
++ bfq_disconnect_groups(bfqd); |
18472 |
++ spin_unlock_irq(q->queue_lock); |
18473 |
++ |
18474 |
++ bfq_shutdown_timer_wq(bfqd); |
18475 |
++ |
18476 |
++ synchronize_rcu(); |
18477 |
++ |
18478 |
++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); |
18479 |
++ |
18480 |
++ bfq_free_root_group(bfqd); |
18481 |
++ kfree(bfqd); |
18482 |
++} |
18483 |
++ |
18484 |
++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
18485 |
++{ |
18486 |
++ struct bfq_group *bfqg; |
18487 |
++ struct bfq_data *bfqd; |
18488 |
++ struct elevator_queue *eq; |
18489 |
++ |
18490 |
++ eq = elevator_alloc(q, e); |
18491 |
++ if (eq == NULL) |
18492 |
++ return -ENOMEM; |
18493 |
++ |
18494 |
++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); |
18495 |
++ if (bfqd == NULL) { |
18496 |
++ kobject_put(&eq->kobj); |
18497 |
++ return -ENOMEM; |
18498 |
++ } |
18499 |
++ eq->elevator_data = bfqd; |
18500 |
++ |
18501 |
++ /* |
18502 |
++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. |
18503 |
++ * Grab a permanent reference to it, so that the normal code flow |
18504 |
++ * will not attempt to free it. |
18505 |
++ */ |
18506 |
++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); |
18507 |
++ atomic_inc(&bfqd->oom_bfqq.ref); |
18508 |
++ |
18509 |
++ bfqd->queue = q; |
18510 |
++ |
18511 |
++ spin_lock_irq(q->queue_lock); |
18512 |
++ q->elevator = eq; |
18513 |
++ spin_unlock_irq(q->queue_lock); |
18514 |
++ |
18515 |
++ bfqg = bfq_alloc_root_group(bfqd, q->node); |
18516 |
++ if (bfqg == NULL) { |
18517 |
++ kfree(bfqd); |
18518 |
++ kobject_put(&eq->kobj); |
18519 |
++ return -ENOMEM; |
18520 |
++ } |
18521 |
++ |
18522 |
++ bfqd->root_group = bfqg; |
18523 |
++ |
18524 |
++ init_timer(&bfqd->idle_slice_timer); |
18525 |
++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; |
18526 |
++ bfqd->idle_slice_timer.data = (unsigned long)bfqd; |
18527 |
++ |
18528 |
++ bfqd->rq_pos_tree = RB_ROOT; |
18529 |
++ |
18530 |
++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); |
18531 |
++ |
18532 |
++ INIT_LIST_HEAD(&bfqd->active_list); |
18533 |
++ INIT_LIST_HEAD(&bfqd->idle_list); |
18534 |
++ |
18535 |
++ bfqd->hw_tag = -1; |
18536 |
++ |
18537 |
++ bfqd->bfq_max_budget = bfq_default_max_budget; |
18538 |
++ |
18539 |
++ bfqd->bfq_quantum = bfq_quantum; |
18540 |
++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; |
18541 |
++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; |
18542 |
++ bfqd->bfq_back_max = bfq_back_max; |
18543 |
++ bfqd->bfq_back_penalty = bfq_back_penalty; |
18544 |
++ bfqd->bfq_slice_idle = bfq_slice_idle; |
18545 |
++ bfqd->bfq_class_idle_last_service = 0; |
18546 |
++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; |
18547 |
++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; |
18548 |
++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; |
18549 |
++ |
18550 |
++ bfqd->low_latency = true; |
18551 |
++ |
18552 |
++ bfqd->bfq_raising_coeff = 20; |
18553 |
++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); |
18554 |
++ bfqd->bfq_raising_max_time = 0; |
18555 |
++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); |
18556 |
++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); |
18557 |
++ bfqd->bfq_raising_max_softrt_rate = 7000; /* |
18558 |
++ * Approximate rate required |
18559 |
++ * to playback or record a |
18560 |
++ * high-definition compressed |
18561 |
++ * video. |
18562 |
++ */ |
18563 |
++ bfqd->raised_busy_queues = 0; |
18564 |
++ |
18565 |
++ /* Initially estimate the device's peak rate as the reference rate */ |
18566 |
++ if (blk_queue_nonrot(bfqd->queue)) { |
18567 |
++ bfqd->RT_prod = R_nonrot * T_nonrot; |
18568 |
++ bfqd->peak_rate = R_nonrot; |
18569 |
++ } else { |
18570 |
++ bfqd->RT_prod = R_rot * T_rot; |
18571 |
++ bfqd->peak_rate = R_rot; |
18572 |
++ } |
18573 |
++ |
18574 |
++ return 0; |
18575 |
++} |
18576 |
++ |
18577 |
++static void bfq_slab_kill(void) |
18578 |
++{ |
18579 |
++ if (bfq_pool != NULL) |
18580 |
++ kmem_cache_destroy(bfq_pool); |
18581 |
++} |
18582 |
++ |
18583 |
++static int __init bfq_slab_setup(void) |
18584 |
++{ |
18585 |
++ bfq_pool = KMEM_CACHE(bfq_queue, 0); |
18586 |
++ if (bfq_pool == NULL) |
18587 |
++ return -ENOMEM; |
18588 |
++ return 0; |
18589 |
++} |
18590 |
++ |
18591 |
++static ssize_t bfq_var_show(unsigned int var, char *page) |
18592 |
++{ |
18593 |
++ return sprintf(page, "%d\n", var); |
18594 |
++} |
18595 |
++ |
18596 |
++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) |
18597 |
++{ |
18598 |
++ unsigned long new_val; |
18599 |
++ int ret = kstrtoul(page, 10, &new_val); |
18600 |
++ |
18601 |
++ if (ret == 0) |
18602 |
++ *var = new_val; |
18603 |
++ |
18604 |
++ return count; |
18605 |
++} |
18606 |
++ |
18607 |
++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page) |
18608 |
++{ |
18609 |
++ struct bfq_data *bfqd = e->elevator_data; |
18610 |
++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? |
18611 |
++ jiffies_to_msecs(bfqd->bfq_raising_max_time) : |
18612 |
++ jiffies_to_msecs(bfq_wrais_duration(bfqd))); |
18613 |
++} |
18614 |
++ |
18615 |
++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) |
18616 |
++{ |
18617 |
++ struct bfq_queue *bfqq; |
18618 |
++ struct bfq_data *bfqd = e->elevator_data; |
18619 |
++ ssize_t num_char = 0; |
18620 |
++ |
18621 |
++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", |
18622 |
++ bfqd->queued); |
18623 |
++ |
18624 |
++ spin_lock_irq(bfqd->queue->queue_lock); |
18625 |
++ |
18626 |
++ num_char += sprintf(page + num_char, "Active:\n"); |
18627 |
++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { |
18628 |
++ num_char += sprintf(page + num_char, |
18629 |
++ "pid%d: weight %hu, nr_queued %d %d," |
18630 |
++ " dur %d/%u\n", |
18631 |
++ bfqq->pid, |
18632 |
++ bfqq->entity.weight, |
18633 |
++ bfqq->queued[0], |
18634 |
++ bfqq->queued[1], |
18635 |
++ jiffies_to_msecs(jiffies - |
18636 |
++ bfqq->last_rais_start_finish), |
18637 |
++ jiffies_to_msecs(bfqq->raising_cur_max_time)); |
18638 |
++ } |
18639 |
++ |
18640 |
++ num_char += sprintf(page + num_char, "Idle:\n"); |
18641 |
++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { |
18642 |
++ num_char += sprintf(page + num_char, |
18643 |
++ "pid%d: weight %hu, dur %d/%u\n", |
18644 |
++ bfqq->pid, |
18645 |
++ bfqq->entity.weight, |
18646 |
++ jiffies_to_msecs(jiffies - |
18647 |
++ bfqq->last_rais_start_finish), |
18648 |
++ jiffies_to_msecs(bfqq->raising_cur_max_time)); |
18649 |
++ } |
18650 |
++ |
18651 |
++ spin_unlock_irq(bfqd->queue->queue_lock); |
18652 |
++ |
18653 |
++ return num_char; |
18654 |
++} |
18655 |
++ |
18656 |
++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ |
18657 |
++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ |
18658 |
++{ \ |
18659 |
++ struct bfq_data *bfqd = e->elevator_data; \ |
18660 |
++ unsigned int __data = __VAR; \ |
18661 |
++ if (__CONV) \ |
18662 |
++ __data = jiffies_to_msecs(__data); \ |
18663 |
++ return bfq_var_show(__data, (page)); \ |
18664 |
++} |
18665 |
++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); |
18666 |
++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); |
18667 |
++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); |
18668 |
++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); |
18669 |
++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); |
18670 |
++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); |
18671 |
++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); |
18672 |
++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); |
18673 |
++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); |
18674 |
++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); |
18675 |
++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); |
18676 |
++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); |
18677 |
++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); |
18678 |
++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, |
18679 |
++ 1); |
18680 |
++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, |
18681 |
++ bfqd->bfq_raising_min_inter_arr_async, |
18682 |
++ 1); |
18683 |
++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, |
18684 |
++ bfqd->bfq_raising_max_softrt_rate, 0); |
18685 |
++#undef SHOW_FUNCTION |
18686 |
++ |
18687 |
++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
18688 |
++static ssize_t \ |
18689 |
++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ |
18690 |
++{ \ |
18691 |
++ struct bfq_data *bfqd = e->elevator_data; \ |
18692 |
++ unsigned long uninitialized_var(__data); \ |
18693 |
++ int ret = bfq_var_store(&__data, (page), count); \ |
18694 |
++ if (__data < (MIN)) \ |
18695 |
++ __data = (MIN); \ |
18696 |
++ else if (__data > (MAX)) \ |
18697 |
++ __data = (MAX); \ |
18698 |
++ if (__CONV) \ |
18699 |
++ *(__PTR) = msecs_to_jiffies(__data); \ |
18700 |
++ else \ |
18701 |
++ *(__PTR) = __data; \ |
18702 |
++ return ret; \ |
18703 |
++} |
18704 |
++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); |
18705 |
++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, |
18706 |
++ INT_MAX, 1); |
18707 |
++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, |
18708 |
++ INT_MAX, 1); |
18709 |
++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); |
18710 |
++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, |
18711 |
++ INT_MAX, 0); |
18712 |
++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); |
18713 |
++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, |
18714 |
++ 1, INT_MAX, 0); |
18715 |
++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, |
18716 |
++ INT_MAX, 1); |
18717 |
++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, |
18718 |
++ INT_MAX, 0); |
18719 |
++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, |
18720 |
++ INT_MAX, 1); |
18721 |
++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, |
18722 |
++ INT_MAX, 1); |
18723 |
++STORE_FUNCTION(bfq_raising_min_idle_time_store, |
18724 |
++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); |
18725 |
++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, |
18726 |
++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); |
18727 |
++STORE_FUNCTION(bfq_raising_max_softrt_rate_store, |
18728 |
++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); |
18729 |
++#undef STORE_FUNCTION |
18730 |
++ |
18731 |
++/* do nothing for the moment */ |
18732 |
++static ssize_t bfq_weights_store(struct elevator_queue *e, |
18733 |
++ const char *page, size_t count) |
18734 |
++{ |
18735 |
++ return count; |
18736 |
++} |
18737 |
++ |
18738 |
++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) |
18739 |
++{ |
18740 |
++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); |
18741 |
++ |
18742 |
++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) |
18743 |
++ return bfq_calc_max_budget(bfqd->peak_rate, timeout); |
18744 |
++ else |
18745 |
++ return bfq_default_max_budget; |
18746 |
++} |
18747 |
++ |
18748 |
++static ssize_t bfq_max_budget_store(struct elevator_queue *e, |
18749 |
++ const char *page, size_t count) |
18750 |
++{ |
18751 |
++ struct bfq_data *bfqd = e->elevator_data; |
18752 |
++ unsigned long uninitialized_var(__data); |
18753 |
++ int ret = bfq_var_store(&__data, (page), count); |
18754 |
++ |
18755 |
++ if (__data == 0) |
18756 |
++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
18757 |
++ else { |
18758 |
++ if (__data > INT_MAX) |
18759 |
++ __data = INT_MAX; |
18760 |
++ bfqd->bfq_max_budget = __data; |
18761 |
++ } |
18762 |
++ |
18763 |
++ bfqd->bfq_user_max_budget = __data; |
18764 |
++ |
18765 |
++ return ret; |
18766 |
++} |
18767 |
++ |
18768 |
++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, |
18769 |
++ const char *page, size_t count) |
18770 |
++{ |
18771 |
++ struct bfq_data *bfqd = e->elevator_data; |
18772 |
++ unsigned long uninitialized_var(__data); |
18773 |
++ int ret = bfq_var_store(&__data, (page), count); |
18774 |
++ |
18775 |
++ if (__data < 1) |
18776 |
++ __data = 1; |
18777 |
++ else if (__data > INT_MAX) |
18778 |
++ __data = INT_MAX; |
18779 |
++ |
18780 |
++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); |
18781 |
++ if (bfqd->bfq_user_max_budget == 0) |
18782 |
++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
18783 |
++ |
18784 |
++ return ret; |
18785 |
++} |
18786 |
++ |
18787 |
++static ssize_t bfq_low_latency_store(struct elevator_queue *e, |
18788 |
++ const char *page, size_t count) |
18789 |
++{ |
18790 |
++ struct bfq_data *bfqd = e->elevator_data; |
18791 |
++ unsigned long uninitialized_var(__data); |
18792 |
++ int ret = bfq_var_store(&__data, (page), count); |
18793 |
++ |
18794 |
++ if (__data > 1) |
18795 |
++ __data = 1; |
18796 |
++ if (__data == 0 && bfqd->low_latency != 0) |
18797 |
++ bfq_end_raising(bfqd); |
18798 |
++ bfqd->low_latency = __data; |
18799 |
++ |
18800 |
++ return ret; |
18801 |
++} |
18802 |
++ |
18803 |
++#define BFQ_ATTR(name) \ |
18804 |
++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) |
18805 |
++ |
18806 |
++static struct elv_fs_entry bfq_attrs[] = { |
18807 |
++ BFQ_ATTR(quantum), |
18808 |
++ BFQ_ATTR(fifo_expire_sync), |
18809 |
++ BFQ_ATTR(fifo_expire_async), |
18810 |
++ BFQ_ATTR(back_seek_max), |
18811 |
++ BFQ_ATTR(back_seek_penalty), |
18812 |
++ BFQ_ATTR(slice_idle), |
18813 |
++ BFQ_ATTR(max_budget), |
18814 |
++ BFQ_ATTR(max_budget_async_rq), |
18815 |
++ BFQ_ATTR(timeout_sync), |
18816 |
++ BFQ_ATTR(timeout_async), |
18817 |
++ BFQ_ATTR(low_latency), |
18818 |
++ BFQ_ATTR(raising_coeff), |
18819 |
++ BFQ_ATTR(raising_max_time), |
18820 |
++ BFQ_ATTR(raising_rt_max_time), |
18821 |
++ BFQ_ATTR(raising_min_idle_time), |
18822 |
++ BFQ_ATTR(raising_min_inter_arr_async), |
18823 |
++ BFQ_ATTR(raising_max_softrt_rate), |
18824 |
++ BFQ_ATTR(weights), |
18825 |
++ __ATTR_NULL |
18826 |
++}; |
18827 |
++ |
18828 |
++static struct elevator_type iosched_bfq = { |
18829 |
++ .ops = { |
18830 |
++ .elevator_merge_fn = bfq_merge, |
18831 |
++ .elevator_merged_fn = bfq_merged_request, |
18832 |
++ .elevator_merge_req_fn = bfq_merged_requests, |
18833 |
++ .elevator_allow_merge_fn = bfq_allow_merge, |
18834 |
++ .elevator_dispatch_fn = bfq_dispatch_requests, |
18835 |
++ .elevator_add_req_fn = bfq_insert_request, |
18836 |
++ .elevator_activate_req_fn = bfq_activate_request, |
18837 |
++ .elevator_deactivate_req_fn = bfq_deactivate_request, |
18838 |
++ .elevator_completed_req_fn = bfq_completed_request, |
18839 |
++ .elevator_former_req_fn = elv_rb_former_request, |
18840 |
++ .elevator_latter_req_fn = elv_rb_latter_request, |
18841 |
++ .elevator_init_icq_fn = bfq_init_icq, |
18842 |
++ .elevator_exit_icq_fn = bfq_exit_icq, |
18843 |
++ .elevator_set_req_fn = bfq_set_request, |
18844 |
++ .elevator_put_req_fn = bfq_put_request, |
18845 |
++ .elevator_may_queue_fn = bfq_may_queue, |
18846 |
++ .elevator_init_fn = bfq_init_queue, |
18847 |
++ .elevator_exit_fn = bfq_exit_queue, |
18848 |
++ }, |
18849 |
++ .icq_size = sizeof(struct bfq_io_cq), |
18850 |
++ .icq_align = __alignof__(struct bfq_io_cq), |
18851 |
++ .elevator_attrs = bfq_attrs, |
18852 |
++ .elevator_name = "bfq", |
18853 |
++ .elevator_owner = THIS_MODULE, |
18854 |
++}; |
18855 |
++ |
18856 |
++static int __init bfq_init(void) |
18857 |
++{ |
18858 |
++ /* |
18859 |
++ * Can be 0 on HZ < 1000 setups. |
18860 |
++ */ |
18861 |
++ if (bfq_slice_idle == 0) |
18862 |
++ bfq_slice_idle = 1; |
18863 |
++ |
18864 |
++ if (bfq_timeout_async == 0) |
18865 |
++ bfq_timeout_async = 1; |
18866 |
++ |
18867 |
++ if (bfq_slab_setup()) |
18868 |
++ return -ENOMEM; |
18869 |
++ |
18870 |
++ elv_register(&iosched_bfq); |
18871 |
++ printk(KERN_INFO "BFQ I/O-scheduler version: v7r1"); |
18872 |
++ |
18873 |
++ return 0; |
18874 |
++} |
18875 |
++ |
18876 |
++static void __exit bfq_exit(void) |
18877 |
++{ |
18878 |
++ elv_unregister(&iosched_bfq); |
18879 |
++ bfq_slab_kill(); |
18880 |
++} |
18881 |
++ |
18882 |
++module_init(bfq_init); |
18883 |
++module_exit(bfq_exit); |
18884 |
++ |
18885 |
++MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); |
18886 |
++MODULE_LICENSE("GPL"); |
18887 |
++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); |
18888 |
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c |
18889 |
+new file mode 100644 |
18890 |
+index 0000000..999b475 |
18891 |
+--- /dev/null |
18892 |
++++ b/block/bfq-sched.c |
18893 |
+@@ -0,0 +1,1078 @@ |
18894 |
++/* |
18895 |
++ * BFQ: Hierarchical B-WF2Q+ scheduler. |
18896 |
++ * |
18897 |
++ * Based on ideas and code from CFQ: |
18898 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
18899 |
++ * |
18900 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
18901 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
18902 |
++ * |
18903 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
18904 |
++ */ |
18905 |
++ |
18906 |
++#ifdef CONFIG_CGROUP_BFQIO |
18907 |
++#define for_each_entity(entity) \ |
18908 |
++ for (; entity != NULL; entity = entity->parent) |
18909 |
++ |
18910 |
++#define for_each_entity_safe(entity, parent) \ |
18911 |
++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) |
18912 |
++ |
18913 |
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
18914 |
++ int extract, |
18915 |
++ struct bfq_data *bfqd); |
18916 |
++ |
18917 |
++static inline void bfq_update_budget(struct bfq_entity *next_in_service) |
18918 |
++{ |
18919 |
++ struct bfq_entity *bfqg_entity; |
18920 |
++ struct bfq_group *bfqg; |
18921 |
++ struct bfq_sched_data *group_sd; |
18922 |
++ |
18923 |
++ BUG_ON(next_in_service == NULL); |
18924 |
++ |
18925 |
++ group_sd = next_in_service->sched_data; |
18926 |
++ |
18927 |
++ bfqg = container_of(group_sd, struct bfq_group, sched_data); |
18928 |
++ /* |
18929 |
++ * bfq_group's my_entity field is not NULL only if the group |
18930 |
++ * is not the root group. We must not touch the root entity |
18931 |
++ * as it must never become an in-service entity. |
18932 |
++ */ |
18933 |
++ bfqg_entity = bfqg->my_entity; |
18934 |
++ if (bfqg_entity != NULL) |
18935 |
++ bfqg_entity->budget = next_in_service->budget; |
18936 |
++} |
18937 |
++ |
18938 |
++static int bfq_update_next_in_service(struct bfq_sched_data *sd) |
18939 |
++{ |
18940 |
++ struct bfq_entity *next_in_service; |
18941 |
++ |
18942 |
++ if (sd->in_service_entity != NULL) |
18943 |
++ /* will update/requeue at the end of service */ |
18944 |
++ return 0; |
18945 |
++ |
18946 |
++ /* |
18947 |
++ * NOTE: this can be improved in many ways, such as returning |
18948 |
++ * 1 (and thus propagating upwards the update) only when the |
18949 |
++ * budget changes, or caching the bfqq that will be scheduled |
18950 |
++ * next from this subtree. By now we worry more about |
18951 |
++ * correctness than about performance... |
18952 |
++ */ |
18953 |
++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL); |
18954 |
++ sd->next_in_service = next_in_service; |
18955 |
++ |
18956 |
++ if (next_in_service != NULL) |
18957 |
++ bfq_update_budget(next_in_service); |
18958 |
++ |
18959 |
++ return 1; |
18960 |
++} |
18961 |
++ |
18962 |
++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, |
18963 |
++ struct bfq_entity *entity) |
18964 |
++{ |
18965 |
++ BUG_ON(sd->next_in_service != entity); |
18966 |
++} |
18967 |
++#else |
18968 |
++#define for_each_entity(entity) \ |
18969 |
++ for (; entity != NULL; entity = NULL) |
18970 |
++ |
18971 |
++#define for_each_entity_safe(entity, parent) \ |
18972 |
++ for (parent = NULL; entity != NULL; entity = parent) |
18973 |
++ |
18974 |
++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) |
18975 |
++{ |
18976 |
++ return 0; |
18977 |
++} |
18978 |
++ |
18979 |
++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, |
18980 |
++ struct bfq_entity *entity) |
18981 |
++{ |
18982 |
++} |
18983 |
++ |
18984 |
++static inline void bfq_update_budget(struct bfq_entity *next_in_service) |
18985 |
++{ |
18986 |
++} |
18987 |
++#endif |
18988 |
++ |
18989 |
++/* |
18990 |
++ * Shift for timestamp calculations. This actually limits the maximum |
18991 |
++ * service allowed in one timestamp delta (small shift values increase it), |
18992 |
++ * the maximum total weight that can be used for the queues in the system |
18993 |
++ * (big shift values increase it), and the period of virtual time wraparounds. |
18994 |
++ */ |
18995 |
++#define WFQ_SERVICE_SHIFT 22 |
18996 |
++ |
18997 |
++/** |
18998 |
++ * bfq_gt - compare two timestamps. |
18999 |
++ * @a: first ts. |
19000 |
++ * @b: second ts. |
19001 |
++ * |
19002 |
++ * Return @a > @b, dealing with wrapping correctly. |
19003 |
++ */ |
19004 |
++static inline int bfq_gt(u64 a, u64 b) |
19005 |
++{ |
19006 |
++ return (s64)(a - b) > 0; |
19007 |
++} |
19008 |
++ |
19009 |
++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) |
19010 |
++{ |
19011 |
++ struct bfq_queue *bfqq = NULL; |
19012 |
++ |
19013 |
++ BUG_ON(entity == NULL); |
19014 |
++ |
19015 |
++ if (entity->my_sched_data == NULL) |
19016 |
++ bfqq = container_of(entity, struct bfq_queue, entity); |
19017 |
++ |
19018 |
++ return bfqq; |
19019 |
++} |
19020 |
++ |
19021 |
++ |
19022 |
++/** |
19023 |
++ * bfq_delta - map service into the virtual time domain. |
19024 |
++ * @service: amount of service. |
19025 |
++ * @weight: scale factor (weight of an entity or weight sum). |
19026 |
++ */ |
19027 |
++static inline u64 bfq_delta(unsigned long service, |
19028 |
++ unsigned long weight) |
19029 |
++{ |
19030 |
++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; |
19031 |
++ |
19032 |
++ do_div(d, weight); |
19033 |
++ return d; |
19034 |
++} |
19035 |
++ |
19036 |
++/** |
19037 |
++ * bfq_calc_finish - assign the finish time to an entity. |
19038 |
++ * @entity: the entity to act upon. |
19039 |
++ * @service: the service to be charged to the entity. |
19040 |
++ */ |
19041 |
++static inline void bfq_calc_finish(struct bfq_entity *entity, |
19042 |
++ unsigned long service) |
19043 |
++{ |
19044 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
19045 |
++ |
19046 |
++ BUG_ON(entity->weight == 0); |
19047 |
++ |
19048 |
++ entity->finish = entity->start + |
19049 |
++ bfq_delta(service, entity->weight); |
19050 |
++ |
19051 |
++ if (bfqq != NULL) { |
19052 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
19053 |
++ "calc_finish: serv %lu, w %d", |
19054 |
++ service, entity->weight); |
19055 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, |
19056 |
++ "calc_finish: start %llu, finish %llu, delta %llu", |
19057 |
++ entity->start, entity->finish, |
19058 |
++ bfq_delta(service, entity->weight)); |
19059 |
++ } |
19060 |
++} |
19061 |
++ |
19062 |
++/** |
19063 |
++ * bfq_entity_of - get an entity from a node. |
19064 |
++ * @node: the node field of the entity. |
19065 |
++ * |
19066 |
++ * Convert a node pointer to the relative entity. This is used only |
19067 |
++ * to simplify the logic of some functions and not as the generic |
19068 |
++ * conversion mechanism because, e.g., in the tree walking functions, |
19069 |
++ * the check for a %NULL value would be redundant. |
19070 |
++ */ |
19071 |
++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) |
19072 |
++{ |
19073 |
++ struct bfq_entity *entity = NULL; |
19074 |
++ |
19075 |
++ if (node != NULL) |
19076 |
++ entity = rb_entry(node, struct bfq_entity, rb_node); |
19077 |
++ |
19078 |
++ return entity; |
19079 |
++} |
19080 |
++ |
19081 |
++/** |
19082 |
++ * bfq_extract - remove an entity from a tree. |
19083 |
++ * @root: the tree root. |
19084 |
++ * @entity: the entity to remove. |
19085 |
++ */ |
19086 |
++static inline void bfq_extract(struct rb_root *root, |
19087 |
++ struct bfq_entity *entity) |
19088 |
++{ |
19089 |
++ BUG_ON(entity->tree != root); |
19090 |
++ |
19091 |
++ entity->tree = NULL; |
19092 |
++ rb_erase(&entity->rb_node, root); |
19093 |
++} |
19094 |
++ |
19095 |
++/** |
19096 |
++ * bfq_idle_extract - extract an entity from the idle tree. |
19097 |
++ * @st: the service tree of the owning @entity. |
19098 |
++ * @entity: the entity being removed. |
19099 |
++ */ |
19100 |
++static void bfq_idle_extract(struct bfq_service_tree *st, |
19101 |
++ struct bfq_entity *entity) |
19102 |
++{ |
19103 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
19104 |
++ struct rb_node *next; |
19105 |
++ |
19106 |
++ BUG_ON(entity->tree != &st->idle); |
19107 |
++ |
19108 |
++ if (entity == st->first_idle) { |
19109 |
++ next = rb_next(&entity->rb_node); |
19110 |
++ st->first_idle = bfq_entity_of(next); |
19111 |
++ } |
19112 |
++ |
19113 |
++ if (entity == st->last_idle) { |
19114 |
++ next = rb_prev(&entity->rb_node); |
19115 |
++ st->last_idle = bfq_entity_of(next); |
19116 |
++ } |
19117 |
++ |
19118 |
++ bfq_extract(&st->idle, entity); |
19119 |
++ |
19120 |
++ if (bfqq != NULL) |
19121 |
++ list_del(&bfqq->bfqq_list); |
19122 |
++} |
19123 |
++ |
19124 |
++/** |
19125 |
++ * bfq_insert - generic tree insertion. |
19126 |
++ * @root: tree root. |
19127 |
++ * @entity: entity to insert. |
19128 |
++ * |
19129 |
++ * This is used for the idle and the active tree, since they are both |
19130 |
++ * ordered by finish time. |
19131 |
++ */ |
19132 |
++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) |
19133 |
++{ |
19134 |
++ struct bfq_entity *entry; |
19135 |
++ struct rb_node **node = &root->rb_node; |
19136 |
++ struct rb_node *parent = NULL; |
19137 |
++ |
19138 |
++ BUG_ON(entity->tree != NULL); |
19139 |
++ |
19140 |
++ while (*node != NULL) { |
19141 |
++ parent = *node; |
19142 |
++ entry = rb_entry(parent, struct bfq_entity, rb_node); |
19143 |
++ |
19144 |
++ if (bfq_gt(entry->finish, entity->finish)) |
19145 |
++ node = &parent->rb_left; |
19146 |
++ else |
19147 |
++ node = &parent->rb_right; |
19148 |
++ } |
19149 |
++ |
19150 |
++ rb_link_node(&entity->rb_node, parent, node); |
19151 |
++ rb_insert_color(&entity->rb_node, root); |
19152 |
++ |
19153 |
++ entity->tree = root; |
19154 |
++} |
19155 |
++ |
19156 |
++/** |
19157 |
++ * bfq_update_min - update the min_start field of an entity. |
19158 |
++ * @entity: the entity to update. |
19159 |
++ * @node: one of its children. |
19160 |
++ * |
19161 |
++ * This function is called when @entity may store an invalid value for |
19162 |
++ * min_start due to updates to the active tree. The function assumes |
19163 |
++ * that the subtree rooted at @node (which may be its left or its right |
19164 |
++ * child) has a valid min_start value. |
19165 |
++ */ |
19166 |
++static inline void bfq_update_min(struct bfq_entity *entity, |
19167 |
++ struct rb_node *node) |
19168 |
++{ |
19169 |
++ struct bfq_entity *child; |
19170 |
++ |
19171 |
++ if (node != NULL) { |
19172 |
++ child = rb_entry(node, struct bfq_entity, rb_node); |
19173 |
++ if (bfq_gt(entity->min_start, child->min_start)) |
19174 |
++ entity->min_start = child->min_start; |
19175 |
++ } |
19176 |
++} |
19177 |
++ |
19178 |
++/** |
19179 |
++ * bfq_update_active_node - recalculate min_start. |
19180 |
++ * @node: the node to update. |
19181 |
++ * |
19182 |
++ * @node may have changed position or one of its children may have moved, |
19183 |
++ * this function updates its min_start value. The left and right subtrees |
19184 |
++ * are assumed to hold a correct min_start value. |
19185 |
++ */ |
19186 |
++static inline void bfq_update_active_node(struct rb_node *node) |
19187 |
++{ |
19188 |
++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); |
19189 |
++ |
19190 |
++ entity->min_start = entity->start; |
19191 |
++ bfq_update_min(entity, node->rb_right); |
19192 |
++ bfq_update_min(entity, node->rb_left); |
19193 |
++} |
19194 |
++ |
19195 |
++/** |
19196 |
++ * bfq_update_active_tree - update min_start for the whole active tree. |
19197 |
++ * @node: the starting node. |
19198 |
++ * |
19199 |
++ * @node must be the deepest modified node after an update. This function |
19200 |
++ * updates its min_start using the values held by its children, assuming |
19201 |
++ * that they did not change, and then updates all the nodes that may have |
19202 |
++ * changed in the path to the root. The only nodes that may have changed |
19203 |
++ * are the ones in the path or their siblings. |
19204 |
++ */ |
19205 |
++static void bfq_update_active_tree(struct rb_node *node) |
19206 |
++{ |
19207 |
++ struct rb_node *parent; |
19208 |
++ |
19209 |
++up: |
19210 |
++ bfq_update_active_node(node); |
19211 |
++ |
19212 |
++ parent = rb_parent(node); |
19213 |
++ if (parent == NULL) |
19214 |
++ return; |
19215 |
++ |
19216 |
++ if (node == parent->rb_left && parent->rb_right != NULL) |
19217 |
++ bfq_update_active_node(parent->rb_right); |
19218 |
++ else if (parent->rb_left != NULL) |
19219 |
++ bfq_update_active_node(parent->rb_left); |
19220 |
++ |
19221 |
++ node = parent; |
19222 |
++ goto up; |
19223 |
++} |
19224 |
++ |
19225 |
++/** |
19226 |
++ * bfq_active_insert - insert an entity in the active tree of its group/device. |
19227 |
++ * @st: the service tree of the entity. |
19228 |
++ * @entity: the entity being inserted. |
19229 |
++ * |
19230 |
++ * The active tree is ordered by finish time, but an extra key is kept |
19231 |
++ * for each node, containing the minimum value for the start times of |
19232 |
++ * its children (and the node itself), so it's possible to search for |
19233 |
++ * the eligible node with the lowest finish time in logarithmic time. |
19234 |
++ */ |
19235 |
++static void bfq_active_insert(struct bfq_service_tree *st, |
19236 |
++ struct bfq_entity *entity) |
19237 |
++{ |
19238 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
19239 |
++ struct rb_node *node = &entity->rb_node; |
19240 |
++ |
19241 |
++ bfq_insert(&st->active, entity); |
19242 |
++ |
19243 |
++ if (node->rb_left != NULL) |
19244 |
++ node = node->rb_left; |
19245 |
++ else if (node->rb_right != NULL) |
19246 |
++ node = node->rb_right; |
19247 |
++ |
19248 |
++ bfq_update_active_tree(node); |
19249 |
++ |
19250 |
++ if (bfqq != NULL) |
19251 |
++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); |
19252 |
++} |
19253 |
++ |
19254 |
++/** |
19255 |
++ * bfq_ioprio_to_weight - calc a weight from an ioprio. |
19256 |
++ * @ioprio: the ioprio value to convert. |
19257 |
++ */ |
19258 |
++static unsigned short bfq_ioprio_to_weight(int ioprio) |
19259 |
++{ |
19260 |
++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); |
19261 |
++ return IOPRIO_BE_NR - ioprio; |
19262 |
++} |
19263 |
++ |
19264 |
++/** |
19265 |
++ * bfq_weight_to_ioprio - calc an ioprio from a weight. |
19266 |
++ * @weight: the weight value to convert. |
19267 |
++ * |
19268 |
++ * To preserve as mush as possible the old only-ioprio user interface, |
19269 |
++ * 0 is used as an escape ioprio value for weights (numerically) equal or |
19270 |
++ * larger than IOPRIO_BE_NR |
19271 |
++ */ |
19272 |
++static unsigned short bfq_weight_to_ioprio(int weight) |
19273 |
++{ |
19274 |
++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); |
19275 |
++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; |
19276 |
++} |
19277 |
++ |
19278 |
++static inline void bfq_get_entity(struct bfq_entity *entity) |
19279 |
++{ |
19280 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
19281 |
++ struct bfq_sched_data *sd; |
19282 |
++ |
19283 |
++ if (bfqq != NULL) { |
19284 |
++ sd = entity->sched_data; |
19285 |
++ atomic_inc(&bfqq->ref); |
19286 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", |
19287 |
++ bfqq, atomic_read(&bfqq->ref)); |
19288 |
++ } |
19289 |
++} |
19290 |
++ |
19291 |
++/** |
19292 |
++ * bfq_find_deepest - find the deepest node that an extraction can modify. |
19293 |
++ * @node: the node being removed. |
19294 |
++ * |
19295 |
++ * Do the first step of an extraction in an rb tree, looking for the |
19296 |
++ * node that will replace @node, and returning the deepest node that |
19297 |
++ * the following modifications to the tree can touch. If @node is the |
19298 |
++ * last node in the tree return %NULL. |
19299 |
++ */ |
19300 |
++static struct rb_node *bfq_find_deepest(struct rb_node *node) |
19301 |
++{ |
19302 |
++ struct rb_node *deepest; |
19303 |
++ |
19304 |
++ if (node->rb_right == NULL && node->rb_left == NULL) |
19305 |
++ deepest = rb_parent(node); |
19306 |
++ else if (node->rb_right == NULL) |
19307 |
++ deepest = node->rb_left; |
19308 |
++ else if (node->rb_left == NULL) |
19309 |
++ deepest = node->rb_right; |
19310 |
++ else { |
19311 |
++ deepest = rb_next(node); |
19312 |
++ if (deepest->rb_right != NULL) |
19313 |
++ deepest = deepest->rb_right; |
19314 |
++ else if (rb_parent(deepest) != node) |
19315 |
++ deepest = rb_parent(deepest); |
19316 |
++ } |
19317 |
++ |
19318 |
++ return deepest; |
19319 |
++} |
19320 |
++ |
19321 |
++/** |
19322 |
++ * bfq_active_extract - remove an entity from the active tree. |
19323 |
++ * @st: the service_tree containing the tree. |
19324 |
++ * @entity: the entity being removed. |
19325 |
++ */ |
19326 |
++static void bfq_active_extract(struct bfq_service_tree *st, |
19327 |
++ struct bfq_entity *entity) |
19328 |
++{ |
19329 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
19330 |
++ struct rb_node *node; |
19331 |
++ |
19332 |
++ node = bfq_find_deepest(&entity->rb_node); |
19333 |
++ bfq_extract(&st->active, entity); |
19334 |
++ |
19335 |
++ if (node != NULL) |
19336 |
++ bfq_update_active_tree(node); |
19337 |
++ |
19338 |
++ if (bfqq != NULL) |
19339 |
++ list_del(&bfqq->bfqq_list); |
19340 |
++} |
19341 |
++ |
19342 |
++/** |
19343 |
++ * bfq_idle_insert - insert an entity into the idle tree. |
19344 |
++ * @st: the service tree containing the tree. |
19345 |
++ * @entity: the entity to insert. |
19346 |
++ */ |
19347 |
++static void bfq_idle_insert(struct bfq_service_tree *st, |
19348 |
++ struct bfq_entity *entity) |
19349 |
++{ |
19350 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
19351 |
++ struct bfq_entity *first_idle = st->first_idle; |
19352 |
++ struct bfq_entity *last_idle = st->last_idle; |
19353 |
++ |
19354 |
++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) |
19355 |
++ st->first_idle = entity; |
19356 |
++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) |
19357 |
++ st->last_idle = entity; |
19358 |
++ |
19359 |
++ bfq_insert(&st->idle, entity); |
19360 |
++ |
19361 |
++ if (bfqq != NULL) |
19362 |
++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); |
19363 |
++} |
19364 |
++ |
19365 |
++/** |
19366 |
++ * bfq_forget_entity - remove an entity from the wfq trees. |
19367 |
++ * @st: the service tree. |
19368 |
++ * @entity: the entity being removed. |
19369 |
++ * |
19370 |
++ * Update the device status and forget everything about @entity, putting |
19371 |
++ * the device reference to it, if it is a queue. Entities belonging to |
19372 |
++ * groups are not refcounted. |
19373 |
++ */ |
19374 |
++static void bfq_forget_entity(struct bfq_service_tree *st, |
19375 |
++ struct bfq_entity *entity) |
19376 |
++{ |
19377 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
19378 |
++ struct bfq_sched_data *sd; |
19379 |
++ |
19380 |
++ BUG_ON(!entity->on_st); |
19381 |
++ |
19382 |
++ entity->on_st = 0; |
19383 |
++ st->wsum -= entity->weight; |
19384 |
++ if (bfqq != NULL) { |
19385 |
++ sd = entity->sched_data; |
19386 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", |
19387 |
++ bfqq, atomic_read(&bfqq->ref)); |
19388 |
++ bfq_put_queue(bfqq); |
19389 |
++ } |
19390 |
++} |
19391 |
++ |
19392 |
++/** |
19393 |
++ * bfq_put_idle_entity - release the idle tree ref of an entity. |
19394 |
++ * @st: service tree for the entity. |
19395 |
++ * @entity: the entity being released. |
19396 |
++ */ |
19397 |
++static void bfq_put_idle_entity(struct bfq_service_tree *st, |
19398 |
++ struct bfq_entity *entity) |
19399 |
++{ |
19400 |
++ bfq_idle_extract(st, entity); |
19401 |
++ bfq_forget_entity(st, entity); |
19402 |
++} |
19403 |
++ |
19404 |
++/** |
19405 |
++ * bfq_forget_idle - update the idle tree if necessary. |
19406 |
++ * @st: the service tree to act upon. |
19407 |
++ * |
19408 |
++ * To preserve the global O(log N) complexity we only remove one entry here; |
19409 |
++ * as the idle tree will not grow indefinitely this can be done safely. |
19410 |
++ */ |
19411 |
++static void bfq_forget_idle(struct bfq_service_tree *st) |
19412 |
++{ |
19413 |
++ struct bfq_entity *first_idle = st->first_idle; |
19414 |
++ struct bfq_entity *last_idle = st->last_idle; |
19415 |
++ |
19416 |
++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && |
19417 |
++ !bfq_gt(last_idle->finish, st->vtime)) { |
19418 |
++ /* |
19419 |
++ * Forget the whole idle tree, increasing the vtime past |
19420 |
++ * the last finish time of idle entities. |
19421 |
++ */ |
19422 |
++ st->vtime = last_idle->finish; |
19423 |
++ } |
19424 |
++ |
19425 |
++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) |
19426 |
++ bfq_put_idle_entity(st, first_idle); |
19427 |
++} |
19428 |
++ |
19429 |
++static struct bfq_service_tree * |
19430 |
++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, |
19431 |
++ struct bfq_entity *entity) |
19432 |
++{ |
19433 |
++ struct bfq_service_tree *new_st = old_st; |
19434 |
++ |
19435 |
++ if (entity->ioprio_changed) { |
19436 |
++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
19437 |
++ |
19438 |
++ BUG_ON(old_st->wsum < entity->weight); |
19439 |
++ old_st->wsum -= entity->weight; |
19440 |
++ |
19441 |
++ if (entity->new_weight != entity->orig_weight) { |
19442 |
++ entity->orig_weight = entity->new_weight; |
19443 |
++ entity->ioprio = |
19444 |
++ bfq_weight_to_ioprio(entity->orig_weight); |
19445 |
++ } else if (entity->new_ioprio != entity->ioprio) { |
19446 |
++ entity->ioprio = entity->new_ioprio; |
19447 |
++ entity->orig_weight = |
19448 |
++ bfq_ioprio_to_weight(entity->ioprio); |
19449 |
++ } else |
19450 |
++ entity->new_weight = entity->orig_weight = |
19451 |
++ bfq_ioprio_to_weight(entity->ioprio); |
19452 |
++ |
19453 |
++ entity->ioprio_class = entity->new_ioprio_class; |
19454 |
++ entity->ioprio_changed = 0; |
19455 |
++ |
19456 |
++ /* |
19457 |
++ * NOTE: here we may be changing the weight too early, |
19458 |
++ * this will cause unfairness. The correct approach |
19459 |
++ * would have required additional complexity to defer |
19460 |
++ * weight changes to the proper time instants (i.e., |
19461 |
++ * when entity->finish <= old_st->vtime). |
19462 |
++ */ |
19463 |
++ new_st = bfq_entity_service_tree(entity); |
19464 |
++ entity->weight = entity->orig_weight * |
19465 |
++ (bfqq != NULL ? bfqq->raising_coeff : 1); |
19466 |
++ new_st->wsum += entity->weight; |
19467 |
++ |
19468 |
++ if (new_st != old_st) |
19469 |
++ entity->start = new_st->vtime; |
19470 |
++ } |
19471 |
++ |
19472 |
++ return new_st; |
19473 |
++} |
19474 |
++ |
19475 |
++/** |
19476 |
++ * bfq_bfqq_served - update the scheduler status after selection for service. |
19477 |
++ * @bfqq: the queue being served. |
19478 |
++ * @served: bytes to transfer. |
19479 |
++ * |
19480 |
++ * NOTE: this can be optimized, as the timestamps of upper level entities |
19481 |
++ * are synchronized every time a new bfqq is selected for service. For now, |
19482 |
++ * we keep it to better check consistency. |
19483 |
++ */ |
19484 |
++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) |
19485 |
++{ |
19486 |
++ struct bfq_entity *entity = &bfqq->entity; |
19487 |
++ struct bfq_service_tree *st; |
19488 |
++ |
19489 |
++ for_each_entity(entity) { |
19490 |
++ st = bfq_entity_service_tree(entity); |
19491 |
++ |
19492 |
++ entity->service += served; |
19493 |
++ BUG_ON(entity->service > entity->budget); |
19494 |
++ BUG_ON(st->wsum == 0); |
19495 |
++ |
19496 |
++ st->vtime += bfq_delta(served, st->wsum); |
19497 |
++ bfq_forget_idle(st); |
19498 |
++ } |
19499 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); |
19500 |
++} |
19501 |
++ |
19502 |
++/** |
19503 |
++ * bfq_bfqq_charge_full_budget - set the service to the entity budget. |
19504 |
++ * @bfqq: the queue that needs a service update. |
19505 |
++ * |
19506 |
++ * When it's not possible to be fair in the service domain, because |
19507 |
++ * a queue is not consuming its budget fast enough (the meaning of |
19508 |
++ * fast depends on the timeout parameter), we charge it a full |
19509 |
++ * budget. In this way we should obtain a sort of time-domain |
19510 |
++ * fairness among all the seeky/slow queues. |
19511 |
++ */ |
19512 |
++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) |
19513 |
++{ |
19514 |
++ struct bfq_entity *entity = &bfqq->entity; |
19515 |
++ |
19516 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); |
19517 |
++ |
19518 |
++ bfq_bfqq_served(bfqq, entity->budget - entity->service); |
19519 |
++} |
19520 |
++ |
19521 |
++/** |
19522 |
++ * __bfq_activate_entity - activate an entity. |
19523 |
++ * @entity: the entity being activated. |
19524 |
++ * |
19525 |
++ * Called whenever an entity is activated, i.e., it is not active and one |
19526 |
++ * of its children receives a new request, or has to be reactivated due to |
19527 |
++ * budget exhaustion. It uses the current budget of the entity (and the |
19528 |
++ * service received if @entity is active) of the queue to calculate its |
19529 |
++ * timestamps. |
19530 |
++ */ |
19531 |
++static void __bfq_activate_entity(struct bfq_entity *entity) |
19532 |
++{ |
19533 |
++ struct bfq_sched_data *sd = entity->sched_data; |
19534 |
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
19535 |
++ |
19536 |
++ if (entity == sd->in_service_entity) { |
19537 |
++ BUG_ON(entity->tree != NULL); |
19538 |
++ /* |
19539 |
++ * If we are requeueing the current entity we have |
19540 |
++ * to take care of not charging to it service it has |
19541 |
++ * not received. |
19542 |
++ */ |
19543 |
++ bfq_calc_finish(entity, entity->service); |
19544 |
++ entity->start = entity->finish; |
19545 |
++ sd->in_service_entity = NULL; |
19546 |
++ } else if (entity->tree == &st->active) { |
19547 |
++ /* |
19548 |
++ * Requeueing an entity due to a change of some |
19549 |
++ * next_in_service entity below it. We reuse the |
19550 |
++ * old start time. |
19551 |
++ */ |
19552 |
++ bfq_active_extract(st, entity); |
19553 |
++ } else if (entity->tree == &st->idle) { |
19554 |
++ /* |
19555 |
++ * Must be on the idle tree, bfq_idle_extract() will |
19556 |
++ * check for that. |
19557 |
++ */ |
19558 |
++ bfq_idle_extract(st, entity); |
19559 |
++ entity->start = bfq_gt(st->vtime, entity->finish) ? |
19560 |
++ st->vtime : entity->finish; |
19561 |
++ } else { |
19562 |
++ /* |
19563 |
++ * The finish time of the entity may be invalid, and |
19564 |
++ * it is in the past for sure, otherwise the queue |
19565 |
++ * would have been on the idle tree. |
19566 |
++ */ |
19567 |
++ entity->start = st->vtime; |
19568 |
++ st->wsum += entity->weight; |
19569 |
++ bfq_get_entity(entity); |
19570 |
++ |
19571 |
++ BUG_ON(entity->on_st); |
19572 |
++ entity->on_st = 1; |
19573 |
++ } |
19574 |
++ |
19575 |
++ st = __bfq_entity_update_weight_prio(st, entity); |
19576 |
++ bfq_calc_finish(entity, entity->budget); |
19577 |
++ bfq_active_insert(st, entity); |
19578 |
++} |
19579 |
++ |
19580 |
++/** |
19581 |
++ * bfq_activate_entity - activate an entity and its ancestors if necessary. |
19582 |
++ * @entity: the entity to activate. |
19583 |
++ * |
19584 |
++ * Activate @entity and all the entities on the path from it to the root. |
19585 |
++ */ |
19586 |
++static void bfq_activate_entity(struct bfq_entity *entity) |
19587 |
++{ |
19588 |
++ struct bfq_sched_data *sd; |
19589 |
++ |
19590 |
++ for_each_entity(entity) { |
19591 |
++ __bfq_activate_entity(entity); |
19592 |
++ |
19593 |
++ sd = entity->sched_data; |
19594 |
++ if (!bfq_update_next_in_service(sd)) |
19595 |
++ /* |
19596 |
++ * No need to propagate the activation to the |
19597 |
++ * upper entities, as they will be updated when |
19598 |
++ * the in-service entity is rescheduled. |
19599 |
++ */ |
19600 |
++ break; |
19601 |
++ } |
19602 |
++} |
19603 |
++ |
19604 |
++/** |
19605 |
++ * __bfq_deactivate_entity - deactivate an entity from its service tree. |
19606 |
++ * @entity: the entity to deactivate. |
19607 |
++ * @requeue: if false, the entity will not be put into the idle tree. |
19608 |
++ * |
19609 |
++ * Deactivate an entity, independently from its previous state. If the |
19610 |
++ * entity was not on a service tree just return, otherwise if it is on |
19611 |
++ * any scheduler tree, extract it from that tree, and if necessary |
19612 |
++ * and if the caller did not specify @requeue, put it on the idle tree. |
19613 |
++ * |
19614 |
++ * Return %1 if the caller should update the entity hierarchy, i.e., |
19615 |
++ * if the entity was under service or if it was the next_in_service for |
19616 |
++ * its sched_data; return %0 otherwise. |
19617 |
++ */ |
19618 |
++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
19619 |
++{ |
19620 |
++ struct bfq_sched_data *sd = entity->sched_data; |
19621 |
++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
19622 |
++ int was_in_service = entity == sd->in_service_entity; |
19623 |
++ int ret = 0; |
19624 |
++ |
19625 |
++ if (!entity->on_st) |
19626 |
++ return 0; |
19627 |
++ |
19628 |
++ BUG_ON(was_in_service && entity->tree != NULL); |
19629 |
++ |
19630 |
++ if (was_in_service) { |
19631 |
++ bfq_calc_finish(entity, entity->service); |
19632 |
++ sd->in_service_entity = NULL; |
19633 |
++ } else if (entity->tree == &st->active) |
19634 |
++ bfq_active_extract(st, entity); |
19635 |
++ else if (entity->tree == &st->idle) |
19636 |
++ bfq_idle_extract(st, entity); |
19637 |
++ else if (entity->tree != NULL) |
19638 |
++ BUG(); |
19639 |
++ |
19640 |
++ if (was_in_service || sd->next_in_service == entity) |
19641 |
++ ret = bfq_update_next_in_service(sd); |
19642 |
++ |
19643 |
++ if (!requeue || !bfq_gt(entity->finish, st->vtime)) |
19644 |
++ bfq_forget_entity(st, entity); |
19645 |
++ else |
19646 |
++ bfq_idle_insert(st, entity); |
19647 |
++ |
19648 |
++ BUG_ON(sd->in_service_entity == entity); |
19649 |
++ BUG_ON(sd->next_in_service == entity); |
19650 |
++ |
19651 |
++ return ret; |
19652 |
++} |
19653 |
++ |
19654 |
++/** |
19655 |
++ * bfq_deactivate_entity - deactivate an entity. |
19656 |
++ * @entity: the entity to deactivate. |
19657 |
++ * @requeue: true if the entity can be put on the idle tree |
19658 |
++ */ |
19659 |
++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
19660 |
++{ |
19661 |
++ struct bfq_sched_data *sd; |
19662 |
++ struct bfq_entity *parent; |
19663 |
++ |
19664 |
++ for_each_entity_safe(entity, parent) { |
19665 |
++ sd = entity->sched_data; |
19666 |
++ |
19667 |
++ if (!__bfq_deactivate_entity(entity, requeue)) |
19668 |
++ /* |
19669 |
++ * The parent entity is still backlogged, and |
19670 |
++ * we don't need to update it as it is still |
19671 |
++ * under service. |
19672 |
++ */ |
19673 |
++ break; |
19674 |
++ |
19675 |
++ if (sd->next_in_service != NULL) |
19676 |
++ /* |
19677 |
++ * The parent entity is still backlogged and |
19678 |
++ * the budgets on the path towards the root |
19679 |
++ * need to be updated. |
19680 |
++ */ |
19681 |
++ goto update; |
19682 |
++ |
19683 |
++ /* |
19684 |
++ * If we get here, the parent is no longer backlogged and |
19685 |
++ * we want to propagate the dequeue upwards. |
19686 |
++ */ |
19687 |
++ requeue = 1; |
19688 |
++ } |
19689 |
++ |
19690 |
++ return; |
19691 |
++ |
19692 |
++update: |
19693 |
++ entity = parent; |
19694 |
++ for_each_entity(entity) { |
19695 |
++ __bfq_activate_entity(entity); |
19696 |
++ |
19697 |
++ sd = entity->sched_data; |
19698 |
++ if (!bfq_update_next_in_service(sd)) |
19699 |
++ break; |
19700 |
++ } |
19701 |
++} |
19702 |
++ |
19703 |
++/** |
19704 |
++ * bfq_update_vtime - update vtime if necessary. |
19705 |
++ * @st: the service tree to act upon. |
19706 |
++ * |
19707 |
++ * If necessary update the service tree vtime to have at least one |
19708 |
++ * eligible entity, skipping to its start time. Assumes that the |
19709 |
++ * active tree of the device is not empty. |
19710 |
++ * |
19711 |
++ * NOTE: this hierarchical implementation updates vtimes quite often, |
19712 |
++ * we may end up with reactivated tasks getting timestamps after a |
19713 |
++ * vtime skip done because we needed a ->first_active entity on some |
19714 |
++ * intermediate node. |
19715 |
++ */ |
19716 |
++static void bfq_update_vtime(struct bfq_service_tree *st) |
19717 |
++{ |
19718 |
++ struct bfq_entity *entry; |
19719 |
++ struct rb_node *node = st->active.rb_node; |
19720 |
++ |
19721 |
++ entry = rb_entry(node, struct bfq_entity, rb_node); |
19722 |
++ if (bfq_gt(entry->min_start, st->vtime)) { |
19723 |
++ st->vtime = entry->min_start; |
19724 |
++ bfq_forget_idle(st); |
19725 |
++ } |
19726 |
++} |
19727 |
++ |
19728 |
++/** |
19729 |
++ * bfq_first_active_entity - find the eligible entity with |
19730 |
++ * the smallest finish time |
19731 |
++ * @st: the service tree to select from. |
19732 |
++ * |
19733 |
++ * This function searches the first schedulable entity, starting from the |
19734 |
++ * root of the tree and going on the left every time on this side there is |
19735 |
++ * a subtree with at least one eligible (start >= vtime) entity. The path |
19736 |
++ * on the right is followed only if a) the left subtree contains no eligible |
19737 |
++ * entities and b) no eligible entity has been found yet. |
19738 |
++ */ |
19739 |
++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) |
19740 |
++{ |
19741 |
++ struct bfq_entity *entry, *first = NULL; |
19742 |
++ struct rb_node *node = st->active.rb_node; |
19743 |
++ |
19744 |
++ while (node != NULL) { |
19745 |
++ entry = rb_entry(node, struct bfq_entity, rb_node); |
19746 |
++left: |
19747 |
++ if (!bfq_gt(entry->start, st->vtime)) |
19748 |
++ first = entry; |
19749 |
++ |
19750 |
++ BUG_ON(bfq_gt(entry->min_start, st->vtime)); |
19751 |
++ |
19752 |
++ if (node->rb_left != NULL) { |
19753 |
++ entry = rb_entry(node->rb_left, |
19754 |
++ struct bfq_entity, rb_node); |
19755 |
++ if (!bfq_gt(entry->min_start, st->vtime)) { |
19756 |
++ node = node->rb_left; |
19757 |
++ goto left; |
19758 |
++ } |
19759 |
++ } |
19760 |
++ if (first != NULL) |
19761 |
++ break; |
19762 |
++ node = node->rb_right; |
19763 |
++ } |
19764 |
++ |
19765 |
++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); |
19766 |
++ return first; |
19767 |
++} |
19768 |
++ |
19769 |
++/** |
19770 |
++ * __bfq_lookup_next_entity - return the first eligible entity in @st. |
19771 |
++ * @st: the service tree. |
19772 |
++ * |
19773 |
++ * Update the virtual time in @st and return the first eligible entity |
19774 |
++ * it contains. |
19775 |
++ */ |
19776 |
++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, |
19777 |
++ bool force) |
19778 |
++{ |
19779 |
++ struct bfq_entity *entity, *new_next_in_service = NULL; |
19780 |
++ |
19781 |
++ if (RB_EMPTY_ROOT(&st->active)) |
19782 |
++ return NULL; |
19783 |
++ |
19784 |
++ bfq_update_vtime(st); |
19785 |
++ entity = bfq_first_active_entity(st); |
19786 |
++ BUG_ON(bfq_gt(entity->start, st->vtime)); |
19787 |
++ |
19788 |
++ /* |
19789 |
++ * If the chosen entity does not match with the sched_data's |
19790 |
++ * next_in_service and we are forcedly serving the IDLE priority |
19791 |
++ * class tree, bubble up budget update. |
19792 |
++ */ |
19793 |
++ if (unlikely(force && entity != entity->sched_data->next_in_service)) { |
19794 |
++ new_next_in_service = entity; |
19795 |
++ for_each_entity(new_next_in_service) |
19796 |
++ bfq_update_budget(new_next_in_service); |
19797 |
++ } |
19798 |
++ |
19799 |
++ return entity; |
19800 |
++} |
19801 |
++ |
19802 |
++/** |
19803 |
++ * bfq_lookup_next_entity - return the first eligible entity in @sd. |
19804 |
++ * @sd: the sched_data. |
19805 |
++ * @extract: if true the returned entity will be also extracted from @sd. |
19806 |
++ * |
19807 |
++ * NOTE: since we cache the next_in_service entity at each level of the |
19808 |
++ * hierarchy, the complexity of the lookup can be decreased with |
19809 |
++ * absolutely no effort just returning the cached next_in_service value; |
19810 |
++ * we prefer to do full lookups to test the consistency of the data |
19811 |
++ * structures. |
19812 |
++ */ |
19813 |
++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
19814 |
++ int extract, |
19815 |
++ struct bfq_data *bfqd) |
19816 |
++{ |
19817 |
++ struct bfq_service_tree *st = sd->service_tree; |
19818 |
++ struct bfq_entity *entity; |
19819 |
++ int i = 0; |
19820 |
++ |
19821 |
++ BUG_ON(sd->in_service_entity != NULL); |
19822 |
++ |
19823 |
++ if (bfqd != NULL && |
19824 |
++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { |
19825 |
++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, |
19826 |
++ true); |
19827 |
++ if (entity != NULL) { |
19828 |
++ i = BFQ_IOPRIO_CLASSES - 1; |
19829 |
++ bfqd->bfq_class_idle_last_service = jiffies; |
19830 |
++ sd->next_in_service = entity; |
19831 |
++ } |
19832 |
++ } |
19833 |
++ for (; i < BFQ_IOPRIO_CLASSES; i++) { |
19834 |
++ entity = __bfq_lookup_next_entity(st + i, false); |
19835 |
++ if (entity != NULL) { |
19836 |
++ if (extract) { |
19837 |
++ bfq_check_next_in_service(sd, entity); |
19838 |
++ bfq_active_extract(st + i, entity); |
19839 |
++ sd->in_service_entity = entity; |
19840 |
++ sd->next_in_service = NULL; |
19841 |
++ } |
19842 |
++ break; |
19843 |
++ } |
19844 |
++ } |
19845 |
++ |
19846 |
++ return entity; |
19847 |
++} |
19848 |
++ |
19849 |
++/* |
19850 |
++ * Get next queue for service. |
19851 |
++ */ |
19852 |
++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) |
19853 |
++{ |
19854 |
++ struct bfq_entity *entity = NULL; |
19855 |
++ struct bfq_sched_data *sd; |
19856 |
++ struct bfq_queue *bfqq; |
19857 |
++ |
19858 |
++ BUG_ON(bfqd->in_service_queue != NULL); |
19859 |
++ |
19860 |
++ if (bfqd->busy_queues == 0) |
19861 |
++ return NULL; |
19862 |
++ |
19863 |
++ sd = &bfqd->root_group->sched_data; |
19864 |
++ for (; sd != NULL; sd = entity->my_sched_data) { |
19865 |
++ entity = bfq_lookup_next_entity(sd, 1, bfqd); |
19866 |
++ BUG_ON(entity == NULL); |
19867 |
++ entity->service = 0; |
19868 |
++ } |
19869 |
++ |
19870 |
++ bfqq = bfq_entity_to_bfqq(entity); |
19871 |
++ BUG_ON(bfqq == NULL); |
19872 |
++ |
19873 |
++ return bfqq; |
19874 |
++} |
19875 |
++ |
19876 |
++/* |
19877 |
++ * Forced extraction of the given queue. |
19878 |
++ */ |
19879 |
++static void bfq_get_next_queue_forced(struct bfq_data *bfqd, |
19880 |
++ struct bfq_queue *bfqq) |
19881 |
++{ |
19882 |
++ struct bfq_entity *entity; |
19883 |
++ struct bfq_sched_data *sd; |
19884 |
++ |
19885 |
++ BUG_ON(bfqd->in_service_queue != NULL); |
19886 |
++ |
19887 |
++ entity = &bfqq->entity; |
19888 |
++ /* |
19889 |
++ * Bubble up extraction/update from the leaf to the root. |
19890 |
++ */ |
19891 |
++ for_each_entity(entity) { |
19892 |
++ sd = entity->sched_data; |
19893 |
++ bfq_update_budget(entity); |
19894 |
++ bfq_update_vtime(bfq_entity_service_tree(entity)); |
19895 |
++ bfq_active_extract(bfq_entity_service_tree(entity), entity); |
19896 |
++ sd->active_entity = entity; |
19897 |
++ sd->next_active = NULL; |
19898 |
++ entity->service = 0; |
19899 |
++ } |
19900 |
++ |
19901 |
++ return; |
19902 |
++} |
19903 |
++ |
19904 |
++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) |
19905 |
++{ |
19906 |
++ if (bfqd->in_service_bic != NULL) { |
19907 |
++ put_io_context(bfqd->in_service_bic->icq.ioc); |
19908 |
++ bfqd->in_service_bic = NULL; |
19909 |
++ } |
19910 |
++ |
19911 |
++ bfqd->in_service_queue = NULL; |
19912 |
++ del_timer(&bfqd->idle_slice_timer); |
19913 |
++} |
19914 |
++ |
19915 |
++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
19916 |
++ int requeue) |
19917 |
++{ |
19918 |
++ struct bfq_entity *entity = &bfqq->entity; |
19919 |
++ |
19920 |
++ if (bfqq == bfqd->in_service_queue) |
19921 |
++ __bfq_bfqd_reset_in_service(bfqd); |
19922 |
++ |
19923 |
++ bfq_deactivate_entity(entity, requeue); |
19924 |
++} |
19925 |
++ |
19926 |
++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
19927 |
++{ |
19928 |
++ struct bfq_entity *entity = &bfqq->entity; |
19929 |
++ |
19930 |
++ bfq_activate_entity(entity); |
19931 |
++} |
19932 |
++ |
19933 |
++/* |
19934 |
++ * Called when the bfqq no longer has requests pending, remove it from |
19935 |
++ * the service tree. |
19936 |
++ */ |
19937 |
++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
19938 |
++ int requeue) |
19939 |
++{ |
19940 |
++ BUG_ON(!bfq_bfqq_busy(bfqq)); |
19941 |
++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); |
19942 |
++ |
19943 |
++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); |
19944 |
++ |
19945 |
++ bfq_clear_bfqq_busy(bfqq); |
19946 |
++ |
19947 |
++ BUG_ON(bfqd->busy_queues == 0); |
19948 |
++ bfqd->busy_queues--; |
19949 |
++ if (bfqq->raising_coeff > 1) |
19950 |
++ bfqd->raised_busy_queues--; |
19951 |
++ |
19952 |
++ bfq_deactivate_bfqq(bfqd, bfqq, requeue); |
19953 |
++} |
19954 |
++ |
19955 |
++/* |
19956 |
++ * Called when an inactive queue receives a new request. |
19957 |
++ */ |
19958 |
++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
19959 |
++{ |
19960 |
++ BUG_ON(bfq_bfqq_busy(bfqq)); |
19961 |
++ BUG_ON(bfqq == bfqd->in_service_queue); |
19962 |
++ |
19963 |
++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); |
19964 |
++ |
19965 |
++ bfq_activate_bfqq(bfqd, bfqq); |
19966 |
++ |
19967 |
++ bfq_mark_bfqq_busy(bfqq); |
19968 |
++ bfqd->busy_queues++; |
19969 |
++ if (bfqq->raising_coeff > 1) |
19970 |
++ bfqd->raised_busy_queues++; |
19971 |
++} |
19972 |
+diff --git a/block/bfq.h b/block/bfq.h |
19973 |
+new file mode 100644 |
19974 |
+index 0000000..f9b5881 |
19975 |
+--- /dev/null |
19976 |
++++ b/block/bfq.h |
19977 |
+@@ -0,0 +1,614 @@ |
19978 |
++/* |
19979 |
++ * BFQ-v7r1 for 3.13.0: data structures and common functions prototypes. |
19980 |
++ * |
19981 |
++ * Based on ideas and code from CFQ: |
19982 |
++ * Copyright (C) 2003 Jens Axboe <axboe@××××××.dk> |
19983 |
++ * |
19984 |
++ * Copyright (C) 2008 Fabio Checconi <fabio@×××××××××××××.it> |
19985 |
++ * Paolo Valente <paolo.valente@×××××××.it> |
19986 |
++ * |
19987 |
++ * Copyright (C) 2010 Paolo Valente <paolo.valente@×××××××.it> |
19988 |
++ */ |
19989 |
++ |
19990 |
++#ifndef _BFQ_H |
19991 |
++#define _BFQ_H |
19992 |
++ |
19993 |
++#include <linux/blktrace_api.h> |
19994 |
++#include <linux/hrtimer.h> |
19995 |
++#include <linux/ioprio.h> |
19996 |
++#include <linux/rbtree.h> |
19997 |
++ |
19998 |
++#define BFQ_IOPRIO_CLASSES 3 |
19999 |
++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) |
20000 |
++ |
20001 |
++#define BFQ_MIN_WEIGHT 1 |
20002 |
++#define BFQ_MAX_WEIGHT 1000 |
20003 |
++ |
20004 |
++#define BFQ_DEFAULT_GRP_WEIGHT 10 |
20005 |
++#define BFQ_DEFAULT_GRP_IOPRIO 0 |
20006 |
++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE |
20007 |
++ |
20008 |
++struct bfq_entity; |
20009 |
++ |
20010 |
++/** |
20011 |
++ * struct bfq_service_tree - per ioprio_class service tree. |
20012 |
++ * @active: tree for active entities (i.e., those backlogged). |
20013 |
++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). |
20014 |
++ * @first_idle: idle entity with minimum F_i. |
20015 |
++ * @last_idle: idle entity with maximum F_i. |
20016 |
++ * @vtime: scheduler virtual time. |
20017 |
++ * @wsum: scheduler weight sum; active and idle entities contribute to it. |
20018 |
++ * |
20019 |
++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each |
20020 |
++ * ioprio_class has its own independent scheduler, and so its own |
20021 |
++ * bfq_service_tree. All the fields are protected by the queue lock |
20022 |
++ * of the containing bfqd. |
20023 |
++ */ |
20024 |
++struct bfq_service_tree { |
20025 |
++ struct rb_root active; |
20026 |
++ struct rb_root idle; |
20027 |
++ |
20028 |
++ struct bfq_entity *first_idle; |
20029 |
++ struct bfq_entity *last_idle; |
20030 |
++ |
20031 |
++ u64 vtime; |
20032 |
++ unsigned long wsum; |
20033 |
++}; |
20034 |
++ |
20035 |
++/** |
20036 |
++ * struct bfq_sched_data - multi-class scheduler. |
20037 |
++ * @in_service_entity: entity under service. |
20038 |
++ * @next_in_service: head-of-the-line entity in the scheduler. |
20039 |
++ * @service_tree: array of service trees, one per ioprio_class. |
20040 |
++ * |
20041 |
++ * bfq_sched_data is the basic scheduler queue. It supports three |
20042 |
++ * ioprio_classes, and can be used either as a toplevel queue or as |
20043 |
++ * an intermediate queue on a hierarchical setup. |
20044 |
++ * @next_in_service points to the active entity of the sched_data |
20045 |
++ * service trees that will be scheduled next. |
20046 |
++ * |
20047 |
++ * The supported ioprio_classes are the same as in CFQ, in descending |
20048 |
++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. |
20049 |
++ * Requests from higher priority queues are served before all the |
20050 |
++ * requests from lower priority queues; among requests of the same |
20051 |
++ * queue requests are served according to B-WF2Q+. |
20052 |
++ * All the fields are protected by the queue lock of the containing bfqd. |
20053 |
++ */ |
20054 |
++struct bfq_sched_data { |
20055 |
++ struct bfq_entity *in_service_entity; |
20056 |
++ struct bfq_entity *next_in_service; |
20057 |
++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; |
20058 |
++}; |
20059 |
++ |
20060 |
++/** |
20061 |
++ * struct bfq_entity - schedulable entity. |
20062 |
++ * @rb_node: service_tree member. |
20063 |
++ * @on_st: flag, true if the entity is on a tree (either the active or |
20064 |
++ * the idle one of its service_tree). |
20065 |
++ * @finish: B-WF2Q+ finish timestamp (aka F_i). |
20066 |
++ * @start: B-WF2Q+ start timestamp (aka S_i). |
20067 |
++ * @tree: tree the entity is enqueued into; %NULL if not on a tree. |
20068 |
++ * @min_start: minimum start time of the (active) subtree rooted at |
20069 |
++ * this entity; used for O(log N) lookups into active trees. |
20070 |
++ * @service: service received during the last round of service. |
20071 |
++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. |
20072 |
++ * @weight: weight of the queue |
20073 |
++ * @parent: parent entity, for hierarchical scheduling. |
20074 |
++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the |
20075 |
++ * associated scheduler queue, %NULL on leaf nodes. |
20076 |
++ * @sched_data: the scheduler queue this entity belongs to. |
20077 |
++ * @ioprio: the ioprio in use. |
20078 |
++ * @new_weight: when a weight change is requested, the new weight value. |
20079 |
++ * @orig_weight: original weight, used to implement weight boosting |
20080 |
++ * @new_ioprio: when an ioprio change is requested, the new ioprio value. |
20081 |
++ * @ioprio_class: the ioprio_class in use. |
20082 |
++ * @new_ioprio_class: when an ioprio_class change is requested, the new |
20083 |
++ * ioprio_class value. |
20084 |
++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or |
20085 |
++ * ioprio_class change. |
20086 |
++ * |
20087 |
++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the |
20088 |
++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each |
20089 |
++ * entity belongs to the sched_data of the parent group in the cgroup |
20090 |
++ * hierarchy. Non-leaf entities have also their own sched_data, stored |
20091 |
++ * in @my_sched_data. |
20092 |
++ * |
20093 |
++ * Each entity stores independently its priority values; this would |
20094 |
++ * allow different weights on different devices, but this |
20095 |
++ * functionality is not exported to userspace by now. Priorities and |
20096 |
++ * weights are updated lazily, first storing the new values into the |
20097 |
++ * new_* fields, then setting the @ioprio_changed flag. As soon as |
20098 |
++ * there is a transition in the entity state that allows the priority |
20099 |
++ * update to take place the effective and the requested priority |
20100 |
++ * values are synchronized. |
20101 |
++ * |
20102 |
++ * Unless cgroups are used, the weight value is calculated from the |
20103 |
++ * ioprio to export the same interface as CFQ. When dealing with |
20104 |
++ * ``well-behaved'' queues (i.e., queues that do not spend too much |
20105 |
++ * time to consume their budget and have true sequential behavior, and |
20106 |
++ * when there are no external factors breaking anticipation) the |
20107 |
++ * relative weights at each level of the cgroups hierarchy should be |
20108 |
++ * guaranteed. All the fields are protected by the queue lock of the |
20109 |
++ * containing bfqd. |
20110 |
++ */ |
20111 |
++struct bfq_entity { |
20112 |
++ struct rb_node rb_node; |
20113 |
++ |
20114 |
++ int on_st; |
20115 |
++ |
20116 |
++ u64 finish; |
20117 |
++ u64 start; |
20118 |
++ |
20119 |
++ struct rb_root *tree; |
20120 |
++ |
20121 |
++ u64 min_start; |
20122 |
++ |
20123 |
++ unsigned long service, budget; |
20124 |
++ unsigned short weight, new_weight; |
20125 |
++ unsigned short orig_weight; |
20126 |
++ |
20127 |
++ struct bfq_entity *parent; |
20128 |
++ |
20129 |
++ struct bfq_sched_data *my_sched_data; |
20130 |
++ struct bfq_sched_data *sched_data; |
20131 |
++ |
20132 |
++ unsigned short ioprio, new_ioprio; |
20133 |
++ unsigned short ioprio_class, new_ioprio_class; |
20134 |
++ |
20135 |
++ int ioprio_changed; |
20136 |
++}; |
20137 |
++ |
20138 |
++struct bfq_group; |
20139 |
++ |
20140 |
++/** |
20141 |
++ * struct bfq_queue - leaf schedulable entity. |
20142 |
++ * @ref: reference counter. |
20143 |
++ * @bfqd: parent bfq_data. |
20144 |
++ * @new_bfqq: shared bfq_queue if queue is cooperating with |
20145 |
++ * one or more other queues. |
20146 |
++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). |
20147 |
++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). |
20148 |
++ * @sort_list: sorted list of pending requests. |
20149 |
++ * @next_rq: if fifo isn't expired, next request to serve. |
20150 |
++ * @queued: nr of requests queued in @sort_list. |
20151 |
++ * @allocated: currently allocated requests. |
20152 |
++ * @meta_pending: pending metadata requests. |
20153 |
++ * @fifo: fifo list of requests in sort_list. |
20154 |
++ * @entity: entity representing this queue in the scheduler. |
20155 |
++ * @max_budget: maximum budget allowed from the feedback mechanism. |
20156 |
++ * @budget_timeout: budget expiration (in jiffies). |
20157 |
++ * @dispatched: number of requests on the dispatch list or inside driver. |
20158 |
++ * @org_ioprio: saved ioprio during boosted periods. |
20159 |
++ * @flags: status flags. |
20160 |
++ * @bfqq_list: node for active/idle bfqq list inside our bfqd. |
20161 |
++ * @seek_samples: number of seeks sampled |
20162 |
++ * @seek_total: sum of the distances of the seeks sampled |
20163 |
++ * @seek_mean: mean seek distance |
20164 |
++ * @last_request_pos: position of the last request enqueued |
20165 |
++ * @pid: pid of the process owning the queue, used for logging purposes. |
20166 |
++ * @last_rais_start_time: last (idle -> weight-raised) transition attempt |
20167 |
++ * @raising_cur_max_time: current max raising time for this queue |
20168 |
++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from |
20169 |
++ * idle to backlogged |
20170 |
++ * @service_from_backlogged: cumulative service received from the @bfq_queue |
20171 |
++ * since the last transition from idle to backlogged |
20172 |
++ * |
20173 |
++ * A bfq_queue is a leaf request queue; it can be associated to an io_context |
20174 |
++ * or more (if it is an async one). @cgroup holds a reference to the |
20175 |
++ * cgroup, to be sure that it does not disappear while a bfqq still |
20176 |
++ * references it (mostly to avoid races between request issuing and task |
20177 |
++ * migration followed by cgroup destruction). |
20178 |
++ * All the fields are protected by the queue lock of the containing bfqd. |
20179 |
++ */ |
20180 |
++struct bfq_queue { |
20181 |
++ atomic_t ref; |
20182 |
++ struct bfq_data *bfqd; |
20183 |
++ |
20184 |
++ /* fields for cooperating queues handling */ |
20185 |
++ struct bfq_queue *new_bfqq; |
20186 |
++ struct rb_node pos_node; |
20187 |
++ struct rb_root *pos_root; |
20188 |
++ |
20189 |
++ struct rb_root sort_list; |
20190 |
++ struct request *next_rq; |
20191 |
++ int queued[2]; |
20192 |
++ int allocated[2]; |
20193 |
++ int meta_pending; |
20194 |
++ struct list_head fifo; |
20195 |
++ |
20196 |
++ struct bfq_entity entity; |
20197 |
++ |
20198 |
++ unsigned long max_budget; |
20199 |
++ unsigned long budget_timeout; |
20200 |
++ |
20201 |
++ int dispatched; |
20202 |
++ |
20203 |
++ unsigned short org_ioprio; |
20204 |
++ |
20205 |
++ unsigned int flags; |
20206 |
++ |
20207 |
++ struct list_head bfqq_list; |
20208 |
++ |
20209 |
++ unsigned int seek_samples; |
20210 |
++ u64 seek_total; |
20211 |
++ sector_t seek_mean; |
20212 |
++ sector_t last_request_pos; |
20213 |
++ |
20214 |
++ pid_t pid; |
20215 |
++ |
20216 |
++ /* weight-raising fields */ |
20217 |
++ unsigned long raising_cur_max_time; |
20218 |
++ unsigned long soft_rt_next_start; |
20219 |
++ unsigned long last_rais_start_finish; |
20220 |
++ unsigned int raising_coeff; |
20221 |
++ unsigned long last_idle_bklogged; |
20222 |
++ unsigned long service_from_backlogged; |
20223 |
++}; |
20224 |
++ |
20225 |
++/** |
20226 |
++ * struct bfq_ttime - per process thinktime stats. |
20227 |
++ * @ttime_total: total process thinktime |
20228 |
++ * @ttime_samples: number of thinktime samples |
20229 |
++ * @ttime_mean: average process thinktime |
20230 |
++ */ |
20231 |
++struct bfq_ttime { |
20232 |
++ unsigned long last_end_request; |
20233 |
++ |
20234 |
++ unsigned long ttime_total; |
20235 |
++ unsigned long ttime_samples; |
20236 |
++ unsigned long ttime_mean; |
20237 |
++}; |
20238 |
++ |
20239 |
++/** |
20240 |
++ * struct bfq_io_cq - per (request_queue, io_context) structure. |
20241 |
++ * @icq: associated io_cq structure |
20242 |
++ * @bfqq: array of two process queues, the sync and the async |
20243 |
++ * @ttime: associated @bfq_ttime struct |
20244 |
++ */ |
20245 |
++struct bfq_io_cq { |
20246 |
++ struct io_cq icq; /* must be the first member */ |
20247 |
++ struct bfq_queue *bfqq[2]; |
20248 |
++ struct bfq_ttime ttime; |
20249 |
++ int ioprio; |
20250 |
++}; |
20251 |
++ |
20252 |
++/** |
20253 |
++ * struct bfq_data - per device data structure. |
20254 |
++ * @queue: request queue for the managed device. |
20255 |
++ * @root_group: root bfq_group for the device. |
20256 |
++ * @rq_pos_tree: rbtree sorted by next_request position, |
20257 |
++ * used when determining if two or more queues |
20258 |
++ * have interleaving requests (see bfq_close_cooperator). |
20259 |
++ * @busy_queues: number of bfq_queues containing requests (including the |
20260 |
++ * queue under service, even if it is idling). |
20261 |
++ * @raised_busy_queues: number of weight-raised busy bfq_queues. |
20262 |
++ * @queued: number of queued requests. |
20263 |
++ * @rq_in_driver: number of requests dispatched and waiting for completion. |
20264 |
++ * @sync_flight: number of sync requests in the driver. |
20265 |
++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples |
20266 |
++ * completed requests. |
20267 |
++ * @hw_tag_samples: nr of samples used to calculate hw_tag. |
20268 |
++ * @hw_tag: flag set to one if the driver is showing a queueing behavior. |
20269 |
++ * @budgets_assigned: number of budgets assigned. |
20270 |
++ * @idle_slice_timer: timer set when idling for the next sequential request |
20271 |
++ * from the queue under service. |
20272 |
++ * @unplug_work: delayed work to restart dispatching on the request queue. |
20273 |
++ * @in_service_queue: bfq_queue under service. |
20274 |
++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. |
20275 |
++ * @last_position: on-disk position of the last served request. |
20276 |
++ * @last_budget_start: beginning of the last budget. |
20277 |
++ * @last_idling_start: beginning of the last idle slice. |
20278 |
++ * @peak_rate: peak transfer rate observed for a budget. |
20279 |
++ * @peak_rate_samples: number of samples used to calculate @peak_rate. |
20280 |
++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. |
20281 |
++ * @group_list: list of all the bfq_groups active on the device. |
20282 |
++ * @active_list: list of all the bfq_queues active on the device. |
20283 |
++ * @idle_list: list of all the bfq_queues idle on the device. |
20284 |
++ * @bfq_quantum: max number of requests dispatched per dispatch round. |
20285 |
++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires |
20286 |
++ * requests are served in fifo order. |
20287 |
++ * @bfq_back_penalty: weight of backward seeks wrt forward ones. |
20288 |
++ * @bfq_back_max: maximum allowed backward seek. |
20289 |
++ * @bfq_slice_idle: maximum idling time. |
20290 |
++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). |
20291 |
++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to |
20292 |
++ * async queues. |
20293 |
++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to |
20294 |
++ * prevent seeky queues from imposing long latencies on well |
20295 |
++ * behaved ones (this also implies that seeky queues cannot |
20296 |
++ * receive guarantees in the service domain; after a timeout |
20297 |
++ * they are charged for the whole allocated budget, to try |
20298 |
++ * to preserve a behavior reasonably fair among them, but |
20299 |
++ * without service-domain guarantees). |
20300 |
++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted |
20301 |
++ * queue is multiplied |
20302 |
++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) |
20303 |
++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes |
20304 |
++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising |
20305 |
++ * may be reactivated for a queue (in jiffies) |
20306 |
++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals |
20307 |
++ * after which weight-raising may be |
20308 |
++ * reactivated for an already busy queue |
20309 |
++ * (in jiffies) |
20310 |
++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, |
20311 |
++ * sectors per second |
20312 |
++ * @RT_prod: cached value of the product R*T used for computing the maximum |
20313 |
++ * duration of the weight raising automatically |
20314 |
++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions |
20315 |
++ * |
20316 |
++ * All the fields are protected by the @queue lock. |
20317 |
++ */ |
20318 |
++struct bfq_data { |
20319 |
++ struct request_queue *queue; |
20320 |
++ |
20321 |
++ struct bfq_group *root_group; |
20322 |
++ |
20323 |
++ struct rb_root rq_pos_tree; |
20324 |
++ |
20325 |
++ int busy_queues; |
20326 |
++ int raised_busy_queues; |
20327 |
++ int queued; |
20328 |
++ int rq_in_driver; |
20329 |
++ int sync_flight; |
20330 |
++ |
20331 |
++ int max_rq_in_driver; |
20332 |
++ int hw_tag_samples; |
20333 |
++ int hw_tag; |
20334 |
++ |
20335 |
++ int budgets_assigned; |
20336 |
++ |
20337 |
++ struct timer_list idle_slice_timer; |
20338 |
++ struct work_struct unplug_work; |
20339 |
++ |
20340 |
++ struct bfq_queue *in_service_queue; |
20341 |
++ struct bfq_io_cq *in_service_bic; |
20342 |
++ |
20343 |
++ sector_t last_position; |
20344 |
++ |
20345 |
++ ktime_t last_budget_start; |
20346 |
++ ktime_t last_idling_start; |
20347 |
++ int peak_rate_samples; |
20348 |
++ u64 peak_rate; |
20349 |
++ unsigned long bfq_max_budget; |
20350 |
++ |
20351 |
++ struct hlist_head group_list; |
20352 |
++ struct list_head active_list; |
20353 |
++ struct list_head idle_list; |
20354 |
++ |
20355 |
++ unsigned int bfq_quantum; |
20356 |
++ unsigned int bfq_fifo_expire[2]; |
20357 |
++ unsigned int bfq_back_penalty; |
20358 |
++ unsigned int bfq_back_max; |
20359 |
++ unsigned int bfq_slice_idle; |
20360 |
++ u64 bfq_class_idle_last_service; |
20361 |
++ |
20362 |
++ unsigned int bfq_user_max_budget; |
20363 |
++ unsigned int bfq_max_budget_async_rq; |
20364 |
++ unsigned int bfq_timeout[2]; |
20365 |
++ |
20366 |
++ bool low_latency; |
20367 |
++ |
20368 |
++ /* parameters of the low_latency heuristics */ |
20369 |
++ unsigned int bfq_raising_coeff; |
20370 |
++ unsigned int bfq_raising_max_time; |
20371 |
++ unsigned int bfq_raising_rt_max_time; |
20372 |
++ unsigned int bfq_raising_min_idle_time; |
20373 |
++ unsigned long bfq_raising_min_inter_arr_async; |
20374 |
++ unsigned int bfq_raising_max_softrt_rate; |
20375 |
++ u64 RT_prod; |
20376 |
++ |
20377 |
++ struct bfq_queue oom_bfqq; |
20378 |
++}; |
20379 |
++ |
20380 |
++enum bfqq_state_flags { |
20381 |
++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ |
20382 |
++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ |
20383 |
++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ |
20384 |
++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ |
20385 |
++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ |
20386 |
++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ |
20387 |
++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ |
20388 |
++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ |
20389 |
++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ |
20390 |
++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ |
20391 |
++ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */ |
20392 |
++}; |
20393 |
++ |
20394 |
++#define BFQ_BFQQ_FNS(name) \ |
20395 |
++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ |
20396 |
++{ \ |
20397 |
++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ |
20398 |
++} \ |
20399 |
++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ |
20400 |
++{ \ |
20401 |
++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ |
20402 |
++} \ |
20403 |
++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ |
20404 |
++{ \ |
20405 |
++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ |
20406 |
++} |
20407 |
++ |
20408 |
++BFQ_BFQQ_FNS(busy); |
20409 |
++BFQ_BFQQ_FNS(wait_request); |
20410 |
++BFQ_BFQQ_FNS(must_alloc); |
20411 |
++BFQ_BFQQ_FNS(fifo_expire); |
20412 |
++BFQ_BFQQ_FNS(idle_window); |
20413 |
++BFQ_BFQQ_FNS(prio_changed); |
20414 |
++BFQ_BFQQ_FNS(sync); |
20415 |
++BFQ_BFQQ_FNS(budget_new); |
20416 |
++BFQ_BFQQ_FNS(coop); |
20417 |
++BFQ_BFQQ_FNS(split_coop); |
20418 |
++BFQ_BFQQ_FNS(softrt_update); |
20419 |
++#undef BFQ_BFQQ_FNS |
20420 |
++ |
20421 |
++/* Logging facilities. */ |
20422 |
++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ |
20423 |
++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) |
20424 |
++ |
20425 |
++#define bfq_log(bfqd, fmt, args...) \ |
20426 |
++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) |
20427 |
++ |
20428 |
++/* Expiration reasons. */ |
20429 |
++enum bfqq_expiration { |
20430 |
++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ |
20431 |
++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ |
20432 |
++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ |
20433 |
++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ |
20434 |
++}; |
20435 |
++ |
20436 |
++#ifdef CONFIG_CGROUP_BFQIO |
20437 |
++/** |
20438 |
++ * struct bfq_group - per (device, cgroup) data structure. |
20439 |
++ * @entity: schedulable entity to insert into the parent group sched_data. |
20440 |
++ * @sched_data: own sched_data, to contain child entities (they may be |
20441 |
++ * both bfq_queues and bfq_groups). |
20442 |
++ * @group_node: node to be inserted into the bfqio_cgroup->group_data |
20443 |
++ * list of the containing cgroup's bfqio_cgroup. |
20444 |
++ * @bfqd_node: node to be inserted into the @bfqd->group_list list |
20445 |
++ * of the groups active on the same device; used for cleanup. |
20446 |
++ * @bfqd: the bfq_data for the device this group acts upon. |
20447 |
++ * @async_bfqq: array of async queues for all the tasks belonging to |
20448 |
++ * the group, one queue per ioprio value per ioprio_class, |
20449 |
++ * except for the idle class that has only one queue. |
20450 |
++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). |
20451 |
++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used |
20452 |
++ * to avoid too many special cases during group creation/migration. |
20453 |
++ * |
20454 |
++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup |
20455 |
++ * there is a set of bfq_groups, each one collecting the lower-level |
20456 |
++ * entities belonging to the group that are acting on the same device. |
20457 |
++ * |
20458 |
++ * Locking works as follows: |
20459 |
++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed |
20460 |
++ * via RCU from its readers. |
20461 |
++ * o @bfqd is protected by the queue lock, RCU is used to access it |
20462 |
++ * from the readers. |
20463 |
++ * o All the other fields are protected by the @bfqd queue lock. |
20464 |
++ */ |
20465 |
++struct bfq_group { |
20466 |
++ struct bfq_entity entity; |
20467 |
++ struct bfq_sched_data sched_data; |
20468 |
++ |
20469 |
++ struct hlist_node group_node; |
20470 |
++ struct hlist_node bfqd_node; |
20471 |
++ |
20472 |
++ void *bfqd; |
20473 |
++ |
20474 |
++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
20475 |
++ struct bfq_queue *async_idle_bfqq; |
20476 |
++ |
20477 |
++ struct bfq_entity *my_entity; |
20478 |
++}; |
20479 |
++ |
20480 |
++/** |
20481 |
++ * struct bfqio_cgroup - bfq cgroup data structure. |
20482 |
++ * @css: subsystem state for bfq in the containing cgroup. |
20483 |
++ * @online: flag marked when the subsystem is inserted. |
20484 |
++ * @weight: cgroup weight. |
20485 |
++ * @ioprio: cgroup ioprio. |
20486 |
++ * @ioprio_class: cgroup ioprio_class. |
20487 |
++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. |
20488 |
++ * @group_data: list containing the bfq_group belonging to this cgroup. |
20489 |
++ * |
20490 |
++ * @group_data is accessed using RCU, with @lock protecting the updates, |
20491 |
++ * @ioprio and @ioprio_class are protected by @lock. |
20492 |
++ */ |
20493 |
++struct bfqio_cgroup { |
20494 |
++ struct cgroup_subsys_state css; |
20495 |
++ bool online; |
20496 |
++ |
20497 |
++ unsigned short weight, ioprio, ioprio_class; |
20498 |
++ |
20499 |
++ spinlock_t lock; |
20500 |
++ struct hlist_head group_data; |
20501 |
++}; |
20502 |
++#else |
20503 |
++struct bfq_group { |
20504 |
++ struct bfq_sched_data sched_data; |
20505 |
++ |
20506 |
++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
20507 |
++ struct bfq_queue *async_idle_bfqq; |
20508 |
++}; |
20509 |
++#endif |
20510 |
++ |
20511 |
++static inline struct bfq_service_tree * |
20512 |
++bfq_entity_service_tree(struct bfq_entity *entity) |
20513 |
++{ |
20514 |
++ struct bfq_sched_data *sched_data = entity->sched_data; |
20515 |
++ unsigned int idx = entity->ioprio_class - 1; |
20516 |
++ |
20517 |
++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); |
20518 |
++ BUG_ON(sched_data == NULL); |
20519 |
++ |
20520 |
++ return sched_data->service_tree + idx; |
20521 |
++} |
20522 |
++ |
20523 |
++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, |
20524 |
++ int is_sync) |
20525 |
++{ |
20526 |
++ return bic->bfqq[!!is_sync]; |
20527 |
++} |
20528 |
++ |
20529 |
++static inline void bic_set_bfqq(struct bfq_io_cq *bic, |
20530 |
++ struct bfq_queue *bfqq, int is_sync) |
20531 |
++{ |
20532 |
++ bic->bfqq[!!is_sync] = bfqq; |
20533 |
++} |
20534 |
++ |
20535 |
++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) |
20536 |
++{ |
20537 |
++ return bic->icq.q->elevator->elevator_data; |
20538 |
++} |
20539 |
++ |
20540 |
++/** |
20541 |
++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. |
20542 |
++ * @ptr: a pointer to a bfqd. |
20543 |
++ * @flags: storage for the flags to be saved. |
20544 |
++ * |
20545 |
++ * This function allows bfqg->bfqd to be protected by the |
20546 |
++ * queue lock of the bfqd they reference; the pointer is dereferenced |
20547 |
++ * under RCU, so the storage for bfqd is assured to be safe as long |
20548 |
++ * as the RCU read side critical section does not end. After the |
20549 |
++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be |
20550 |
++ * sure that no other writer accessed it. If we raced with a writer, |
20551 |
++ * the function returns NULL, with the queue unlocked, otherwise it |
20552 |
++ * returns the dereferenced pointer, with the queue locked. |
20553 |
++ */ |
20554 |
++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, |
20555 |
++ unsigned long *flags) |
20556 |
++{ |
20557 |
++ struct bfq_data *bfqd; |
20558 |
++ |
20559 |
++ rcu_read_lock(); |
20560 |
++ bfqd = rcu_dereference(*(struct bfq_data **)ptr); |
20561 |
++ |
20562 |
++ if (bfqd != NULL) { |
20563 |
++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); |
20564 |
++ if (*ptr == bfqd) |
20565 |
++ goto out; |
20566 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
20567 |
++ } |
20568 |
++ |
20569 |
++ bfqd = NULL; |
20570 |
++out: |
20571 |
++ rcu_read_unlock(); |
20572 |
++ return bfqd; |
20573 |
++} |
20574 |
++ |
20575 |
++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, |
20576 |
++ unsigned long *flags) |
20577 |
++{ |
20578 |
++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); |
20579 |
++} |
20580 |
++ |
20581 |
++static void bfq_changed_ioprio(struct bfq_io_cq *bic); |
20582 |
++static void bfq_put_queue(struct bfq_queue *bfqq); |
20583 |
++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); |
20584 |
++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
20585 |
++ struct bfq_group *bfqg, int is_sync, |
20586 |
++ struct bfq_io_cq *bic, gfp_t gfp_mask); |
20587 |
++static void bfq_end_raising_async_queues(struct bfq_data *bfqd, |
20588 |
++ struct bfq_group *bfqg); |
20589 |
++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); |
20590 |
++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
20591 |
++#endif |
20592 |
+-- |
20593 |
+1.8.5.2 |
20594 |
+ |
20595 |
|
20596 |
Added: genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch |
20597 |
=================================================================== |
20598 |
--- genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch (rev 0) |
20599 |
+++ genpatches-2.6/trunk/3.14/5000_BFQ-3-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r1-for-3.13.0.patch 2014-02-07 15:42:35 UTC (rev 2666) |
20600 |
@@ -0,0 +1,1034 @@ |
20601 |
+From 39b1dba58b2562ba0d93a33a4f9af662d3c790c5 Mon Sep 17 00:00:00 2001 |
20602 |
+From: Mauro Andreolini <mauro.andreolini@×××××××.it> |
20603 |
+Date: Thu, 23 Jan 2014 16:54:44 +0100 |
20604 |
+Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r1 for |
20605 |
+ 3.13.0 |
20606 |
+ |
20607 |
+A set of processes may happen to perform interleaved reads, i.e., requests |
20608 |
+whose union would give rise to a sequential read pattern. There are two |
20609 |
+typical cases: in the first case, processes read fixed-size chunks of |
20610 |
+data at a fixed distance from each other, while in the second case processes |
20611 |
+may read variable-size chunks at variable distances. The latter case occurs |
20612 |
+for example with KVM, which splits the I/O generated by the guest into |
20613 |
+multiple chunks, and lets these chunks be served by a pool of cooperating |
20614 |
+processes, iteratively assigning the next chunk of I/O to the first |
20615 |
+available process. CFQ uses actual queue merging for the first type of |
20616 |
+processes, whereas it uses preemption to get a sequential read pattern out |
20617 |
+of the read requests performed by the second type of processes. In the end |
20618 |
+it uses two different mechanisms to achieve the same goal: boosting the |
20619 |
+throughput with interleaved I/O. |
20620 |
+ |
20621 |
+This patch introduces Early Queue Merge (EQM), a unified mechanism to get a |
20622 |
+sequential read pattern with both types of processes. The main idea is |
20623 |
+checking newly arrived requests against the next request of the active queue |
20624 |
+both in case of actual request insert and in case of request merge. By doing |
20625 |
+so, both the types of processes can be handled by just merging their queues. |
20626 |
+EQM is then simpler and more compact than the pair of mechanisms used in |
20627 |
+CFQ. |
20628 |
+ |
20629 |
+Finally, EQM also preserves the typical low-latency properties of BFQ, by |
20630 |
+properly restoring the weight-raising state of a queue when it gets back to |
20631 |
+a non-merged state. |
20632 |
+ |
20633 |
+Signed-off-by: Mauro Andreolini <mauro.andreolini@×××××××.it> |
20634 |
+Signed-off-by: Arianna Avanzini <avanzini.arianna@×××××.com> |
20635 |
+Reviewed-by: Paolo Valente <paolo.valente@×××××××.it> |
20636 |
+--- |
20637 |
+ block/bfq-iosched.c | 657 ++++++++++++++++++++++++++++++++++++---------------- |
20638 |
+ block/bfq-sched.c | 28 --- |
20639 |
+ block/bfq.h | 16 ++ |
20640 |
+ 3 files changed, 474 insertions(+), 227 deletions(-) |
20641 |
+ |
20642 |
+diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c |
20643 |
+index eb760de..06ee844 100644 |
20644 |
+--- a/block/bfq-iosched.c |
20645 |
++++ b/block/bfq-iosched.c |
20646 |
+@@ -445,6 +445,46 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) |
20647 |
+ return dur; |
20648 |
+ } |
20649 |
+ |
20650 |
++static inline void |
20651 |
++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
20652 |
++{ |
20653 |
++ if (bic->saved_idle_window) |
20654 |
++ bfq_mark_bfqq_idle_window(bfqq); |
20655 |
++ else |
20656 |
++ bfq_clear_bfqq_idle_window(bfqq); |
20657 |
++ if (bic->raising_time_left && bfqq->bfqd->low_latency) { |
20658 |
++ /* |
20659 |
++ * Start a weight raising period with the duration given by |
20660 |
++ * the raising_time_left snapshot. |
20661 |
++ */ |
20662 |
++ if (bfq_bfqq_busy(bfqq)) |
20663 |
++ bfqq->bfqd->raised_busy_queues++; |
20664 |
++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff; |
20665 |
++ bfqq->raising_cur_max_time = bic->raising_time_left; |
20666 |
++ bfqq->last_rais_start_finish = jiffies; |
20667 |
++ bfqq->entity.ioprio_changed = 1; |
20668 |
++ } |
20669 |
++ /* |
20670 |
++ * Clear raising_time_left to prevent bfq_bfqq_save_state() from |
20671 |
++ * getting confused about the queue's need of a weight-raising |
20672 |
++ * period. |
20673 |
++ */ |
20674 |
++ bic->raising_time_left = 0; |
20675 |
++} |
20676 |
++ |
20677 |
++/* |
20678 |
++ * Must be called with the queue_lock held. |
20679 |
++ */ |
20680 |
++static int bfqq_process_refs(struct bfq_queue *bfqq) |
20681 |
++{ |
20682 |
++ int process_refs, io_refs; |
20683 |
++ |
20684 |
++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; |
20685 |
++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; |
20686 |
++ BUG_ON(process_refs < 0); |
20687 |
++ return process_refs; |
20688 |
++} |
20689 |
++ |
20690 |
+ static void bfq_add_rq_rb(struct request *rq) |
20691 |
+ { |
20692 |
+ struct bfq_queue *bfqq = RQ_BFQQ(rq); |
20693 |
+@@ -486,12 +526,20 @@ static void bfq_add_rq_rb(struct request *rq) |
20694 |
+ if (!bfqd->low_latency) |
20695 |
+ goto add_bfqq_busy; |
20696 |
+ |
20697 |
++ if (bfq_bfqq_just_split(bfqq)) |
20698 |
++ goto set_ioprio_changed; |
20699 |
++ |
20700 |
+ /* |
20701 |
+- * If the queue is not being boosted and has been idle |
20702 |
+- * for enough time, start a weight-raising period |
20703 |
++ * If the queue: |
20704 |
++ * - is not being boosted, |
20705 |
++ * - has been idle for enough time, |
20706 |
++ * - is not a sync queue or is linked to a bfq_io_cq (it is |
20707 |
++ * shared "for its nature" or it is not shared and its |
20708 |
++ * requests have not been redirected to a shared queue) |
20709 |
++ * start a weight-raising period. |
20710 |
+ */ |
20711 |
+- if (old_raising_coeff == 1 && |
20712 |
+- (idle_for_long_time || soft_rt)) { |
20713 |
++ if (old_raising_coeff == 1 && (idle_for_long_time || soft_rt) && |
20714 |
++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { |
20715 |
+ bfqq->raising_coeff = bfqd->bfq_raising_coeff; |
20716 |
+ if (idle_for_long_time) |
20717 |
+ bfqq->raising_cur_max_time = |
20718 |
+@@ -574,6 +622,7 @@ static void bfq_add_rq_rb(struct request *rq) |
20719 |
+ bfqd->bfq_raising_rt_max_time; |
20720 |
+ } |
20721 |
+ } |
20722 |
++set_ioprio_changed: |
20723 |
+ if (old_raising_coeff != bfqq->raising_coeff) |
20724 |
+ entity->ioprio_changed = 1; |
20725 |
+ add_bfqq_busy: |
20726 |
+@@ -756,90 +805,35 @@ static void bfq_end_raising(struct bfq_data *bfqd) |
20727 |
+ spin_unlock_irq(bfqd->queue->queue_lock); |
20728 |
+ } |
20729 |
+ |
20730 |
+-static int bfq_allow_merge(struct request_queue *q, struct request *rq, |
20731 |
+- struct bio *bio) |
20732 |
+-{ |
20733 |
+- struct bfq_data *bfqd = q->elevator->elevator_data; |
20734 |
+- struct bfq_io_cq *bic; |
20735 |
+- struct bfq_queue *bfqq; |
20736 |
+- |
20737 |
+- /* |
20738 |
+- * Disallow merge of a sync bio into an async request. |
20739 |
+- */ |
20740 |
+- if (bfq_bio_sync(bio) && !rq_is_sync(rq)) |
20741 |
+- return 0; |
20742 |
+- |
20743 |
+- /* |
20744 |
+- * Lookup the bfqq that this bio will be queued with. Allow |
20745 |
+- * merge only if rq is queued there. |
20746 |
+- * Queue lock is held here. |
20747 |
+- */ |
20748 |
+- bic = bfq_bic_lookup(bfqd, current->io_context); |
20749 |
+- if (bic == NULL) |
20750 |
+- return 0; |
20751 |
+- |
20752 |
+- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); |
20753 |
+- return bfqq == RQ_BFQQ(rq); |
20754 |
+-} |
20755 |
+- |
20756 |
+-static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
20757 |
+- struct bfq_queue *bfqq) |
20758 |
+-{ |
20759 |
+- if (bfqq != NULL) { |
20760 |
+- bfq_mark_bfqq_must_alloc(bfqq); |
20761 |
+- bfq_mark_bfqq_budget_new(bfqq); |
20762 |
+- bfq_clear_bfqq_fifo_expire(bfqq); |
20763 |
+- |
20764 |
+- bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; |
20765 |
+- |
20766 |
+- bfq_log_bfqq(bfqd, bfqq, |
20767 |
+- "set_in_service_queue, cur-budget = %lu", |
20768 |
+- bfqq->entity.budget); |
20769 |
+- } |
20770 |
+- |
20771 |
+- bfqd->in_service_queue = bfqq; |
20772 |
+-} |
20773 |
+- |
20774 |
+-/* |
20775 |
+- * Get and set a new queue for service. |
20776 |
+- */ |
20777 |
+-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd, |
20778 |
+- struct bfq_queue *bfqq) |
20779 |
++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) |
20780 |
+ { |
20781 |
+- if (!bfqq) |
20782 |
+- bfqq = bfq_get_next_queue(bfqd); |
20783 |
++ if (request) |
20784 |
++ return blk_rq_pos(io_struct); |
20785 |
+ else |
20786 |
+- bfq_get_next_queue_forced(bfqd, bfqq); |
20787 |
+- |
20788 |
+- __bfq_set_in_service_queue(bfqd, bfqq); |
20789 |
+- return bfqq; |
20790 |
++ return ((struct bio *)io_struct)->bi_sector; |
20791 |
+ } |
20792 |
+ |
20793 |
+-static inline sector_t bfq_dist_from_last(struct bfq_data *bfqd, |
20794 |
+- struct request *rq) |
20795 |
++static inline sector_t bfq_dist_from(sector_t pos1, |
20796 |
++ sector_t pos2) |
20797 |
+ { |
20798 |
+- if (blk_rq_pos(rq) >= bfqd->last_position) |
20799 |
+- return blk_rq_pos(rq) - bfqd->last_position; |
20800 |
++ if (pos1 >= pos2) |
20801 |
++ return pos1 - pos2; |
20802 |
+ else |
20803 |
+- return bfqd->last_position - blk_rq_pos(rq); |
20804 |
++ return pos2 - pos1; |
20805 |
+ } |
20806 |
+ |
20807 |
+-/* |
20808 |
+- * Return true if bfqq has no request pending and rq is close enough to |
20809 |
+- * bfqd->last_position, or if rq is closer to bfqd->last_position than |
20810 |
+- * bfqq->next_rq |
20811 |
+- */ |
20812 |
+-static inline int bfq_rq_close(struct bfq_data *bfqd, struct request *rq) |
20813 |
++static inline int bfq_rq_close_to_sector(void *io_struct, bool request, |
20814 |
++ sector_t sector) |
20815 |
+ { |
20816 |
+- return bfq_dist_from_last(bfqd, rq) <= BFQQ_SEEK_THR; |
20817 |
++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= |
20818 |
++ BFQQ_SEEK_THR; |
20819 |
+ } |
20820 |
+ |
20821 |
+-static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
20822 |
++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) |
20823 |
+ { |
20824 |
+ struct rb_root *root = &bfqd->rq_pos_tree; |
20825 |
+ struct rb_node *parent, *node; |
20826 |
+ struct bfq_queue *__bfqq; |
20827 |
+- sector_t sector = bfqd->last_position; |
20828 |
+ |
20829 |
+ if (RB_EMPTY_ROOT(root)) |
20830 |
+ return NULL; |
20831 |
+@@ -858,7 +852,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
20832 |
+ * position). |
20833 |
+ */ |
20834 |
+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); |
20835 |
+- if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
20836 |
++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) |
20837 |
+ return __bfqq; |
20838 |
+ |
20839 |
+ if (blk_rq_pos(__bfqq->next_rq) < sector) |
20840 |
+@@ -869,7 +863,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
20841 |
+ return NULL; |
20842 |
+ |
20843 |
+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); |
20844 |
+- if (bfq_rq_close(bfqd, __bfqq->next_rq)) |
20845 |
++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) |
20846 |
+ return __bfqq; |
20847 |
+ |
20848 |
+ return NULL; |
20849 |
+@@ -878,14 +872,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) |
20850 |
+ /* |
20851 |
+ * bfqd - obvious |
20852 |
+ * cur_bfqq - passed in so that we don't decide that the current queue |
20853 |
+- * is closely cooperating with itself. |
20854 |
+- * |
20855 |
+- * We are assuming that cur_bfqq has dispatched at least one request, |
20856 |
+- * and that bfqd->last_position reflects a position on the disk associated |
20857 |
+- * with the I/O issued by cur_bfqq. |
20858 |
++ * is closely cooperating with itself |
20859 |
++ * sector - used as a reference point to search for a close queue |
20860 |
+ */ |
20861 |
+ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
20862 |
+- struct bfq_queue *cur_bfqq) |
20863 |
++ struct bfq_queue *cur_bfqq, |
20864 |
++ sector_t sector) |
20865 |
+ { |
20866 |
+ struct bfq_queue *bfqq; |
20867 |
+ |
20868 |
+@@ -905,7 +897,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
20869 |
+ * working closely on the same area of the disk. In that case, |
20870 |
+ * we can group them together and don't waste time idling. |
20871 |
+ */ |
20872 |
+- bfqq = bfqq_close(bfqd); |
20873 |
++ bfqq = bfqq_close(bfqd, sector); |
20874 |
+ if (bfqq == NULL || bfqq == cur_bfqq) |
20875 |
+ return NULL; |
20876 |
+ |
20877 |
+@@ -932,6 +924,282 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, |
20878 |
+ return bfqq; |
20879 |
+ } |
20880 |
+ |
20881 |
++static struct bfq_queue * |
20882 |
++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) |
20883 |
++{ |
20884 |
++ int process_refs, new_process_refs; |
20885 |
++ struct bfq_queue *__bfqq; |
20886 |
++ |
20887 |
++ /* |
20888 |
++ * If there are no process references on the new_bfqq, then it is |
20889 |
++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain |
20890 |
++ * may have dropped their last reference (not just their last process |
20891 |
++ * reference). |
20892 |
++ */ |
20893 |
++ if (!bfqq_process_refs(new_bfqq)) |
20894 |
++ return NULL; |
20895 |
++ |
20896 |
++ /* Avoid a circular list and skip interim queue merges. */ |
20897 |
++ while ((__bfqq = new_bfqq->new_bfqq)) { |
20898 |
++ if (__bfqq == bfqq) |
20899 |
++ return NULL; |
20900 |
++ new_bfqq = __bfqq; |
20901 |
++ } |
20902 |
++ |
20903 |
++ process_refs = bfqq_process_refs(bfqq); |
20904 |
++ new_process_refs = bfqq_process_refs(new_bfqq); |
20905 |
++ /* |
20906 |
++ * If the process for the bfqq has gone away, there is no |
20907 |
++ * sense in merging the queues. |
20908 |
++ */ |
20909 |
++ if (process_refs == 0 || new_process_refs == 0) |
20910 |
++ return NULL; |
20911 |
++ |
20912 |
++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", |
20913 |
++ new_bfqq->pid); |
20914 |
++ |
20915 |
++ /* |
20916 |
++ * Merging is just a redirection: the requests of the process owning |
20917 |
++ * one of the two queues are redirected to the other queue. The latter |
20918 |
++ * queue, in its turn, is set as shared if this is the first time that |
20919 |
++ * the requests of some process are redirected to it. |
20920 |
++ * |
20921 |
++ * We redirect bfqq to new_bfqq and not the opposite, because we |
20922 |
++ * are in the context of the process owning bfqq, hence we have the |
20923 |
++ * io_cq of this process. So we can immediately configure this io_cq |
20924 |
++ * to redirect the requests of the process to new_bfqq. |
20925 |
++ * |
20926 |
++ * NOTE, even if new_bfqq coincides with the in-service queue, the |
20927 |
++ * io_cq of new_bfqq is not available, because, if the in-service queue |
20928 |
++ * is shared, bfqd->in_service_bic may not point to the io_cq of the |
20929 |
++ * in-service queue. |
20930 |
++ * Redirecting the requests of the process owning bfqq to the currently |
20931 |
++ * in-service queue is in any case the best option, as we feed the |
20932 |
++ * in-service queue with new requests close to the last request served |
20933 |
++ * and, by doing so, hopefully increase the throughput. |
20934 |
++ */ |
20935 |
++ bfqq->new_bfqq = new_bfqq; |
20936 |
++ atomic_add(process_refs, &new_bfqq->ref); |
20937 |
++ return new_bfqq; |
20938 |
++} |
20939 |
++ |
20940 |
++/*
++ * Attempt to schedule a merge of bfqq with the currently in-service queue or
++ * with a close queue among the scheduled queues.
++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
++ * structure otherwise.
++ */
++static struct bfq_queue *
++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
++ void *io_struct, bool request)
++{
++ struct bfq_queue *in_service_bfqq, *new_bfqq;
++
++ if (bfqq->new_bfqq)
++ return bfqq->new_bfqq;
++
++ if (!io_struct)
++ return NULL;
++
++ in_service_bfqq = bfqd->in_service_queue;
++
++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
++ !bfqd->in_service_bic)
++ goto check_scheduled;
++
++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
++ goto check_scheduled;
++
++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
++ goto check_scheduled;
++
++ if (in_service_bfqq->entity.parent != bfqq->entity.parent)
++ goto check_scheduled;
++
++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
++ if (new_bfqq != NULL)
++ return new_bfqq; /* Merge with the in-service queue */
++ }
++
++ /*
++ * Check whether there is a cooperator among currently scheduled
++ * queues. The only thing we need is that the bio/request is not
++ * NULL, as we need it to establish whether a cooperator exists.
++ */
++check_scheduled:
++ new_bfqq = bfq_close_cooperator(bfqd, bfqq,
++ bfq_io_struct_pos(io_struct, request));
++ if (new_bfqq)
++ return bfq_setup_merge(bfqq, new_bfqq);
++
++ return NULL;
++}
++
++static inline void
++bfq_bfqq_save_state(struct bfq_queue *bfqq)
++{
++ /*
++ * If bfqq->bic == NULL, the queue is already shared or its requests
++ * have already been redirected to a shared queue; both idle window
++ * and weight raising state have already been saved. Do nothing.
++ */
++ if (bfqq->bic == NULL)
++ return;
++ if (bfqq->bic->raising_time_left)
++ /*
++ * This is the queue of a just-started process, and would
++ * deserve weight raising: we set raising_time_left to the full
++ * weight-raising duration to trigger weight-raising when and
++ * if the queue is split and the first request of the queue
++ * is enqueued.
++ */
++ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd);
++ else if (bfqq->raising_coeff > 1) {
++ unsigned long wrais_duration =
++ jiffies - bfqq->last_rais_start_finish;
++ /*
++ * It may happen that a queue's weight raising period lasts
++ * longer than its raising_cur_max_time, as weight raising is
++ * handled only when a request is enqueued or dispatched (it
++ * does not use any timer). If the weight raising period is
++ * about to end, don't save it.
++ */
++ if (bfqq->raising_cur_max_time <= wrais_duration)
++ bfqq->bic->raising_time_left = 0;
++ else
++ bfqq->bic->raising_time_left =
++ bfqq->raising_cur_max_time - wrais_duration;
++ /*
++ * The bfq_queue is becoming shared or the requests of the
++ * process owning the queue are being redirected to a shared
++ * queue. Stop the weight raising period of the queue, as in
++ * both cases it should not be owned by an interactive or soft
++ * real-time application.
++ */
++ bfq_bfqq_end_raising(bfqq);
++ } else
++ bfqq->bic->raising_time_left = 0;
++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
++}
++
++static inline void
++bfq_get_bic_reference(struct bfq_queue *bfqq)
++{
++ /*
++ * If bfqq->bic has a non-NULL value, the bic to which it belongs
++ * is about to begin using a shared bfq_queue.
++ */
++ if (bfqq->bic)
++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
++}
++
++static void
++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
++{
++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
++ (long unsigned)new_bfqq->pid);
++ /* Save weight raising and idle window of the merged queues */
++ bfq_bfqq_save_state(bfqq);
++ bfq_bfqq_save_state(new_bfqq);
++ /*
++ * Grab a reference to the bic, to prevent it from being destroyed
++ * before being possibly touched by a bfq_split_bfqq().
++ */
++ bfq_get_bic_reference(bfqq);
++ bfq_get_bic_reference(new_bfqq);
++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */
++ bic_set_bfqq(bic, new_bfqq, 1);
++ bfq_mark_bfqq_coop(new_bfqq);
++ /*
++ * new_bfqq now belongs to at least two bics (it is a shared queue): set
++ * new_bfqq->bic to NULL. bfqq either:
++ * - does not belong to any bic any more, and hence bfqq->bic must
++ * be set to NULL, or
++ * - is a queue whose owning bics have already been redirected to a
++ * different queue, hence the queue is destined to not belong to any
++ * bic soon and bfqq->bic is already NULL (therefore the next
++ * assignment causes no harm).
++ */
++ new_bfqq->bic = NULL;
++ bfqq->bic = NULL;
++ bfq_put_queue(bfqq);
++}
++
++static int bfq_allow_merge(struct request_queue *q, struct request *rq,
++ struct bio *bio)
++{
++ struct bfq_data *bfqd = q->elevator->elevator_data;
++ struct bfq_io_cq *bic;
++ struct bfq_queue *bfqq, *new_bfqq;
++
++ /*
++ * Disallow merge of a sync bio into an async request.
++ */
++ if (bfq_bio_sync(bio) && !rq_is_sync(rq))
++ return 0;
++
++ /*
++ * Lookup the bfqq that this bio will be queued with. Allow
++ * merge only if rq is queued there.
++ * Queue lock is held here.
++ */
++ bic = bfq_bic_lookup(bfqd, current->io_context);
++ if (bic == NULL)
++ return 0;
++
++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
++ /*
++ * We take advantage of this function to perform an early merge
++ * of the queues of possible cooperating processes.
++ */
++ if (bfqq != NULL) {
++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
++ if (new_bfqq != NULL) {
++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
++ /*
++ * If we get here, the bio will be queued in the shared queue,
++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and
++ * rq can be merged.
++ */
++ bfqq = new_bfqq;
++ }
++ }
++
++ return bfqq == RQ_BFQQ(rq);
++}
++
++static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
++ struct bfq_queue *bfqq)
++{
++ if (bfqq != NULL) {
++ bfq_mark_bfqq_must_alloc(bfqq);
++ bfq_mark_bfqq_budget_new(bfqq);
++ bfq_clear_bfqq_fifo_expire(bfqq);
++
++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
++
++ bfq_log_bfqq(bfqd, bfqq,
++ "set_in_service_queue, cur-budget = %lu",
++ bfqq->entity.budget);
++ }
++
++ bfqd->in_service_queue = bfqq;
++}
++
++/*
++ * Get and set a new queue for service.
++ */
++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
++{
++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
++
++ __bfq_set_in_service_queue(bfqd, bfqq);
++ return bfqq;
++}
++
+ /*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+@@ -1079,63 +1347,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
+ return rq;
+ }
+
+-/*
+- * Must be called with the queue_lock held.
+- */
+-static int bfqq_process_refs(struct bfq_queue *bfqq)
+-{
+- int process_refs, io_refs;
+-
+- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
+- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
+- BUG_ON(process_refs < 0);
+- return process_refs;
+-}
+-
+-static void bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+-{
+- int process_refs, new_process_refs;
+- struct bfq_queue *__bfqq;
+-
+- /*
+- * If there are no process references on the new_bfqq, then it is
+- * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
+- * may have dropped their last reference (not just their last process
+- * reference).
+- */
+- if (!bfqq_process_refs(new_bfqq))
+- return;
+-
+- /* Avoid a circular list and skip interim queue merges. */
+- while ((__bfqq = new_bfqq->new_bfqq)) {
+- if (__bfqq == bfqq)
+- return;
+- new_bfqq = __bfqq;
+- }
+-
+- process_refs = bfqq_process_refs(bfqq);
+- new_process_refs = bfqq_process_refs(new_bfqq);
+- /*
+- * If the process for the bfqq has gone away, there is no
+- * sense in merging the queues.
+- */
+- if (process_refs == 0 || new_process_refs == 0)
+- return;
+-
+- /*
+- * Merge in the direction of the lesser amount of work.
+- */
+- if (new_process_refs >= process_refs) {
+- bfqq->new_bfqq = new_bfqq;
+- atomic_add(process_refs, &new_bfqq->ref);
+- } else {
+- new_bfqq->new_bfqq = bfqq;
+- atomic_add(new_process_refs, &bfqq->ref);
+- }
+- bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
+- new_bfqq->pid);
+-}
+-
+ static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+ {
+ struct bfq_entity *entity = &bfqq->entity;
+@@ -1729,7 +1940,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+ */
+ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+ {
+- struct bfq_queue *bfqq, *new_bfqq = NULL;
++ struct bfq_queue *bfqq;
+ struct request *next_rq;
+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+
+@@ -1739,17 +1950,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+
+- /*
+- * If another queue has a request waiting within our mean seek
+- * distance, let it run. The expire code will check for close
+- * cooperators and put the close queue at the front of the
+- * service tree. If possible, merge the expiring queue with the
+- * new bfqq.
+- */
+- new_bfqq = bfq_close_cooperator(bfqd, bfqq);
+- if (new_bfqq != NULL && bfqq->new_bfqq == NULL)
+- bfq_setup_merge(bfqq, new_bfqq);
+-
+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
+ !timer_pending(&bfqd->idle_slice_timer) &&
+ !bfq_bfqq_must_idle(bfqq))
+@@ -1786,36 +1986,26 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+ bfq_clear_bfqq_wait_request(bfqq);
+ del_timer(&bfqd->idle_slice_timer);
+ }
+- if (new_bfqq == NULL)
+- goto keep_queue;
+- else
+- goto expire;
++ goto keep_queue;
+ }
+ }
+
+ /*
+- * No requests pending. If the in-service queue has no cooperator and
+- * still has requests in flight (possibly waiting for a completion)
+- * or is idling for a new request, then keep it.
++ * No requests pending. If the in-service queue still has requests in
++ * flight (possibly waiting for a completion) or is idling for a new
++ * request, then keep it.
+ */
+- if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) ||
+- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) {
++ if (timer_pending(&bfqd->idle_slice_timer) ||
++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
+ bfqq = NULL;
+ goto keep_queue;
+- } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) {
+- /*
+- * Expiring the queue because there is a close cooperator,
+- * cancel timer.
+- */
+- bfq_clear_bfqq_wait_request(bfqq);
+- del_timer(&bfqd->idle_slice_timer);
+ }
+
+ reason = BFQ_BFQQ_NO_MORE_REQUESTS;
+ expire:
+ bfq_bfqq_expire(bfqd, bfqq, 0, reason);
+ new_queue:
+- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq);
++ bfqq = bfq_set_in_service_queue(bfqd);
+ bfq_log(bfqd, "select_queue: new queue %d returned",
+ bfqq != NULL ? bfqq->pid : 0);
+ keep_queue:
+@@ -1825,9 +2015,8 @@ keep_queue:
+ static void bfq_update_raising_data(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+ {
++ struct bfq_entity *entity = &bfqq->entity;
+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */
+- struct bfq_entity *entity = &bfqq->entity;
+-
+ bfq_log_bfqq(bfqd, bfqq,
+ "raising period dur %u/%u msec, "
+ "old raising coeff %u, w %d(%d)",
+@@ -1844,7 +2033,7 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
+ "WARN: pending prio change");
+ /*
+ * If too much time has elapsed from the beginning
+- * of this weight-raising, stop it.
++ * of this weight-raising period, stop it.
+ */
+ if (time_is_before_jiffies(bfqq->last_rais_start_finish +
+ bfqq->raising_cur_max_time)) {
+@@ -1856,11 +2045,13 @@ static void bfq_update_raising_data(struct bfq_data *bfqd,
+ jiffies_to_msecs(bfqq->
+ raising_cur_max_time));
+ bfq_bfqq_end_raising(bfqq);
+- __bfq_entity_update_weight_prio(
+- bfq_entity_service_tree(entity),
+- entity);
+ }
+ }
++ /* Update weight both if it must be raised and if it must be lowered */
++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1))
++ __bfq_entity_update_weight_prio(
++ bfq_entity_service_tree(entity),
++ entity);
+ }
+
+ /*
+@@ -2101,6 +2292,25 @@ static void bfq_init_icq(struct io_cq *icq)
+ struct bfq_io_cq *bic = icq_to_bic(icq);
+
+ bic->ttime.last_end_request = jiffies;
++ /*
++ * A newly created bic indicates that the process has just
++ * started doing I/O, and is probably mapping into memory its
++ * executable and libraries: it definitely needs weight raising.
++ * There is however the possibility that the process performs,
++ * for a while, I/O close to some other process. EQM intercepts
++ * this behavior and may merge the queue corresponding to the
++ * process with some other queue, BEFORE the weight of the queue
++ * is raised. Merged queues are not weight-raised (they are assumed
++ * to belong to processes that benefit only from high throughput).
++ * If the merge is basically the consequence of an accident, then
++ * the queue will be split soon and will get back its old weight.
++ * It is then important to write down somewhere that this queue
++ * does need weight raising, even if it did not make it to get its
++ * weight raised before being merged. To this purpose, we overload
++ * the field raising_time_left and assign 1 to it, to mark the queue
++ * as needing weight raising.
++ */
++ bic->raising_time_left = 1;
+ }
+
+ static void bfq_exit_icq(struct io_cq *icq)
+@@ -2114,6 +2324,13 @@ static void bfq_exit_icq(struct io_cq *icq)
+ }
+
+ if (bic->bfqq[BLK_RW_SYNC]) {
++ /*
++ * If the bic is using a shared queue, put the reference
++ * taken on the io_context when the bic started using a
++ * shared bfq_queue.
++ */
++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
++ put_io_context(icq->ioc);
+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
+ bic->bfqq[BLK_RW_SYNC] = NULL;
+ }
+@@ -2405,6 +2622,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
+ return;
+
++ /* Idle window just restored, statistics are meaningless. */
++ if (bfq_bfqq_just_split(bfqq))
++ return;
++
+ enable_idle = bfq_bfqq_idle_window(bfqq);
+
+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
+@@ -2445,6 +2666,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+ !BFQQ_SEEKY(bfqq))
+ bfq_update_idle_window(bfqd, bfqq, bic);
++ bfq_clear_bfqq_just_split(bfqq);
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
+@@ -2505,13 +2727,48 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ static void bfq_insert_request(struct request_queue *q, struct request *rq)
+ {
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+- struct bfq_queue *bfqq = RQ_BFQQ(rq);
++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
+
+ assert_spin_locked(bfqd->queue->queue_lock);
++
++ /*
++ * An unplug may trigger a requeue of a request from the device
++ * driver: make sure we are in process context while trying to
++ * merge two bfq_queues.
++ */
++ if (!in_interrupt()) {
++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
++ if (new_bfqq != NULL) {
++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
++ /*
++ * Release the request's reference to the old bfqq
++ * and make sure one is taken to the shared queue.
++ */
++ new_bfqq->allocated[rq_data_dir(rq)]++;
++ bfqq->allocated[rq_data_dir(rq)]--;
++ atomic_inc(&new_bfqq->ref);
++ bfq_put_queue(bfqq);
++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
++ bfqq, new_bfqq);
++ rq->elv.priv[1] = new_bfqq;
++ bfqq = new_bfqq;
++ }
++ }
++
+ bfq_init_prio_data(bfqq, RQ_BIC(rq));
+
+ bfq_add_rq_rb(rq);
+
++ /*
++ * Here a newly-created bfq_queue has already started a weight-raising
++ * period: clear raising_time_left to prevent bfq_bfqq_save_state()
++ * from assigning it a full weight-raising period. See the detailed
++ * comments about this field in bfq_init_icq().
++ */
++ if (bfqq->bic != NULL)
++ bfqq->bic->raising_time_left = 0;
+ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
+ list_add_tail(&rq->queuelist, &bfqq->fifo);
+
+@@ -2659,18 +2916,6 @@ static void bfq_put_request(struct request *rq)
+ }
+ }
+
+-static struct bfq_queue *
+-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+- struct bfq_queue *bfqq)
+-{
+- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
+- (long unsigned)bfqq->new_bfqq->pid);
+- bic_set_bfqq(bic, bfqq->new_bfqq, 1);
+- bfq_mark_bfqq_coop(bfqq->new_bfqq);
+- bfq_put_queue(bfqq);
+- return bic_to_bfqq(bic, 1);
+-}
+-
+ /*
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
+ * was the last process referring to said bfqq.
+@@ -2679,6 +2924,9 @@ static struct bfq_queue *
+ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
+ {
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
++
++ put_io_context(bic->icq.ioc);
++
+ if (bfqq_process_refs(bfqq) == 1) {
+ bfqq->pid = current->pid;
+ bfq_clear_bfqq_coop(bfqq);
+@@ -2707,6 +2955,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
+ struct bfq_queue *bfqq;
+ struct bfq_group *bfqg;
+ unsigned long flags;
++ bool split = false;
+
+ might_sleep_if(gfp_mask & __GFP_WAIT);
+
+@@ -2725,24 +2974,14 @@ new_queue:
+ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask);
+ bic_set_bfqq(bic, bfqq, is_sync);
+ } else {
+- /*
+- * If the queue was seeky for too long, break it apart.
+- */
++ /* If the queue was seeky for too long, break it apart. */
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+ bfqq = bfq_split_bfqq(bic, bfqq);
++ split = true;
+ if (!bfqq)
+ goto new_queue;
+ }
+-
+- /*
+- * Check to see if this queue is scheduled to merge with
+- * another closely cooperating queue. The merging of queues
+- * happens here as it must be done in process context.
+- * The reference on new_bfqq was taken in merge_bfqqs.
+- */
+- if (bfqq->new_bfqq != NULL)
+- bfqq = bfq_merge_bfqqs(bfqd, bic, bfqq);
+ }
+
+ bfqq->allocated[rw]++;
+@@ -2753,6 +2992,26 @@ new_queue:
+ rq->elv.priv[0] = bic;
+ rq->elv.priv[1] = bfqq;
+
++ /*
++ * If a bfq_queue has only one process reference, it is owned
++ * by only one bfq_io_cq: we can set the bic field of the
++ * bfq_queue to the address of that structure. Also, if the
++ * queue has just been split, mark a flag so that the
++ * information is available to the other scheduler hooks.
++ */
++ if (bfqq_process_refs(bfqq) == 1) {
++ bfqq->bic = bic;
++ if (split) {
++ bfq_mark_bfqq_just_split(bfqq);
++ /*
++ * If the queue has just been split from a shared queue,
++ * restore the idle window and the possible weight
++ * raising period.
++ */
++ bfq_bfqq_resume_state(bfqq, bic);
++ }
++ }
++
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return 0;
+diff --git a/block/bfq-sched.c b/block/bfq-sched.c
+index 999b475..e54ea33 100644
+--- a/block/bfq-sched.c
++++ b/block/bfq-sched.c
+@@ -980,34 +980,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+ return bfqq;
+ }
+
+-/*
+- * Forced extraction of the given queue.
+- */
+-static void bfq_get_next_queue_forced(struct bfq_data *bfqd,
+- struct bfq_queue *bfqq)
+-{
+- struct bfq_entity *entity;
+- struct bfq_sched_data *sd;
+-
+- BUG_ON(bfqd->in_service_queue != NULL);
+-
+- entity = &bfqq->entity;
+- /*
+- * Bubble up extraction/update from the leaf to the root.
+- */
+- for_each_entity(entity) {
+- sd = entity->sched_data;
+- bfq_update_budget(entity);
+- bfq_update_vtime(bfq_entity_service_tree(entity));
+- bfq_active_extract(bfq_entity_service_tree(entity), entity);
+- sd->active_entity = entity;
+- sd->next_active = NULL;
+- entity->service = 0;
+- }
+-
+- return;
+-}
+-
+ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+ {
+ if (bfqd->in_service_bic != NULL) {
+diff --git a/block/bfq.h b/block/bfq.h
+index f9b5881..0bfad40 100644
+--- a/block/bfq.h
++++ b/block/bfq.h
+@@ -192,6 +192,8 @@ struct bfq_group;
+ * idle to backlogged
+ * @service_from_backlogged: cumulative service received from the @bfq_queue
+ * since the last transition from idle to backlogged
++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
++ * queue is shared
+ *
+ * A bfq_queue is a leaf request queue; it can be associated to an io_context
+ * or more (if it is an async one). @cgroup holds a reference to the
+@@ -235,6 +237,7 @@ struct bfq_queue {
+ sector_t last_request_pos;
+
+ pid_t pid;
++ struct bfq_io_cq *bic;
+
+ /* weight-raising fields */
+ unsigned long raising_cur_max_time;
+@@ -264,12 +267,23 @@ struct bfq_ttime {
+ * @icq: associated io_cq structure
+ * @bfqq: array of two process queues, the sync and the async
+ * @ttime: associated @bfq_ttime struct
++ * @raising_time_left: snapshot of the time left before weight raising ends
++ * for the sync queue associated to this process; this
++ * snapshot is taken to remember this value while the weight
++ * raising is suspended because the queue is merged with a
++ * shared queue, and is used to set @raising_cur_max_time
++ * when the queue is split from the shared queue and its
++ * weight is raised again
++ * @saved_idle_window: same purpose as the previous field for the idle window
+ */
+ struct bfq_io_cq {
+ struct io_cq icq; /* must be the first member */
+ struct bfq_queue *bfqq[2];
+ struct bfq_ttime ttime;
+ int ioprio;
++
++ unsigned int raising_time_left;
++ unsigned int saved_idle_window;
+ };
+
+ /**
+@@ -411,6 +425,7 @@ enum bfqq_state_flags {
+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */
+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */
+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */
++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */
+ BFQ_BFQQ_FLAG_softrt_update, /* needs softrt-next-start update */
+ };
+
+@@ -438,6 +453,7 @@ BFQ_BFQQ_FNS(sync);
+ BFQ_BFQQ_FNS(budget_new);
+ BFQ_BFQQ_FNS(coop);
+ BFQ_BFQQ_FNS(split_coop);
++BFQ_BFQQ_FNS(just_split);
+ BFQ_BFQQ_FNS(softrt_update);
+ #undef BFQ_BFQQ_FNS
+
+--
+1.8.5.2
+