1 |
commit: 1029f9c624e3f3bf252f20197f357cca00a20410 |
2 |
Author: Göktürk Yüksek <gokturk <AT> gentoo <DOT> org> |
3 |
AuthorDate: Thu Dec 26 01:37:23 2019 +0000 |
4 |
Commit: Göktürk Yüksek <gokturk <AT> gentoo <DOT> org> |
5 |
CommitDate: Thu Dec 26 01:37:23 2019 +0000 |
6 |
URL: https://gitweb.gentoo.org/proj/devmanual.git/commit/?id=1029f9c6 |
7 |
|
8 |
bin/build_search_documents.py: handle multi-line indented text better |
9 |
|
10 |
Beyond replacing newlines that show up in the middle of a text, remove |
11 |
the whitespace following the newline (which is the indentation) as |
12 |
well. |
13 |
|
14 |
Signed-off-by: Göktürk Yüksek <gokturk <AT> gentoo.org> |
15 |
|
16 |
bin/build_search_documents.py | 12 +++++++++--- |
17 |
1 file changed, 9 insertions(+), 3 deletions(-) |
18 |
|
19 |
diff --git a/bin/build_search_documents.py b/bin/build_search_documents.py |
20 |
index 1aac495..38ffd24 100755 |
21 |
--- a/bin/build_search_documents.py |
22 |
+++ b/bin/build_search_documents.py |
23 |
@@ -5,6 +5,12 @@ import json |
24 |
import os.path |
25 |
import sys |
26 |
import xml.etree.ElementTree as ET |
27 |
+import re |
28 |
+ |
29 |
+ |
30 |
+# The regex for stripping a newline and the possible indentation |
31 |
+# whitespace following it in multiline content |
32 |
+whitespace_re = re.compile(r'\n[ \t]*', flags=re.M) |
33 |
|
34 |
|
35 |
def stringify_node(parent: ET.Element) -> str: |
36 |
@@ -28,7 +34,7 @@ def stringify_node(parent: ET.Element) -> str: |
37 |
|
38 |
# For each child, strip the tags and append to text |
39 |
# along with the tail text following it. |
40 |
- # The tail may include '\n' if it spans multiple lines. |
41 |
+ # The tail may include '\n', '\t', ' ' if it spans multiple lines. |
42 |
# We will worry about those on return, not now. |
43 |
for child in parent: |
44 |
# The '<d/>' tag is simply a fancier '-' character |
45 |
@@ -42,8 +48,8 @@ def stringify_node(parent: ET.Element) -> str: |
46 |
# A paragraph typically ends with: |
47 |
# Text\n</p> |
48 |
# Right strip any spurious whitespace. |
49 |
- # Finally, get rid of any intermediate newlines. |
50 |
- return text.rstrip().replace('\n', ' ') |
51 |
+ # Finally, get rid of any intermediate newlines and indentation whitespace. |
52 |
+ return whitespace_re.sub(' ', text.rstrip()) |
53 |
|
54 |
|
55 |
def process_node(documents: list, node: ET.Element, name: str, url: str) -> None: |