Various fixes for MRS component operation guide

Reviewed-by: Hasko, Vladimir <vladimir.hasko@t-systems.com> Co-authored-by: gtema <artem.goncharov@gmail.com> Co-committed-by: gtema <artem.goncharov@gmail.com>
2022-11-22 17:11:07 +00:00 · 2022-11-22 17:11:07 +00:00 · c5ad20504f
commit c5ad20504f
parent 6ffc35f074
1 changed files with 36 additions and 15 deletions
--- a/otc_doc_convertor/convertor.py
+++ b/otc_doc_convertor/convertor.py
@ -221,7 +221,7 @@ class OTCDocConvertor:
                    else:
                        logging.debug(
                            "Not placing replaced anchor %s "
-                            " since it already existed",
+                            "since it already existed",
                            local_ref,
                        )

@ -319,17 +319,6 @@ class OTCDocConvertor:
                new = f"<code>{em.string}</code>"
                em.replace_with(bs4.BeautifulSoup(new, "html.parser"))

-        for p in soup.body.find_all(string=re.compile(r"(/\*).+")):
-            if p.string and p.parent.name == "p":
-                p.string.replace_with(p.string.replace("/*", "/``*``"))
-
-        # MRS UMN contains: /opt/Bigdata/FusionInsight_Porter_8.*/foo-*/
-        # This is a pretty special case and we do not want to apply that widely
-        # therefore only looking for [.-]*/ combinations
-        for p in soup.body.find_all(string=re.compile(r"([\.-]\*/).+")):
-            if p.string and p.parent.name == "p":
-                p.string.replace_with(p.string.replace("*/", "``*``/"))
-
        escape_asterisk_re = r"\((\*)[\.,]"
        for p in soup.body.find_all(string=re.compile(escape_asterisk_re)):
            if p.string and p.parent.name not in ["b", "strong", "pre"]:
@ -356,6 +345,8 @@ class OTCDocConvertor:
            r"\(([\W\x60_]{10,})\)",
            # MRS UMN contain: /:*?"<>|\\;&,'`!{}[]$%+
            r"\s([^a-zA-Z0-9\s]{8,})",
+            # MRS operation guide contains: /*+ MAPJOIN(join_table) \*/
+            r"\s(/\*.*\*/)",
            # BMS API contain sequence in a dedicated paragraph
            r"^([^a-zA-Z0-9\s]{10,})$",
            # OBS special chars - "\$" "\\" etc
@ -364,6 +355,8 @@ class OTCDocConvertor:
            r"\s(urn:smn:\(.*)\.",
            # "-" only (in tables) is considered as list
            r"^(-)$",
+            # MRS component guide has: "./mydate_\\\\d*/"
+            r"\w(_)\\",
        ]
        for to_rawize in rawize_strings:
            for p in soup.body.find_all(string=re.compile(to_rawize)):
@ -386,6 +379,20 @@ class OTCDocConvertor:
                            "Cannot find string for rawization anymore"
                        )

+        # Pandoc does not seem to properly escape asterisks which
+        # immediately follow non-word characters
+        # (https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#toc-entry-44)
+        # NOTE(gtema):
+        # 1. this is placed here on purpose, since we want the special
+        # escaping above to be applied first
+        # 2. we are not escaping asterisks at the end of paragraphs (pandoc
+        # deals correctly with that)
+        re_escape = re.compile(r"([-:/'\"<\([{])(\*+)(.+)")
+        for p in soup.body.find_all(string=re_escape):
+            if p.string and p.parent.name == "p":
+                p.string.replace_with(
+                    re.sub(re_escape, r"\1``\2``\3", p.string))
+
        # Drop parent link at the bottom of the page
        for parent in soup.body.find_all("p", class_="familylinks"):
            parent.decompose()
@ -555,8 +562,20 @@ class OTCDocConvertor:
                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
                # post process some usual stuff
                for line in reader.readlines():
-                    processed_line = re.sub(r"\.\.\\\\_", ".. _", line)
-                    processed_line = re.sub(r"\.\.\\_", ".. _", processed_line)
+                    processed_line = re.sub(
+                        r"\.\.\\\\_(.*):$", r".. _\1:", line)
+                    # replace anchor when it is itself inside some other block
+                    # (i.e. table)
+                    processed_line = re.sub(
+                        r"\.\.\\\\_(.*):\s", r".. _\1:  ", processed_line)
+                    # For some reason the regex does not behave the same
+                    # locally and in zuul - thus repeat it with one backslash
+                    processed_line = re.sub(
+                        r"\.\.\\_(.*):$", r".. _\1:", processed_line)
+                    # replace anchor when it is itself inside some other block
+                    # (i.e. table)
+                    processed_line = re.sub(
+                        r"\.\.\\_(.*):\s", r".. _\1:  ", processed_line)
                    # We could get unwanted anchors from pandoc - get rid of
                    # them
                    anchor = re.search(r"\.\. \_(.*):", processed_line)
@ -566,7 +585,9 @@ class OTCDocConvertor:
                        ):
                            # This is most likely some duplicated anchor. It is
                            # not referred from any other place so drop it
-                            logging.info("Dropping not referred anchor")
+                            logging.info(
+                                "Dropping not referred anchor '%s'",
+                                anchor.group(1))
                            continue

                    processed_line = re.sub(