1
        # 2
        # 3

....
        #

for td_lines in soup.body.find_all("td", "linenos"): codeblock = td_lines.parent.find("td", "code") table = td_lines.find_parent("table") if codeblock and table: # Replace whole table with only codeblock td logging.debug("Replace %s with %s" % (table, codeblock)) codeblock.name = "pre" del codeblock["class"] table.replace_with(codeblock) for pre in soup.body.find_all("pre"): text = pre.get_text() # if text.startswith("{"): # pre["class"] = "data" if re.search(r"\[[a-z]*@\w+.*\][\s#>]?", text): # Something like "[root@ecs-test-0001 ~]#" pre["class"] = "console" elif re.match(r"^(GET|PUT|POST|DELETE)", text): # Something like "DELETE https://some_url" pre["class"] = "text" if "codeblock" in pre.get("class", []): #

{em.string}"
                em.replace_with(bs4.BeautifulSoup(new, "html.parser"))

        # Incredibly dirty hacks:
        rawize_expressions = [
            # DWS Dev Guide harcodes
            r"^(1\|\"iamtext\"\|\"iamvarchar\"\|2006-07-07\|12:00:00)$",
            r"^(2\|\"iamtext\"\|\"iamvarchar\"\|2022-07-07\|19:00:02)$",
            r"(\*max_files_per_process\*3)",
            r"(&&, &&&, .* <#>)",
            # r"(\*\+)",
            r"(-\|-)",
            r"(^-{8}$)"
        ]
        self.rawize_me(soup, rawize_expressions)

        # Special asterisks treatement
        escape_asterisk_re = r"\((\*)[\.,]"
        self.rawize_me(soup, [escape_asterisk_re])

        # And now remaining specialities
        rawize_strings = [
            # "\*\*\*\*\*\*",
            # r"([\\\/\:\*\?\"\~|<>]{4,})"
            # ModelArts UMN contain this "unallowed" sequence
            r"(\\/:\*\?\"<>\|)",
            # CSS UMN contain "SELECT exclude('*name') FROM my-index"
            r"(\*name)",
            # DMS UMN contain: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?)
            r"\(([\W\x60_]{10,})\)",
            # MRS UMN contain: /:*?"<>|\\;&,'`!{}[]$%+
            r"\s([^a-zA-Z0-9\s]{8,})",
            # MRS operation guide contain: /*+ MAPJOIN(join_table) \*/
            # RDS UMN contain: /*FORCE_MASTER*/
            r"(/\*.{5,}\*/)",
            # BMS API contain sequence in a dedicated paragraph
            r"^([^a-zA-Z0-9\s]{10,})$",
            # OBS special chars - "\$" "\\" etc
            r"^(\\[\$\\bfnrtvu]{1})$",
            # CES contains: urn:smn:([a-z]|[A-Z]|[0-9]|\\-){1,32}:....
            r"\s(urn:smn:\(.*)\.",
            # "-" only (in tables) is considered as list
            r"^(-)$",
            # MRS component guide has: "./mydate_\\\\d*/"
            r"\w(_)\\",
            # Pandoc is not properly escaping _ before special chars what makes
            # it invalid for Sphinx. A bit weird regex:
            # "[:space:][:word:][:underscore:][:comma:]"
            r"\s([\w_]+_)[,]",
            # DWS dirty-fixes part 2
            r"/(\*\+)",
        ]
        self.rawize_me(soup, rawize_strings)

        # Pandoc seem to be not escaping properly asterisks which are
        # immediately following non word chars
        # (https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#toc-entry-44)
        # NOTE(gtema):
        # 1. this is on purpose placed here since we want to have some special
        # escapings above
        # 2. we are not escaping asterisks at the end of the paragraphs (pandoc
        # deals correctly with that)
        re_escape = re.compile(r"([-/'\"<\([{])(\*+)(.)")
        for p in soup.body.find_all(string=re_escape):
            if p.string and p.parent.name == "p":
                p.string.replace_with(
                    re.sub(re_escape, r"\1``\2``\3", p.string))

        # Special case for multiple asterisks and colons like ecs:*:*
        re_escape = re.compile(r"([:])(\*+)")
        re_escape_new = re.compile(r"([:])(\*)[^$]")
        for p in soup.body.find_all(string=re_escape):
            if p.string and (p.parent.name == "p" or p.parent.name == "li"):
                string = p.string
                while re.search(re_escape, string):
                    if re.search(re_escape_new, string):
                        string = re.sub(
                            re_escape, r"\1``\2``", string, count=1)
                    else:
                        break
                p.string.replace_with(string)

        # Drop parent link at the bottom of the page
        for parent in soup.body.find_all("p", class_="familylinks"):
            parent.decompose()

        return soup.body

    def main(self):
        logging.basicConfig(level=logging.DEBUG)
        parser = argparse.ArgumentParser(description="Process links.")
        parser.add_argument("path", type=str, help="path to the files")
        parser.add_argument(
            "--improve-table-headers",
            action="store_true",
            help="Improve table headers by enforcing spaces around `/`",
        )
        parser.add_argument(
            "--pygments-lexer", help="Set particular code-block lexer language"
        )
        parser.add_argument(
            "--dest", help="Directory to write resulting files"
        )
        parser.add_argument("--title", required=True, help="Document title")
        parser.add_argument(
            "--service", help="Service to which the document belongs to"
        )
        parser.add_argument("--repo-name", help="Service repository")
        parser.add_argument("--pdf-name", help="PDF File name")
        parser.add_argument(
            "--templates-location",
            default="templates",
            help="Location of additional templates",
        )
        self.args = parser.parse_args()
        if self.args.dest:
            dest = pathlib.Path(self.args.dest)
        else:
            dest = pathlib.Path(self.args.path, "result")
        dest.mkdir(parents=True, exist_ok=True)

        metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
        meta_data = dict()

        if not metadata_file.exists():
            logging.warning(
                f"CLASS.TXT.json file is missing in {self.args.path}, "
                f"assuming initial import"
            )
            with open(pathlib.Path(dest, "index.rst"), "w") as index:
                index.write("=" * (len(self.args.title)) + "\n")
                index.write(self.args.title + "\n")
                index.write("=" * (len(self.args.title)) + "\n")
                index.write("\n")
        else:
            meta_data = json.loads(open(metadata_file).read())
        metadata_by_uri = dict()
        metadata_by_code = dict()
        self.doc_images = set()
        for f in meta_data:
            f["new_name"] = self.get_new_name(f["title"])
            metadata_by_uri[f["uri"]] = f
            metadata_by_code[f.get("code")] = f

        tree = self.build_doc_tree(metadata_by_code)

        pathlib.Path(self.args.path, "temp/").mkdir(
            parents=True, exist_ok=True
        )

        # Scan all docs for anchors
        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            # Registering section links
            with open(f, "r") as reader:
                logging.debug(f"Scanning {f.name}")
                content = reader.read()
                soup = bs4.BeautifulSoup(content, "lxml")
                for lnk in soup.body.find_all("a"):
                    if "name" in lnk.attrs and lnk.string is None:
                        anchor = lnk.attrs["name"]
                        title = re.sub("[ _:]", "-", anchor)
                        res = dict(
                            fname=f.name.lower(),
                            title=title,
                            replace=title.lower()
                        )
                        self.doc_anchors[anchor] = res
                    if "href" in lnk.attrs and lnk["href"]:
                        self.doc_links[lnk["href"].lower()] = f.name

        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            _target = metadata_by_uri[f.name]
            target = _target["new_name"]
            target_path = self.get_target_path(
                _target["p_code"], metadata_by_code
            )
            pathlib.Path(self.args.path, "temp").mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)

            # Pre-processing of html content
            with open(f, "r") as reader, open(
                pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
            ) as writer:
                # if f.name not in [
                # ]:
                #     continue
                logging.info(f"Pre-Processing {f} as {target}")
                content = reader.read()
                # Preprocess - Fix space inside link and not text
                # i.e. `Some link in text
                content = re.sub(r"\s", " ", content)
                content = re.sub(r"√", "Y", content)
                content = re.sub(r"×", "x", content)
                content = re.sub(r"π", "Pi", content)
                content = re.sub(r"–", "-", content)
                content = re.sub(r"≤", "<=", content)
                content = re.sub(r"≥", ">=", content)
                soup = bs4.BeautifulSoup(content, "lxml")
                proc = self.streamline_html(soup, f.name, self.args)

                for lnk in proc.find_all("a"):
                    href = lnk.get("href")
                    if href and not href.startswith("http"):
                        # Internal link - replace with :ref:
                        code = soup.new_tag("code")
                        code["class"] = "interpreted-text"
                        code["role"] = "ref"
                        href_parts = href.split("#")
                        if len(href_parts) > 1:
                            # for anchor just use anchor ref
                            link_target = href_parts[1].lower()
                        else:
                            # for other page - use only page name
                            link_target = (
                                href_parts[0].replace(".html", "").lower()
                            )
                        if link_target:
                            # Looks like an anchor on the same page
                            code.string = f"{lnk.string} <{link_target}>"
                            logging.debug(f" replace {lnk} with {code}")
                            lnk.replace_with(code)

                logging.info(f"Saving file {writer.name}")
                writer.write(str(proc))

            # Convert html to rst
            os.system(
                f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
                f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
                f"--ascii -s --wrap none"
            )
            # Post processing of rendered rst
            with open(
                f"{self.args.path}/tmp_result/" f"{target_path}/{target}.rst",
                "r",
            ) as reader, open(
                pathlib.Path(dest, target_path, f"{target}.rst"), "w"
            ) as writer:
                logging.info(f"Post processing {target}...")
                writer.write(f":original_name: {f.name}\n\n")
                # Add root file label
                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
                # post process some usual stuff
                for line in reader.readlines():
                    processed_line = re.sub(
                        r"\.\.\\\\_(.*):$", r".. _\1:", line)
                    # replace anchor when it is itself inside some other block
                    # (i.e. table)
                    processed_line = re.sub(
                        r"\.\.\\\\_(.*):\s", r".. _\1:  ", processed_line)
                    # For some reason regex locally and in zuul are not
                    # behaving same - thus same but different
                    processed_line = re.sub(
                        r"\.\.\\_(.*):$", r".. _\1:", processed_line)
                    # replace anchor when it is itself inside some other block
                    # (i.e. table)
                    processed_line = re.sub(
                        r"\.\.\\_(.*):\s", r".. _\1:  ", processed_line)
                    # We could get unwanted anchors from pandoc - get rid of
                    # them
                    anchor = re.search(r"\.\. \_(.*):", processed_line)
                    if anchor and len(anchor.groups()) > 0:
                        if not self.is_element_referred(
                            anchor.group(1), f.name
                        ):
                            # This is most likely some duplicated anchor. It is
                            # not referred from any other place so drop it
                            logging.info(
                                "Dropping not referred anchor '%s'",
                                anchor.group(1))
                            continue

                    processed_line = re.sub(
                        r"public_sys-resources/", "", processed_line
                    )
                    processed_line = re.sub(
                        r"   :name: .*$", "", processed_line
                    )
                    processed_line = re.sub(
                        r"\*\*Parent topic:.*$", "", processed_line
                    )
                    processed_line = re.sub(
                        r".. code:: screen$",
                        r".. code-block::",
                        processed_line,
                    )
                    for lexer in ["json", "bash", "text", "console"]:
                        processed_line = re.sub(
                            f".. code:: {lexer}$",
                            f".. code-block:: {lexer}",
                            processed_line,
                        )
                        if re.match(rf".. code:: {lexer}\s", processed_line):
                            logging.error(
                                f"'code-block: {lexer}' with something "
                                "afterwards"
                            )
                            exit(1)
                    # spaces are important, since code-block may reside inside
                    # of the cell
                    processed_line = re.sub(
                        r".. code:: screen\s",
                        r".. code-block::  ",
                        processed_line,
                    )
                    processed_line = re.sub(
                        r".. code:: codeblock$",
                        r".. code-block::",
                        processed_line,
                    )
                    processed_line = re.sub(r"[ \t]*$", "", processed_line)
                    writer.write(processed_line)

        # Generate indexes
        for k, v in tree.items():
            path = ""
            title = self.args.title
            page_label = ""
            if k != 0:
                curr = metadata_by_code[k]
                title = curr["title"]
                page_label = curr["uri"].replace(".html", "").lower()
                path = self.get_target_path(curr["code"], metadata_by_code)

            p = pathlib.Path(dest, f"{path}.rst")
            if p.exists():
                logging.warning(
                    f"{p.resolve()} is renamed into {path}/index.rst"
                )
                # Update existing index file
                p.rename(pathlib.Path(dest, f"{path}/index.rst"))
                with open(pathlib.Path(dest, path, "index.rst"), "a") as index:
                    index.write("\n")
                    index.write(".. toctree::\n")
                    index.write("   :maxdepth: 1\n")
                    index.write("   :hidden: \n\n")
                    for child in v:
                        new_name = child["new_name"]
                        if child["code"] in tree:
                            # If this is folder - add /index
                            new_name = new_name + "/index"
                        index.write(f"   {new_name}\n")
            else:
                with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
                    # New index file
                    if page_label:
                        index.write(f".. _{page_label}:\n\n")
                    index.write("=" * (len(title)) + "\n")
                    index.write(title + "\n")
                    index.write("=" * (len(title)) + "\n")
                    index.write("\n")
                    index.write(".. toctree::\n")
                    index.write("   :maxdepth: 1\n\n")
                    for child in v:
                        new_name = child["new_name"]
                        if child["code"] in tree:
                            # If this is folder - add /index
                            new_name = new_name + "/index"
                        index.write(f"   {new_name}\n")
        # Copy used images
        if len(self.doc_images) > 0:
            logging.debug("Processing images...")
            img_dest = pathlib.Path(dest, "_static", "images")
            img_dest.mkdir(parents=True, exist_ok=True)
            for img in self.doc_images:
                shutil.copyfile(
                    pathlib.Path(self.args.path, img).resolve(strict=False),
                    pathlib.Path(img_dest, os.path.basename(img)).resolve(
                        strict=False
                    ),
                )

        context = dict(
            title=self.args.title,
            project=self.args.service,
            repo_name=self.args.repo_name,
            pdf_name=self.args.pdf_name,
        )
        loader = FileSystemLoader([self.args.templates_location])
        env = Environment(loader=loader, autoescape=select_autoescape())
        for f in loader.list_templates():
            outfile_tmpl = env.get_template(f)
            outfile_rendered = outfile_tmpl.render(**context)
            target_file = pathlib.Path(self.args.dest, f)
            target_file.parent.mkdir(parents=True, exist_ok=True)
            with open(target_file, "w", encoding="utf-8", newline="") as out:
                logging.debug(f"Generating {f} from template...")
                out.write(outfile_rendered)


def main():
    OTCDocConvertor().main()


if __name__ == "__main__":
    main()