#!/usr/bin/env python3

import argparse
import bs4
import json
import logging
import os
import pathlib
import re
import shutil

from jinja2 import FileSystemLoader, Environment, select_autoescape


class OTCDocConvertor:
    def __init__(self):
        self.doc_anchors = dict()
        self.doc_links = dict()

    @staticmethod
    def get_new_name(current_name):
        new_name = current_name.replace(" - ", "_")
        new_name = new_name.replace(" ", "_")
        new_name = new_name.replace("/", "_")
        new_name = new_name.replace("'", "")
        new_name = new_name.replace('"', "")
        new_name = new_name.replace("`", "")
        new_name = new_name.replace("´", "")
        new_name = new_name.replace(":", "")
        new_name = new_name.replace("?", "")
        new_name = new_name.replace("(", "")
        new_name = new_name.replace(")", "")
        new_name = new_name.lower()
        return new_name

    @staticmethod
    def build_doc_tree(metadata):
        flat_tree = dict()
        for k, v in metadata.items():
            parent_id = v.get("p_code")
            if not parent_id:
                parent_id = 0
            if parent_id not in flat_tree:
                flat_tree[parent_id] = list()
            flat_tree[parent_id].append(v)
        return flat_tree

    @classmethod
    def get_target_path(cls, code, metadata):
        if code in metadata:
            current = metadata[code]
            if not current.get("p_code"):
                return current["new_name"]
            else:
                return "{0}/{1}".format(
                    cls.get_target_path(current["p_code"], metadata),
                    current["new_name"],
                )
        else:
            return ""

    def make_label(self, soup, name):
        label = soup.new_tag("p")
        label.string = f"..\\_{name.lower()}:"
        return label

    def is_element_referred(self, ref, fname):
        return (
            ref in self.doc_links
            or "#" + ref in self.doc_links
            or fname + "#" + ref in self.doc_links
        )

    def streamline_html(self, soup, file_name):
        # Drop any duplicated header anchors
        fname = file_name.replace(".html", "").lower()
        page_anchors = set()
        met_page_anchors = dict()
        for lnk in soup.body.find_all("a"):
            name = None
            if "name" in lnk.attrs and lnk.string is None:
                name = lnk.attrs["name"].lower()
                if name in met_page_anchors:
                    # Such an anchor already exists on this page, drop it
                    lnk.decompose()
                met_page_anchors[name] = True
            if name and name.lower() == fname:
                lnk.decompose()

        # Process divs
        for i in soup.body.find_all("div"):
            if "note" in i.get("class", []):
                # Notes
                del i["id"]
                if i.img:
                    i.img.decompose()
                notetitle = i.find("span", class_="notetitle")
                if notetitle:
                    title = soup.new_tag("div")
                    title["class"] = "title"
                    title.string = "Note:"
                    notetitle.replace_with(title)
            elif "warning" in i.get("class", []):
                # Warnings
                del i["id"]
                if i.img:
                    i.img.decompose()
                eltitle = i.find("span", class_="warningtitle")
                if eltitle:
                    title = soup.new_tag("div")
                    title["class"] = "title"
                    title.string = "Warning:"
                    eltitle.replace_with(title)
            elif "notice" in i.get("class", []):
                # Notices
                del i["id"]
                if i.img:
                    i.img.decompose()
                i["class"] = "important"
            elif "caution" in i.get("class", []):
                # Cautions
                del i["id"]
                if i.img:
                    i.img.decompose()
            elif "fignone" in i.get("class", []):
                # Figures
                # When we find a figure, generate a local label (anchor)
                if i.get("id"):
                    logging.debug("place figure label")
                    i.insert_before(self.make_label(soup, i.get("id")))
                figure = soup.new_tag("figure")
                img = i.find("img")
                cap = i.find("span", class_="figcap")
                if cap is not None:
                    cap.name = "figcaption"
                    figure.append(cap)
                if img:
                    # Store all referred images for copying
                    self.doc_images.add(img["src"])
                    img["src"] = "/_static/images/" + img["src"]
                    del img["width"]
                    del img["height"]
                    del img["class"]
                    del img["title"]
                    del img["name"]
                    figure.append(img)
                i.replace_with(figure)
            elif "section" in i.get("class", []):
                # Sections
                if i.get("id"):
                    # When we find a section, generate a local label
                    # (anchor)
                    sec_id = i.get("id").lower()
                    if self.is_element_referred(sec_id, file_name):
                        page_anchors.add(sec_id)
                        i.insert_before(self.make_label(soup, sec_id))
                i.unwrap()
            elif i.get("id") and i.get("id").startswith("body"):
                i.unwrap()
            else:
                i.name = "p"

        # Process remaining images
        for img in soup.body.find_all("img"):
            if img["src"] and not img["src"].startswith("/_static/images"):
                self.doc_images.add(img["src"])
                img["src"] = "/_static/images/" + img["src"]
            del img["width"]
            del img["height"]
            del img["class"]
            del img["title"]
            del img["id"]

        # Drop strong tags in table headers
        for th in soup.body.find_all("th"):
            if th.p and th.p.strong:
                th.p.strong.unwrap()

        if self.args.improve_table_headers:
            # Add spaces around "/"
            for th in soup.body.find_all("th"):
                if th.p and th.p.string:
                    th.p.string = re.sub(r"\b/\b", " / ", th.p.string)

        # Drop strong tags around links
        for strong in soup.body.find_all("strong"):
            if strong.a:
                strong.unwrap()

        # Table anchors: some tables are referred to, but only some of
        # them have an anchor in front. To cope with that we analyze
        # every table and, if it is referred to, prepend an anchor. The
        # anchor processing below will skip it, since such an anchor is
        # already placed on the page.
        for table in soup.body.find_all("table"):
            # Verify this is really called from somewhere:
            if table.get("id"):
                local_ref = table["id"].lower()
                if self.is_element_referred(local_ref, file_name):
                    # We now know something in the document wants this
                    # anchor - replace it with a label
                    if local_ref not in page_anchors:
                        lnk = bs4.BeautifulSoup(
                            f"<p>..\\_{local_ref}:</p>",
                            "html.parser",
                        )
                        table.insert_before(lnk)
                        page_anchors.add(local_ref)
                    else:
                        logging.debug(
                            "Not placing replaced anchor %s "
                            "since it already existed",
                            local_ref,
                        )

        # Local anchors
        for lnk in soup.body.find_all("a"):
            if (
                lnk.string is None
                and "name" in lnk.attrs
                and not re.match(r"^li\d+$", lnk.attrs["name"])
                # anywhere section
                and not re.match(r".*section\d+$", lnk.attrs["name"])
                # starts with table
                and not re.match(r"^table\d+$", lnk.attrs["name"])
            ):
                # Verify this is really called from somewhere:
                local_ref = lnk["name"].lower()
                if self.is_element_referred(local_ref, file_name):
                    # We now know something in the document wants this
                    # anchor - replace it with a label
                    if local_ref not in page_anchors:
                        logging.debug("Adding anchor")
                        lnk.name = "p"
                        lnk.string = f"..\\_{local_ref}:"
                        del lnk["name"]
                        page_anchors.add(local_ref)
                    else:
                        logging.debug(
                            "Not placing replaced anchor %s "
                            "since it already existed",
                            local_ref,
                        )
                else:
                    logging.debug("Dropping unreferred link %s", lnk)

        for li in soup.body.find_all("li"):
            del li["id"]

        # Sometimes we have code blocks with line numbers, i.e.:
        #   <table><tr>
        #     <td class="linenos"><pre>1
        #     2
        #     3
        #     ...</pre></td>
        #     <td class="code"><pre>(the code itself)</pre></td>
        #   </tr></table>
        # In that case replace the whole table with just the code cell.
        for td_lines in soup.body.find_all("td", "linenos"):
            codeblock = td_lines.parent.find("td", "code")
            table = td_lines.find_parent("table")
            if codeblock and table:
                # Replace whole table with only codeblock td
                table.replace_with(codeblock)

        for pre in soup.body.find_all("pre"):
            text = pre.get_text()
            # if text.startswith("{"):
            #     pre["class"] = "data"
            if re.search(r"\[[a-z]*@\w+.*\][\s#>]?", text):
                # Something like "[root@ecs-test-0001 ~]#"
                pre["class"] = "console"
            elif re.match(r"^(GET|PUT|POST|DELETE)", text):
                # Something like "DELETE https://some_url"
                pre["class"] = "text"
            if "codeblock" in pre.get("class", []):
                # Content of a code block is rendered literally, i.e.
                # `all files (*.*)` - no need to escape anything in it
                continue

        # Escape asterisk runs outside of code blocks, otherwise pandoc
        # treats them as emphasis markers
        for p in soup.body.find_all(string=re.compile(r"\*")):
            if p.string and p.parent.name not in ["pre", "code"]:
                curr = p.string
                part = re.search(r"(\*+)", curr)
                if len(part.groups()) > 0 and p.parent.name != "b":
                    logging.debug(
                        "Found asterisks to escape: %s", part.group(1)
                    )
                    # Wrap the run in <b> so the asterisks survive the
                    # conversion instead of becoming emphasis
                    new = curr.replace(
                        part.group(1), f"<b>{part.group(1)}</b>"
                    )
                    p.replace_with(bs4.BeautifulSoup(new, "html.parser"))

        # And now the special cases
        rawize_strings = [
            # "\*\*\*\*\*\*",
            # r"([\\\/\:\*\?\"\~|<>]{4,})"
            # ModelArts UMN contains this "unallowed" sequence
            r"(\\/:\*\?\"<>\|)",
            # CSS UMN contains "SELECT exclude('*name') FROM my-index"
            r"(\*name)",
            # DMS UMN contains: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?)
            r"\(([\W\x60_]{10,})\)",
            # MRS UMN contains: /:*?"<>|\\;&,'`!{}[]$%+
            r"\s([^a-zA-Z0-9\s]{10,})",
        ]
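        # For instance, the MRS pattern above matches the whole run of
        # special characters in /:*?"<>|\\;&,'`!{}[]$%+ and the loop
        # below wraps it so that it survives the conversion untouched.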
        for to_rawize in rawize_strings:
            for p in soup.body.find_all(string=re.compile(to_rawize)):
                if p.string:
                    curr = p.string
                    part = re.search(to_rawize, curr)
                    if len(part.groups()) > 0:
                        logging.debug(
                            "Found element to rawize: %s", part.group(1)
                        )
                        # Wrap the sequence in <code> so that pandoc
                        # emits it as an inline literal
                        new = curr.replace(
                            part.group(1), f"<code>{part.group(1)}</code>"
                        )
                        logging.debug("Replacing string with: %s", new)
                        p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
                        logging.debug("Replacing string with: %s", p.string)
                    else:
                        logging.error(
                            "Cannot find string for rawization anymore"
                        )
                logging.error(f"String with star: {p}")

        # Drop parent link at the bottom of the page
        for parent in soup.body.find_all("p", class_="familylinks"):
            parent.decompose()

        return soup.body
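
    # Sketch of the effect on a typical note div (assumed input, for
    # illustration):
    #   <div class="note" id="note123"><img src="icon.png"/>
    #     <span class="notetitle">NOTE:</span><p>Take care.</p></div>
    # is streamlined into
    #   <div class="note"><div class="title">Note:</div><p>Take care.</p></div>
    # so that pandoc can later render it as a note admonition.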

    def main(self):
        logging.basicConfig(level=logging.DEBUG)
        parser = argparse.ArgumentParser(
            description="Convert HTML doc exports into RST."
        )
        parser.add_argument("path", type=str, help="path to the files")
        parser.add_argument(
            "--improve-table-headers",
            action="store_true",
            help="Improve table headers by enforcing spaces around `/`",
        )
        parser.add_argument(
            "--pygments-lexer", help="Set particular code-block lexer language"
        )
        parser.add_argument(
            "--dest", help="Directory to write resulting files"
        )
        parser.add_argument("--title", required=True, help="Document title")
        parser.add_argument(
            "--service", help="Service to which the document belongs to"
        )
        parser.add_argument("--repo-name", help="Service repository")
        parser.add_argument("--pdf-name", help="PDF File name")
        parser.add_argument(
            "--templates-location",
            default="templates",
            help="Location of additional templates",
        )
        self.args = parser.parse_args()
        if self.args.dest:
            dest = pathlib.Path(self.args.dest)
        else:
            dest = pathlib.Path(self.args.path, "result")
        dest.mkdir(parents=True, exist_ok=True)

        metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
        meta_data = list()

        if not metadata_file.exists():
            logging.warning(
                f"CLASS.TXT.json file is missing in {self.args.path}, "
                f"assuming initial import"
            )
            with open(pathlib.Path(dest, "index.rst"), "w") as index:
                index.write("=" * (len(self.args.title)) + "\n")
                index.write(self.args.title + "\n")
                index.write("=" * (len(self.args.title)) + "\n")
                index.write("\n")
        else:
            with open(metadata_file) as md_file:
                meta_data = json.load(md_file)
        metadata_by_uri = dict()
        metadata_by_code = dict()
        self.doc_images = set()
        for f in meta_data:
            f["new_name"] = self.get_new_name(f["title"])
            metadata_by_uri[f["uri"]] = f
            metadata_by_code[f.get("code")] = f

        tree = self.build_doc_tree(metadata_by_code)
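        # tree maps a parent code to the list of its direct children,
        # e.g. (hypothetical codes):
        #   {0: [<top-level docs>], "0001": [<children of "0001">]}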

        pathlib.Path(self.args.path, "temp/").mkdir(
            parents=True, exist_ok=True
        )

        # Scan all docs for anchors
        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            # Registering section links
            with open(f, "r") as reader:
                logging.debug(f"Scanning {f.name}")
                content = reader.read()
                soup = bs4.BeautifulSoup(content, "lxml")
                for lnk in soup.body.find_all("a"):
                    if "name" in lnk.attrs and lnk.string is None:
                        anchor = lnk.attrs["name"]
                        title = re.sub("[ _:]", "-", anchor)
                        res = dict(
                            fname=f.name, title=title, replace=title.lower()
                        )
                        self.doc_anchors[anchor] = res
                    if "href" in lnk.attrs and lnk["href"]:
                        self.doc_links[lnk["href"].lower()] = f.name
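
        # After the scan self.doc_links maps every lowercased href to
        # the file using it, e.g. (hypothetical entry):
        #   {"create_bucket.html#section1": "index.html"}
        # which is what is_element_referred() consults later on.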

        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            _target = metadata_by_uri[f.name]
            target = _target["new_name"]
            target_path = self.get_target_path(
                _target["p_code"], metadata_by_code
            )
            pathlib.Path(self.args.path, "temp").mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)

            # Pre-processing of html content
            with open(f, "r") as reader, open(
                pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
            ) as writer:
                # if f.name not in [
                # ]:
                #     continue
                logging.info(f"Pre-Processing {f} as {target}")
                content = reader.read()
                soup = bs4.BeautifulSoup(content, "lxml")
                proc = self.streamline_html(soup, f.name)

                for lnk in proc.find_all("a"):
                    href = lnk.get("href")
                    if href and not href.startswith("http"):
                        # Internal link - replace with :ref:
                        code = soup.new_tag("code")
                        code["class"] = "interpreted-text"
                        code["role"] = "ref"
                        href_parts = href.split("#")
                        if len(href_parts) > 1:
                            # for anchor just use anchor ref
                            link_target = href_parts[1].lower()
                        else:
                            # for other page - use only page name
                            link_target = (
                                href_parts[0].replace(".html", "").lower()
                            )
                        if link_target:
                            # Looks like an anchor on the same page
                            code.string = f"{lnk.string} <{link_target}>"
                            logging.debug(f" replace {lnk} with {code}")
                            lnk.replace_with(code)
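
                # i.e. (assumed example) <a href="create_bucket.html">
                # Create</a> has now become <code class="interpreted-text"
                # role="ref">Create <create_bucket></code>, which pandoc
                # writes out as :ref:`Create <create_bucket>`.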

                logging.info(f"Saving file {writer.name}")
                writer.write(str(proc))

            # Convert html to rst
            os.system(
                f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
                f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
                f"--ascii -s --wrap none"
            )
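            # With hypothetical paths this runs, for example:
            #   pandoc 'docs/temp/create_bucket.tmp' -f html \
            #     -o 'docs/tmp_result/buckets/create_bucket.rst' \
            #     --ascii -s --wrap none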
            # Post processing of rendered rst
            with open(
                f"{self.args.path}/tmp_result/" f"{target_path}/{target}.rst",
                "r",
            ) as reader, open(
                pathlib.Path(dest, target_path, f"{target}.rst"), "w"
            ) as writer:
                logging.info(f"Post processing {target}...")
                writer.write(f":original_name: {f.name}\n\n")
                # Add root file label
                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
                # post process some usual stuff
                for line in reader.readlines():
                    processed_line = re.sub(r"\.\.\\\\_", ".. _", line)
                    processed_line = re.sub(r"\.\.\\_", ".. _", processed_line)
                    # We could get unwanted anchors from pandoc - get rid of
                    # them
                    anchor = re.search(r"\.\. \_(.*):", processed_line)
                    if anchor and len(anchor.groups()) > 0:
                        if not self.is_element_referred(
                            anchor.group(1), f.name
                        ):
                            # This is most likely some duplicated anchor. It is
                            # not referred from any other place so drop it
                            logging.info("Dropping not referred anchor")
                            continue

                    processed_line = re.sub(r"√", "Y", processed_line)
                    processed_line = re.sub(
                        r"public_sys-resources/", "", processed_line
                    )
                    processed_line = re.sub(
                        r"   :name: .*$", "", processed_line
                    )
                    processed_line = re.sub(
                        r"\*\*Parent topic:.*$", "", processed_line
                    )
                    processed_line = re.sub(
                        r".. code:: screen$",
                        r".. code-block::",
                        processed_line,
                    )
                    for lexer in ["json", "bash", "text", "console"]:
                        processed_line = re.sub(
                            f".. code:: {lexer}$",
                            f".. code-block:: {lexer}",
                            processed_line,
                        )
                        if re.match(rf".. code:: {lexer}\s", processed_line):
                            logging.error(
                                f"'code-block: {lexer}' with something "
                                "afterwards"
                            )
                            exit(1)
                    # spaces are important, since code-block may reside inside
                    # of the cell
                    processed_line = re.sub(
                        r".. code:: screen\s",
                        r".. code-block::  ",
                        processed_line,
                    )
                    processed_line = re.sub(
                        r".. code:: codeblock$",
                        r".. code-block::",
                        processed_line,
                    )
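                    # Net effect, e.g.: "   .. code:: json" becomes
                    # "   .. code-block:: json", while a bare
                    # ".. code:: screen" turns into an anonymous
                    # ".. code-block::".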
                    processed_line = re.sub(r"[ \t]*$", "", processed_line)
                    writer.write(processed_line)

        # Generate indexes
        for k, v in tree.items():
            path = ""
            title = self.args.title
            page_label = ""
            if k != 0:
                curr = metadata_by_code[k]
                title = curr["title"]
                page_label = curr["uri"].replace(".html", "").lower()
                path = self.get_target_path(curr["code"], metadata_by_code)

            p = pathlib.Path(dest, f"{path}.rst")
            if p.exists():
                logging.warning(
                    f"{p.resolve()} is renamed into {path}/index.rst"
                )
                # Update existing index file
                p.rename(pathlib.Path(dest, f"{path}/index.rst"))
                with open(pathlib.Path(dest, path, "index.rst"), "a") as index:
                    index.write("\n")
                    index.write(".. toctree::\n")
                    index.write("   :maxdepth: 1\n")
                    index.write("   :hidden: \n\n")
                    for child in v:
                        new_name = child["new_name"]
                        if child["code"] in tree:
                            # If this is folder - add /index
                            new_name = new_name + "/index"
                        index.write(f"   {new_name}\n")
            else:
                with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
                    # New index file
                    if page_label:
                        index.write(f".. _{page_label}:\n\n")
                    index.write("=" * (len(title)) + "\n")
                    index.write(title + "\n")
                    index.write("=" * (len(title)) + "\n")
                    index.write("\n")
                    index.write(".. toctree::\n")
                    index.write("   :maxdepth: 1\n\n")
                    for child in v:
                        new_name = child["new_name"]
                        if child["code"] in tree:
                            # If this is folder - add /index
                            new_name = new_name + "/index"
                        index.write(f"   {new_name}\n")
        # Copy used images
        if len(self.doc_images) > 0:
            logging.debug("Processing images...")
            img_dest = pathlib.Path(dest, "_static", "images")
            img_dest.mkdir(parents=True, exist_ok=True)
            for img in self.doc_images:
                shutil.copyfile(
                    pathlib.Path(self.args.path, img).resolve(strict=False),
                    pathlib.Path(img_dest, os.path.basename(img)).resolve(
                        strict=False
                    ),
                )

        context = dict(
            title=self.args.title,
            project=self.args.service,
            repo_name=self.args.repo_name,
            pdf_name=self.args.pdf_name,
        )
        loader = FileSystemLoader([self.args.templates_location])
        env = Environment(loader=loader, autoescape=select_autoescape())
        for f in loader.list_templates():
            outfile_tmpl = env.get_template(f)
            outfile_rendered = outfile_tmpl.render(**context)
            target_file = pathlib.Path(dest, f)
            target_file.parent.mkdir(parents=True, exist_ok=True)
            with open(target_file, "w", encoding="utf-8", newline="") as out:
                logging.debug(f"Generating {f} from template...")
                out.write(outfile_rendered)


def main():
    OTCDocConvertor().main()


if __name__ == "__main__":
    main()
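
# Typical invocation (hypothetical paths and values):
#   ./convertor.py docs/umn --title "Object Storage Service - User Guide" \
#       --service obs --repo-name docs/object-storage-service \
#       --dest result/umn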