#!/usr/bin/env python3

import argparse
import bs4
import json
import logging
import os
import pathlib
import re
import shutil

from jinja2 import FileSystemLoader, Environment, select_autoescape


class OTCDocConvertor:
    def __init__(self):
        self.doc_anchors = dict()
        self.doc_links = dict()

    @staticmethod
    def get_new_name(current_name):
        new_name = current_name.replace(" - ", "_")
        # This is a unicode char (en dash)
        new_name = new_name.replace("–", "_")
        new_name = new_name.replace(" ", "_")
        new_name = new_name.replace("/", "_")
        new_name = new_name.replace("=", "_")
        new_name = new_name.replace("+", "")
        new_name = new_name.replace("'", "")
        new_name = new_name.replace('"', "")
        new_name = new_name.replace("`", "")
        new_name = new_name.replace("´", "")
        new_name = new_name.replace(":", "")
        new_name = new_name.replace("?", "")
        new_name = new_name.replace("(", "")
        new_name = new_name.replace(")", "")
        new_name = new_name.replace(",", "")
        new_name = new_name.replace("!", "")
        new_name = new_name.replace("<", "")
        new_name = new_name.replace(">", "")
        new_name = new_name.replace("$", "")
        new_name = new_name.replace("#", "sharp")
        new_name = new_name.replace("%", "pct")
        new_name = new_name.replace('_&_', '_and_')
        new_name = re.sub(r'(\w+)&(\w+)', r'\1_and_\2', new_name)
        new_name = re.sub('(_+)', '_', new_name)
        new_name = new_name.lower()
        return new_name

    @staticmethod
    def build_doc_tree(metadata):
        flat_tree = dict()
        for k, v in metadata.items():
            parent_id = v.get("p_code")
            if not parent_id:
                parent_id = 0
            if parent_id not in flat_tree:
                flat_tree[parent_id] = list()
            flat_tree[parent_id].append(v)
        return flat_tree

    @classmethod
    def get_target_path(cls, code, metadata):
        if code in metadata:
            current = metadata[code]
            if not current.get("p_code"):
                return current["new_name"]
            else:
                return "{0}/{1}".format(
                    cls.get_target_path(current["p_code"], metadata),
                    current["new_name"],
                )
        else:
            return ""

    def make_label(self, soup, name):
        label = soup.new_tag("p")
        label.string = f"..\\_{name.lower()}:"
        return label

    def is_element_referred(self, ref, fname):
        return (
            ref in self.doc_links
            or "#" + ref in self.doc_links
            or fname.lower() + "#" + ref in self.doc_links
        )

    def rawize_me(self, soup, expressions):
        for to_rawize in expressions:
            for p in soup.body.find_all(string=re.compile(to_rawize)):
                # We should not escape anything inside of bold or code
                # elements - this would be wrong
                if p.string and p.parent.name not in [
                        "b", "strong", "pre", "code"]:
                    curr = p.string
                    part = re.search(to_rawize, curr)
                    if len(part.groups()) > 0:
                        logging.debug(
                            "Found element to rawize %s", part.group(1)
                        )
                        # Wrap the matched group in <code> so that pandoc
                        # emits it as an inline literal
                        new = curr.replace(
                            part.group(1),
                            f"<code>{part.group(1)}</code>"
                        )
                        logging.debug("Replacing string with: %s", new)
                        p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
                        logging.debug("Replaced string with: %s", p.string)
                    else:
                        logging.error(
                            "Cannot find string for rawization anymore"
                        )

    def streamline_html(self, soup, file_name, args=None):
        # Drop eventual header duplicated anchors
        fname = file_name.replace(".html", "").lower()
        page_anchors = set()
        met_page_anchors = dict()
        for lnk in soup.body.find_all("a"):
            name = None
            if "name" in lnk.attrs and lnk.string is None:
                name = lnk.attrs["name"].lower()
                if name in met_page_anchors:
                    # Such anchor already existed on this page, drop it
                    lnk.decompose()
                met_page_anchors[name] = True
            if name and name.lower() == fname:
                lnk.decompose()

        # Process divs
        for i in soup.body.find_all("div"):
            if i.decomposed:
                # If we decompose a div later in the loop it may still be
                # returned here in the list. Skip those.
                continue
            if "note" in i.get("class", []):
                # Notes
                del i["id"]
                if i.img:
                    i.img.decompose()
                notetitle = i.find("span", class_="notetitle")
                notebody = i.find(class_="notebody")
                if not (notebody and notebody.get_text()):
                    # Some smart people make empty notes. Since this is
                    # breaking layout we need to drop those
                    i.decompose()
                elif notetitle:
                    title = soup.new_tag("div")
                    title["class"] = "title"
                    title.string = "Note:"
                    notetitle.replace_with(title)
            elif "warning" in i.get("class", []):
                # Warnings
                del i["id"]
                if i.img:
                    i.img.decompose()
                eltitle = i.find("span", class_="warningtitle")
                if eltitle:
                    title = soup.new_tag("div")
                    title["class"] = "title"
                    title.string = "Warning:"
                    eltitle.replace_with(title)
            elif "notice" in i.get("class", []):
                # Notices
                del i["id"]
                if i.img:
                    i.img.decompose()
                i["class"] = "important"
            elif "caution" in i.get("class", []):
                # Cautions
                del i["id"]
                if i.img:
                    i.img.decompose()
            elif "fignone" in i.get("class", []):
                # Figures
                # When we find a figure generate a local label (anchor)
                if i.get("id"):
                    logging.debug("place figure label")
                    i.insert_before(self.make_label(soup, i.get("id")))
                figure = soup.new_tag("figure")
                img = i.find("img")
                cap = i.find("span", class_="figcap")
                if cap is not None:
                    cap.name = "figcaption"
                    figure.append(cap)
                if img:
                    # Store all referred images for copying
                    self.doc_images.add(img["src"])
                    img["src"] = (
                        "/_static/images/" + os.path.basename(img["src"])
                    )
                    del img["width"]
                    del img["height"]
                    del img["class"]
                    del img["title"]
                    del img["name"]
                    del img["id"]
                    figure.append(img)
                i.replace_with(figure)
            elif "section" in i.get("class", []):
                # Sections
                if i.get("id"):
                    # When we find a section generate a local label (anchor)
                    sec_id = i.get("id").lower()
                    if self.is_element_referred(sec_id, file_name):
                        page_anchors.add(sec_id)
                        i.insert_before(self.make_label(soup, sec_id))
                i.unwrap()
            elif i.get("id") and i.get("id").startswith("body"):
                i.unwrap()
            else:
                i.name = "p"

        # Process remaining images
        for img in soup.body.find_all("img"):
            if img["src"] and not img["src"].startswith("/_static/images"):
                self.doc_images.add(img["src"])
                img["src"] = "/_static/images/" + os.path.basename(img["src"])
                del img["width"]
                del img["height"]
                del img["class"]
                del img["title"]
                del img["id"]

        # Drop strong in table headers
        for th in soup.body.find_all("th"):
            if th.p and th.p.strong:
                th.p.strong.unwrap()

        if args and args.improve_table_headers:
            # Add spaces around "/"
            for th in soup.body.find_all("th"):
                if hasattr(th, "p") and th.p.string:
                    th.p.string = re.sub(r"\b/\b", " / ", th.p.string)

        # Drop strong around links
        for strong in soup.body.find_all("strong"):
            if strong.a:
                strong.unwrap()

        # Table anchors - some tables are referred to. Some have an anchor in
        # front, some do not. To cope with that we analyze every table and,
        # if it is referred to, prepend an anchor. The subsequent anchor
        # processing will skip it, since such an anchor is already placed on
        # the page.
        for table in soup.body.find_all("table"):
            # Verify this is really called from somewhere:
            if table.get("id"):
                local_ref = table["id"].lower()
                if self.is_element_referred(local_ref, file_name):
                    # We now know something in the document wants this anchor
                    # - replace it with a label
                    if local_ref not in page_anchors:
                        lnk = bs4.BeautifulSoup(
                            f"<p>..\\_{local_ref}:</p>",
                            "html.parser"
                        )
                        table.insert_before(lnk)
                        page_anchors.add(local_ref)
                    else:
                        logging.debug(
                            "Not placing replaced anchor %s "
                            "since it already existed",
                            local_ref,
                        )

        # Local anchors
        for lnk in soup.body.find_all("a"):
            if (
                lnk.string is None
                and lnk.has_attr("name")
                and not re.match(r"^li\d+$", lnk.attrs["name"])
                # anywhere section
                and not re.match(r".*section\d+$", lnk.attrs["name"])
                # starts with table
                and not re.match(r"^table\d+$", lnk.attrs["name"])
            ):
                # Verify this is really called from somewhere:
                local_ref = lnk["name"].lower()
                if self.is_element_referred(local_ref, file_name):
                    # We now know something in the document wants this anchor
                    # - replace it with a label
                    if local_ref not in page_anchors:
                        logging.debug("Adding anchor")
                        lnk.name = "p"
                        lnk.string = f"..\\_{local_ref}:"
                        del lnk["name"]
                        page_anchors.add(local_ref)
                    else:
                        logging.debug(
                            "Not placing replaced anchor %s "
                            "since it already existed",
                            local_ref,
                        )
                else:
                    logging.debug("Dropping unreferred link %s", lnk)

        # Underline element should not be used at all
        for underline in soup.body.find_all("u"):
            underline.unwrap()

        for li in soup.body.find_all("li"):
            if not li.get("id"):
                continue
            local_ref = li.get("id").lower()
            # Delete li ID to prevent broken RST containers
            del li["id"]
            if (
                self.is_element_referred(local_ref, file_name)
                and local_ref not in page_anchors
            ):
                # LI item referred, but no anchor present
                logging.debug("Adding missing li %s anchor", local_ref)
                li.insert(0, self.make_label(soup, local_ref))
                page_anchors.add(local_ref)

        # Sometimes we have code blocks with line numbers rendered as a
        # table: one cell ("linenos") holds the line numbers 1, 2, 3, ...
        # and a sibling cell ("code") holds the code itself.
        for td_lines in soup.body.find_all("td", "linenos"):
            codeblock = td_lines.parent.find("td", "code")
            table = td_lines.find_parent("table")
            if codeblock and table:
                # Replace whole table with only the codeblock td
                logging.debug("Replace %s with %s" % (table, codeblock))
                codeblock.name = "pre"
                del codeblock["class"]
                table.replace_with(codeblock)

        for pre in soup.body.find_all("pre"):
            text = pre.get_text()
            # if text.startswith("{"):
            #     pre["class"] = "data"
            if re.search(r"\[[a-z]*@\w+.*\][\s#>]?", text):
                # Something like "[root@ecs-test-0001 ~]#"
                pre["class"] = "console"
            elif re.match(r"^(GET|PUT|POST|DELETE)", text):
                # Something like "DELETE https://some_url"
                pre["class"] = "text"

            if "codeblock" in pre.get("class", []):
                # Emphasis elements inside a "codeblock" <pre> are not
                # converted properly by pandoc; keep only their text content.
                for em in pre.find_all("em"):
                    new = f"{em.string}"
                    em.replace_with(bs4.BeautifulSoup(new, "html.parser"))

        # Incredibly dirty hacks:
        rawize_expressions = [
            # DWS Dev Guide hardcodes
            r"^(1\|\"iamtext\"\|\"iamvarchar\"\|2006-07-07\|12:00:00)$",
            r"^(2\|\"iamtext\"\|\"iamvarchar\"\|2022-07-07\|19:00:02)$",
            r"(\*max_files_per_process\*3)",
            r"(&&, &&&, .* <#>)",
            # r"(\*\+)",
            r"(-\|-)",
            r"(^-{8}$)"
        ]
        self.rawize_me(soup, rawize_expressions)

        # Special asterisk treatment
        escape_asterisk_re = r"\((\*)[\.,]"
        self.rawize_me(soup, [escape_asterisk_re])

        # And now the remaining special cases
        rawize_strings = [
            # "\*\*\*\*\*\*",
            # r"([\\\/\:\*\?\"\~|<>]{4,})"
            # ModelArts UMN contain this "unallowed" sequence
            r"(\\/:\*\?\"<>\|)",
            # CSS UMN contain "SELECT exclude('*name') FROM my-index"
            r"(\*name)",
            # DMS UMN contain: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?)
            r"\(([\W\x60_]{10,})\)",
            # MRS UMN contain: /:*?"<>|\\;&,'`!{}[]$%+
            r"\s([^a-zA-Z0-9\s]{8,})",
            # MRS operation guide contain: /*+ MAPJOIN(join_table) \*/
            # RDS UMN contain: /*FORCE_MASTER*/
            r"(/\*.{5,}\*/)",
            # BMS API contain sequence in a dedicated paragraph
            r"^([^a-zA-Z0-9\s]{10,})$",
            # OBS special chars - "\$" "\\" etc
            r"^(\\[\$\\bfnrtvu]{1})$",
            # CES contains: urn:smn:([a-z]|[A-Z]|[0-9]|\\-){1,32}:....
            r"\s(urn:smn:\(.*)\.",
            # "-" only (in tables) is considered as list
            r"^(-)$",
            # MRS component guide has: "./mydate_\\\\d*/"
            r"\w(_)\\",
            # Pandoc does not properly escape _ before special chars, which
            # makes it invalid for Sphinx. A bit weird regex:
            # "[:space:][:word:][:underscore:][:comma:]"
            r"\s([\w_]+_)[,]",
            # DWS dirty-fixes part 2
            r"/(\*\+)",
        ]
        self.rawize_me(soup, rawize_strings)

        # Pandoc does not seem to properly escape asterisks that immediately
        # follow non-word chars
        # (https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#toc-entry-44)
        # NOTE(gtema):
        # 1. this is placed here on purpose since we want to have some
        # special escapings above
        # 2. we are not escaping asterisks at the end of the paragraphs (pandoc
        # deals correctly with that)
        re_escape = re.compile(r"([-/'\"<\([{])(\*+)(.)")
        for p in soup.body.find_all(string=re_escape):
            if p.string and p.parent.name == "p":
                p.string.replace_with(
                    re.sub(re_escape, r"\1``\2``\3", p.string))

        # Special case for multiple asterisks and colons like ecs:*:*
        re_escape = re.compile(r"([:])(\*+)")
        re_escape_new = re.compile(r"([:])(\*)[^$]")
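        # re_escape_new additionally requires a character after the asterisk
        # (the "[^$]" class), so a trailing ":*" at the very end of a string
        # is left unescaped - pandoc copes with asterisks at the end of a
        # paragraph on its own (see the note above).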
        for p in soup.body.find_all(string=re_escape):
            if p.string and (p.parent.name == "p" or p.parent.name == "li"):
                string = p.string
                while re.search(re_escape, string):
                    if re.search(re_escape_new, string):
                        string = re.sub(
                            re_escape, r"\1``\2``", string, count=1)
                    else:
                        break
                p.string.replace_with(string)

        # Drop parent link at the bottom of the page
        for parent in soup.body.find_all("p", class_="familylinks"):
            parent.decompose()

        return soup.body

    def main(self):
        logging.basicConfig(level=logging.DEBUG)
        parser = argparse.ArgumentParser(description="Process links.")
        parser.add_argument("path", type=str, help="path to the files")
        parser.add_argument(
            "--improve-table-headers",
            action="store_true",
            help="Improve table headers by enforcing spaces around `/`",
        )
        parser.add_argument(
            "--pygments-lexer", help="Set particular code-block lexer language"
        )
        parser.add_argument(
            "--dest", help="Directory to write resulting files"
        )
        parser.add_argument("--title", required=True, help="Document title")
        parser.add_argument(
            "--service", help="Service to which the document belongs to"
        )
        parser.add_argument("--repo-name", help="Service repository")
        parser.add_argument("--pdf-name", help="PDF File name")
        parser.add_argument(
            "--templates-location",
            default="templates",
            help="Location of additional templates",
        )
        self.args = parser.parse_args()
        if self.args.dest:
            dest = pathlib.Path(self.args.dest)
        else:
            dest = pathlib.Path(self.args.path, "result")
        dest.mkdir(parents=True, exist_ok=True)

        metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
        meta_data = dict()

        if not metadata_file.exists():
            logging.warning(
                f"CLASS.TXT.json file is missing in {self.args.path}, "
                f"assuming initial import"
            )
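            # Without metadata we only emit a bare index page that carries
            # the document title as an overlined and underlined RST heading.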
            with open(pathlib.Path(dest, "index.rst"), "w") as index:
                index.write("=" * (len(self.args.title)) + "\n")
                index.write(self.args.title + "\n")
                index.write("=" * (len(self.args.title)) + "\n")
                index.write("\n")
        else:
            with open(metadata_file) as mf:
                meta_data = json.load(mf)
        metadata_by_uri = dict()
        metadata_by_code = dict()
        self.doc_images = set()
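        # Build two lookup maps over the metadata: by source HTML name (uri)
        # for processing individual files and by document code for building
        # the parent/child tree.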
        for f in meta_data:
            f["new_name"] = self.get_new_name(f["title"])
            metadata_by_uri[f["uri"]] = f
            metadata_by_code[f.get("code")] = f

        tree = self.build_doc_tree(metadata_by_code)

        pathlib.Path(self.args.path, "temp/").mkdir(
            parents=True, exist_ok=True
        )

        # Scan all docs for anchors
        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            # Registering section links
            with open(f, "r") as reader:
                logging.debug(f"Scanning {f.name}")
                content = reader.read()
                soup = bs4.BeautifulSoup(content, "lxml")
                for lnk in soup.body.find_all("a"):
                    if "name" in lnk.attrs and lnk.string is None:
                        anchor = lnk.attrs["name"]
                        title = re.sub("[ _:]", "-", anchor)
                        res = dict(
                            fname=f.name.lower(),
                            title=title,
                            replace=title.lower()
                        )
                        self.doc_anchors[anchor] = res
                    if "href" in lnk.attrs and lnk["href"]:
                        self.doc_links[lnk["href"].lower()] = f.name

        for f in pathlib.Path(self.args.path).glob("*.html"):
            if f.name not in metadata_by_uri:
                continue
            _target = metadata_by_uri[f.name]
            target = _target["new_name"]
            target_path = self.get_target_path(
                _target["p_code"], metadata_by_code
            )
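            # target_path mirrors the chain of parent documents so that the
            # generated RST follows the same folder hierarchy as the
            # document tree.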
            pathlib.Path(self.args.path, "temp").mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
                parents=True, exist_ok=True
            )
            pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)
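            # Three staging areas per file: temp/ holds the pre-processed
            # HTML, tmp_result/ receives the raw pandoc output and dest gets
            # the final post-processed RST.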

            # Pre-processing of html content
            with open(f, "r") as reader, open(
                pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
            ) as writer:
                # if f.name not in [
                # ]:
                #     continue
                logging.info(f"Pre-Processing {f} as {target}")
                content = reader.read()
                # Preprocess - Fix space inside link and not text
                # i.e. "<a>Some link </a>in text" - move the trailing space
                # out of the anchor element
                content = re.sub(r"\s</a>", "</a> ", content)
                content = re.sub(r"√", "Y", content)
                content = re.sub(r"×", "x", content)
                content = re.sub(r"π", "Pi", content)
                content = re.sub(r"–", "-", content)
                content = re.sub(r"≤", "<=", content)
                content = re.sub(r"≥", ">=", content)
                soup = bs4.BeautifulSoup(content, "lxml")
                proc = self.streamline_html(soup, f.name, self.args)

                for lnk in proc.find_all("a"):
                    href = lnk.get("href")
                    if href and not href.startswith("http"):
                        # Internal link - replace it with a :ref: role
                        code = soup.new_tag("code")
                        code["class"] = "interpreted-text"
                        code["role"] = "ref"
                        href_parts = href.split("#")
                        if len(href_parts) > 1:
                            # for an anchor just use the anchor ref
                            link_target = href_parts[1].lower()
                        else:
                            # for another page - use only the page name
                            link_target = (
                                href_parts[0].replace(".html", "").lower()
                            )
                        if link_target:
                            code.string = f"{lnk.string} <{link_target}>"
                            logging.debug(f" replace {lnk} with {code}")
                            lnk.replace_with(code)

                logging.info(f"Saving file {writer.name}")
                writer.write(str(proc))

            # Convert html to rst
            os.system(
                f"pandoc '{self.args.path}/temp/{target}.tmp' -f html "
                f"-o '{self.args.path}/tmp_result/{target_path}/{target}.rst' "
                f"--ascii -s --wrap none"
            )

            # Post processing of the rendered rst
            with open(
                f"{self.args.path}/tmp_result/"
                f"{target_path}/{target}.rst",
                "r",
            ) as reader, open(
                pathlib.Path(dest, target_path, f"{target}.rst"), "w"
            ) as writer:
                logging.info(f"Post processing {target}...")
                writer.write(f":original_name: {f.name}\n\n")
                # Add root file label
                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
                # Post process some usual stuff
                for line in reader.readlines():
                    processed_line = re.sub(
                        r"\.\.\\\\_(.*):$", r".. _\1:", line)
                    # replace the anchor when it is itself inside some other
                    # block (i.e. a table)
                    processed_line = re.sub(
                        r"\.\.\\\\_(.*):\s", r".. _\1: ", processed_line)
                    # For some reason the regex behaves differently locally
                    # and in zuul - thus the same thing, but different
                    processed_line = re.sub(
                        r"\.\.\\_(.*):$", r".. _\1:", processed_line)
                    # replace the anchor when it is itself inside some other
                    # block (i.e. a table)
                    processed_line = re.sub(
                        r"\.\.\\_(.*):\s", r".. _\1: ", processed_line)
                    # We could get unwanted anchors from pandoc - get rid of
                    # them
                    anchor = re.search(r"\.\. \_(.*):", processed_line)
                    if anchor and len(anchor.groups()) > 0:
                        if not self.is_element_referred(
                            anchor.group(1), f.name
                        ):
                            # This is most likely some duplicated anchor. It
                            # is not referred from any other place so drop it
                            logging.info(
                                "Dropping not referred anchor '%s'",
                                anchor.group(1))
                            continue
                    processed_line = re.sub(
                        r"public_sys-resources/", "", processed_line
                    )
                    processed_line = re.sub(
                        r" :name: .*$", "", processed_line
                    )
                    processed_line = re.sub(
                        r"\*\*Parent topic:.*$", "", processed_line
                    )
                    processed_line = re.sub(
                        r".. code:: screen$",
                        r".. code-block::",
                        processed_line,
                    )
                    for lexer in ["json", "bash", "text", "console"]:
                        processed_line = re.sub(
                            f".. code:: {lexer}$",
                            f".. code-block:: {lexer}",
                            processed_line,
                        )
                        if re.match(rf".. code:: {lexer}\s", processed_line):
                            logging.error(
                                f"'code-block: {lexer}' with something "
                                "afterwards"
                            )
                            exit(1)
                    # spaces are important, since the code-block may reside
                    # inside a table cell
                    processed_line = re.sub(
                        r".. code:: screen\s",
                        r".. code-block:: ",
                        processed_line,
                    )
code-block::", processed_line, ) processed_line = re.sub(r"[ \t]*$", "", processed_line) writer.write(processed_line) # Generate indexes for k, v in tree.items(): path = "" title = self.args.title page_label = "" if k != 0: curr = metadata_by_code[k] title = curr["title"] page_label = curr["uri"].replace(".html", "").lower() path = self.get_target_path(curr["code"], metadata_by_code) p = pathlib.Path(dest, f"{path}.rst") if p.exists(): logging.warning( f"{p.resolve()} is renamed into {path}/index.rst" ) # Update existing index file p.rename(pathlib.Path(dest, f"{path}/index.rst")) with open(pathlib.Path(dest, path, "index.rst"), "a") as index: index.write("\n") index.write(".. toctree::\n") index.write(" :maxdepth: 1\n") index.write(" :hidden: \n\n") for child in v: new_name = child["new_name"] if child["code"] in tree: # If this is folder - add /index new_name = new_name + "/index" index.write(f" {new_name}\n") else: with open(pathlib.Path(dest, path, "index.rst"), "w") as index: # New index file if page_label: index.write(f".. _{page_label}:\n\n") index.write("=" * (len(title)) + "\n") index.write(title + "\n") index.write("=" * (len(title)) + "\n") index.write("\n") index.write(".. toctree::\n") index.write(" :maxdepth: 1\n\n") for child in v: new_name = child["new_name"] if child["code"] in tree: # If this is folder - add /index new_name = new_name + "/index" index.write(f" {new_name}\n") # Copy used images if len(self.doc_images) > 0: logging.debug("Processing images...") img_dest = pathlib.Path(dest, "_static", "images") img_dest.mkdir(parents=True, exist_ok=True) for img in self.doc_images: shutil.copyfile( pathlib.Path(self.args.path, img).resolve(strict=False), pathlib.Path(img_dest, os.path.basename(img)).resolve( strict=False ), ) context = dict( title=self.args.title, project=self.args.service, repo_name=self.args.repo_name, pdf_name=self.args.pdf_name, ) loader = FileSystemLoader([self.args.templates_location]) env = Environment(loader=loader, autoescape=select_autoescape()) for f in loader.list_templates(): outfile_tmpl = env.get_template(f) outfile_rendered = outfile_tmpl.render(**context) target_file = pathlib.Path(self.args.dest, f) target_file.parent.mkdir(parents=True, exist_ok=True) with open(target_file, "w", encoding="utf-8", newline="") as out: logging.debug(f"Generating {f} from template...") out.write(outfile_rendered) def main(): OTCDocConvertor().main() if __name__ == "__main__": main()